In [60]:
import pandas as pd
import pickle

### Loading Data

In [61]:
train_path = '../../data/raw/train.csv'
val_path = '../../data/raw/dev.csv'

train = pd.read_csv(train_path)
validation = pd.read_csv(val_path)

### Balancing Dataset
**Methodology:**
    - 

In [41]:
positive_examples = train[train['label']==1].sample(frac=.8)

In [13]:
# Setting frac = 1 to shuffle all the data
negative_examples = train[train['label']==0].sample(frac=1)

In [18]:
num_pos_examples = positive_examples.count()[0]
num_neg_examples = negative_examples.count()[0]

In [31]:
num_splits = int(round(num_neg_examples / num_pos_examples))

In [84]:
neg_train_data = [negative_examples[ i * num_pos_examples : min((i + 1) * num_pos_examples, num_pos_examples)] for i in range(num_splits)]

In [44]:
neg_train_data = [negative_examples[ i * num_pos_examples : min((i + 1) * num_pos_examples, num_pos_examples)] for i in range(num_splits)]

(0, 20655)
(20655, 41310)
(41310, 61965)
(61965, 82620)
(82620, 103275)
(103275, 123930)
(123930, 144585)
(144585, 165240)
(165240, 185895)
(185895, 206550)
206550


In [62]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

cnt_vectorizer = CountVectorizer(stop_words='english', binary=True)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', binary=True)
cnt_vectorizer.fit(train['review'])
tfidf_vectorizer.fit(train['review'])

cnt_X_train = cnt_vectorizer.transform(train['review'])
tfidf_X_train = tfidf_vectorizer.transform(train['review'])

cnt_X_dev = cnt_vectorizer.transform(validation['review'])
tfidf_X_dev = tfidf_vectorizer.transform(validation['review'])

Y_train = train['label']
Y_dev = validation['label']

Test Cases
- Fully Unbalanced
- Each of the CV pairs

Notes:
Failed to converge at default max iter and at 200 when lbfgs specified as solver

In [47]:
from sklearn.linear_model import LogisticRegression

In [80]:
params = {'solver':'lbfgs', 'max_iter':200, 'random_state': 519}

In [81]:
tfidf_lr = LogisticRegression(**params )
# tfidf_lr = LogisticRegression()
fitted_tfidf_lr = tfidf_lr.fit(tfidf_X_train, Y_train)



In [77]:
tfidf_lr = LogisticRegression(**params )
# tfidf_lr = LogisticRegression()
fitted_tfidf_lr = tfidf_lr.fit(tfidf_X_train, Y_train)

In [65]:
fitted_tfidf_lr

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [66]:
cnt_lr = LogisticRegression()
fitted_cnt_lr = cnt_lr .fit(cnt_X_train, Y_train)



### Model Evaluation

In [54]:
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score

In [67]:
def ClassifierMetrics (X_train, Y_train, X_test, Y_test, fitted_model):
    Y_pred = fitted_model.predict(X_test)
    metrics = {'train_accuracy': fitted_model.score(X_train, Y_train),
    'test_accuracy': fitted_model.score(X_test, Y_test),
    'test_auc': roc_auc_score(Y_test, Y_pred),
    'test_ap': average_precision_score(Y_test, Y_pred)}
    return metrics

In [70]:
tfidf_metrics = ClassifierMetrics (tfidf_X_train, Y_train, tfidf_X_dev, Y_dev, fitted_tfidf_lr)
cnt_metrics = ClassifierMetrics (cnt_X_train, Y_train, cnt_X_dev, Y_dev, fitted_cnt_lr)

In [71]:
cnt_metrics

{'test_accuracy': 0.8923659446517067,
 'test_ap': 0.10739606117677855,
 'test_auc': 0.512061276088812,
 'train_accuracy': 0.9092652088299306}

In [72]:
tfidf_metrics

{'test_accuracy': 0.897794977448633,
 'test_ap': 0.10501753192530831,
 'test_auc': 0.5048710187208802,
 'train_accuracy': 0.8978690498018926}