In [1]:
import pandas as pd
import pickle

### Loading Data

In [2]:
train_path = '../../data/raw/train.csv'
val_path = '../../data/raw/dev.csv'

train = pd.read_csv(train_path)
validation = pd.read_csv(val_path)

### Balancing Dataset
**Methodology:**
    - 

In [3]:
positive_examples = train[train['label']==1].sample(frac=.8)

In [4]:
# Setting frac = 1 to shuffle all the data
full_negative_examples = train[train['label']==0].sample(frac=1)

In [5]:
num_pos_examples = positive_examples.count()[0]
num_neg_examples = full_negative_examples.count()[0]

In [6]:
num_splits = int(round(num_neg_examples / num_pos_examples))

In [7]:
neg_train_data = [full_negative_examples[ i * num_pos_examples : min((i + 1) * num_pos_examples, num_pos_examples)] for i in range(num_splits)]

In [None]:
training_sets = []
for negative_examples in neg_train_data:
    positive_examples = train[train['label']==1].sample(frac=.8)
    train_set = negative_examples.union(positive_examples).sample(frac=1)
    training_sets.append(train_set)

### Extracting Features

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

cnt_vectorizer = CountVectorizer(stop_words='english', binary=True)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', binary=True)
cnt_vectorizer.fit(train['review'])
tfidf_vectorizer.fit(train['review'])

cnt_X_train = cnt_vectorizer.transform(train['review'])
tfidf_X_train = tfidf_vectorizer.transform(train['review'])

cnt_X_dev = cnt_vectorizer.transform(validation['review'])
tfidf_X_dev = tfidf_vectorizer.transform(validation['review'])

Y_train = train['label']
Y_dev = validation['label']

Test Cases
- Fully Unbalanced
- Each of the CV pairs

Notes:
Failed to converge at default max iter and at 200 when lbfgs specified as solver

In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
params = {'solver':'liblinear', 'max_iter':1000, 'class_weight': 'balanced', 'random_state': 519}

In [11]:
tfidf_lr = LogisticRegression(**params )
# tfidf_lr = LogisticRegression()
fitted_tfidf_lr = tfidf_lr.fit(tfidf_X_train, Y_train)

In [12]:
cnt_lr = LogisticRegression(**params )
fitted_cnt_lr = cnt_lr .fit(cnt_X_train, Y_train)

### Model Evaluation

In [13]:
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score, roc_curve

In [15]:
def ClassifierMetrics (X_train, Y_train, X_test, Y_test, fitted_model):
    Y_pred = fitted_model.predict(X_test)
    fpr, tpr, thresholds = roc_curve(Y_test, Y_pred)
    metrics = {'train_accuracy': fitted_model.score(X_train, Y_train),
    'test_accuracy': fitted_model.score(X_test, Y_test),
    'test_tpr': tpr,
    'test_fpr': fpr,
    'test_auc': roc_auc_score(Y_test, Y_pred),
    'test_ap': average_precision_score(Y_test, Y_pred)}
    return metrics

In [16]:
tfidf_metrics = ClassifierMetrics(tfidf_X_train, Y_train, tfidf_X_dev, Y_dev, fitted_tfidf_lr)
cnt_metrics = ClassifierMetrics(cnt_X_train, Y_train, cnt_X_dev, Y_dev, fitted_cnt_lr)

In [17]:
cnt_metrics

{'test_accuracy': 0.8923659446517067,
 'test_ap': 0.10739606117677855,
 'test_auc': 0.512061276088812,
 'test_fpr': array([0.        , 0.01069104, 1.        ]),
 'test_tpr': array([0.       , 0.0348136, 1.       ]),
 'train_accuracy': 0.9092652088299306}

In [18]:
tfidf_metrics

{'test_accuracy': 0.897794977448633,
 'test_ap': 0.10501753192530831,
 'test_auc': 0.5048710187208802,
 'test_fpr': array([0.        , 0.00204524, 1.        ]),
 'test_tpr': array([0.        , 0.01178728, 1.        ]),
 'train_accuracy': 0.8978690498018926}