In [26]:
import pandas as pd
import pickle

### Loading Data

In [27]:
train_path = '../../data/raw/train.csv'
val_path = '../../data/raw/dev.csv'

train = pd.read_csv(train_path)
validation = pd.read_csv(val_path)

### Balancing Dataset
**Methodology:**

1. Separate out the negative examples (dominant class)
2. Determine the number of dataframes (`num_splits`) needed to incorporate all negative examples.
3. Create a list of dataframes containing the different splits of negative examples.
4. Concat the positive and negative examples back together.
    - For each new training set, include a 80% random sample of the positive examples to aviod overfitting to the
    postive examples

In [4]:
# Setting frac = 1 to shuffle all the data
full_negative_examples = train[train['label']==0].sample(frac=1)

In [5]:
# Obtaining the number of positive and negative examples 
# to determine the number of splits  
num_pos_examples = positive_examples.count()[0]
num_neg_examples = full_negative_examples.count()[0]

num_splits = int(round(num_neg_examples / num_pos_examples))

In [59]:
neg_train_data = [full_negative_examples[ i * num_pos_examples : min((i + 1) * num_pos_examples, num_neg_examples)] for i in range(num_splits)]

In [61]:
training_sets = []
for negative_examples in neg_train_data:
    positive_examples = train[train['label']==1].sample(frac=.8)
    # Unioning the positive and negative examples 
    # Then shuffling so that not all negative examples are at the end
    train_set = pd.concat([negative_examples, positive_examples], ignore_index=True).sample(frac=1)
    training_sets.append(train_set)

### Extracting Features

In [38]:
train = training_sets[1]

In [39]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

cnt_vectorizer = CountVectorizer(stop_words='english', binary=True)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', binary=True)
cnt_vectorizer.fit(train['review'])
tfidf_vectorizer.fit(train['review'])

cnt_X_train = cnt_vectorizer.transform(train['review'])
tfidf_X_train = tfidf_vectorizer.transform(train['review'])

cnt_X_dev = cnt_vectorizer.transform(validation['review'])
tfidf_X_dev = tfidf_vectorizer.transform(validation['review'])

Y_train = train['label']
Y_dev = validation['label']

In [62]:
base = '/Users/chuamelia/Google Drive/Spring 2020/Machine Learning/fake-review-detection-project/data/processed/dev/'
def load_obj(fname,  base=base):
    # This loads the pickled object.
    with open(base + fname + '.pkl', 'rb') as f:
        return pickle.load(f)

def save_obj(obj, fname,  base=base):
    # This writes out a python object as a pickle.
    with open(base + fname + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
        

In [63]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

for i, train in enumerate(training_sets):
    cnt_train_fname = 'ac4119_cnt_X_train_set_{0}'.format(i)
    tfidf_train_fname = 'ac4119_tfidf_X_train_set_{0}'.format(i)
    
    train_labels_fname = 'ac4119_train_set_{0}_labels.csv'.format(i)
    dev_labels_fname = 'ac4119_dev_set_{0}_labels.csv'.format(i)
    
    cnt_dev_fname = 'ac4119_cnt_X_dev_set_{0}'.format(i)
    tfidf_dev_fname = 'ac4119_tfidf_X_dev_set_{0}'.format(i)

    cnt_vectorizer = CountVectorizer(stop_words='english', binary=True)
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', binary=True)
    cnt_vectorizer.fit(train['review'])
    tfidf_vectorizer.fit(train['review'])

    cnt_X_train = cnt_vectorizer.transform(train['review'])
    tfidf_X_train = tfidf_vectorizer.transform(train['review'])

    cnt_X_dev = cnt_vectorizer.transform(validation['review'])
    tfidf_X_dev = tfidf_vectorizer.transform(validation['review'])

    Y_train = train['label']
    Y_dev = validation['label']
    
    save_obj(cnt_X_train, cnt_train_fname)
    save_obj(cnt_X_dev, cnt_dev_fname)
    save_obj(tfidf_X_train, tfidf_train_fname)
    save_obj(tfidf_X_dev, tfidf_dev_fname)
    Y_train.to_csv(base + train_labels_fname, index=False, sep=',')
    Y_dev.to_csv(base + dev_labels_fname, index=False, sep=',')

Test Cases
- Fully Unbalanced
- Each of the CV pairs

Notes:
Failed to converge at default max iter and at 200 when lbfgs specified as solver

In [64]:
from sklearn.linear_model import LogisticRegression

In [65]:
params = {'solver':'liblinear', 'max_iter':1000, 'class_weight': 'balanced', 'random_state': 519}

In [66]:
tfidf_lr = LogisticRegression(**params )
# tfidf_lr = LogisticRegression()
fitted_tfidf_lr = tfidf_lr.fit(tfidf_X_train, Y_train)

In [67]:
cnt_lr = LogisticRegression(**params )
fitted_cnt_lr = cnt_lr .fit(cnt_X_train, Y_train)

### Model Evaluation

In [68]:
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score, roc_curve

In [69]:
def ClassifierMetrics (X_train, Y_train, X_test, Y_test, fitted_model):
    Y_pred = fitted_model.predict(X_test)
    fpr, tpr, thresholds = roc_curve(Y_test, Y_pred)
    metrics = {'train_accuracy': fitted_model.score(X_train, Y_train),
    'test_accuracy': fitted_model.score(X_test, Y_test),
    'test_tpr': tpr,
    'test_fpr': fpr,
    'test_auc': roc_auc_score(Y_test, Y_pred),
    'test_ap': average_precision_score(Y_test, Y_pred)}
    return metrics

In [70]:
tfidf_metrics = ClassifierMetrics(tfidf_X_train, Y_train, tfidf_X_dev, Y_dev, fitted_tfidf_lr)
cnt_metrics = ClassifierMetrics(cnt_X_train, Y_train, cnt_X_dev, Y_dev, fitted_cnt_lr)

In [71]:
cnt_metrics

{'test_accuracy': 0.6027340052341444,
 'test_ap': 0.13950157985017012,
 'test_auc': 0.6336395744649042,
 'test_fpr': array([0.       , 0.4051441, 1.       ]),
 'test_tpr': array([0.        , 0.67242325, 1.        ]),
 'train_accuracy': 0.8538924890804762}

In [72]:
tfidf_metrics

{'test_accuracy': 0.6568572860404254,
 'test_ap': 0.1503918669377533,
 'test_auc': 0.6537919245646655,
 'test_fpr': array([0.        , 0.34236133, 1.        ]),
 'test_tpr': array([0.        , 0.64994518, 1.        ]),
 'train_accuracy': 0.7566302206742984}