## Setup

In [317]:
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plot
from wordcloud import WordCloud as WC
from wordcloud import STOPWORDS
from collections import Counter
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score, roc_curve


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gabriellahurtado/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Loading Data

In [318]:
fake=pd.read_csv('fake_tokens.csv')
fake=fake.rename(columns={'Unnamed: 0':'index'})
real=pd.read_csv('real_tokens.csv')
real=real.rename(columns={'Unnamed: 0':'index'})
full_df=real.append(fake,sort=False)
full_df['filtered_tokens']=full_df['filtered_tokens'].apply(lambda x: x.replace(' ',""))

In [342]:
valid=pd.read_csv('dev.csv')
val_x=valid.drop(['label'],axis=1)
val_y=valid['label']

In [321]:
x=full_df.drop(['label'],axis=1)
y=full_df['label']

## Featurizing

### Bag of words model

In [359]:
bow_vectorizer = CountVectorizer(stop_words='english', binary=True)
bow_vectorizer.fit(full_df['review'])
bow_x_train = bow_vectorizer.transform(x['review'])

### Bag of Bi-grams model

In [373]:
bv2 = CountVectorizer(stop_words='english', binary=True, ngram_range=(2,2))
bv2.fit(full_df['review'])
bv2_x_train = bv2.transform(x['review'])

### TF-IDF model

In [388]:
tv = TfidfVectorizer()
tv.fit(full_df['review'])
tv_x_train = tv.transform(x['review'])

## Balancing 

In [208]:
print('number of real reviews:',(len(real)))
print('number of fake reviews:',(len(fake)))

number of real reviews: 225055
number of fake reviews: 25819


In [320]:
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

### Undersampling

In [361]:
#BOW
rus=RandomUnderSampler()
under_x, under_y = rus.fit_sample(bow_x_train, y)

In [375]:
#Bi-grams
under_x_bv2, under_y_bv2=rus.fit_sample(bv2_x_train, y)

In [389]:
#TF-IDF
under_x_tv, under_y_tv=rus.fit_sample(tv_x_train, y)

### SMOTE

In [363]:
#BOW
sm=SMOTE()
over_x, over_y = sm.fit_sample(bow_x_train, y)

In [376]:
#Bi-grams
over_x_bv2, over_y_bv2 = sm.fit_sample(bv2_x_train, y)

In [390]:
#TF-IDF
over_x_tv, over_y_tv = sm.fit_sample(tv_x_train, y)

## Evaluation

In [347]:
#taken from Amelia!
def ClassifierMetrics (X_train, Y_train, X_test, Y_test, fitted_model):
    Y_pred = fitted_model.predict(X_test)
    fpr, tpr, thresholds = roc_curve(Y_test, Y_pred)
    metrics = {'train_accuracy': fitted_model.score(X_train, Y_train),
    'test_accuracy': fitted_model.score(X_test, Y_test),
    'test_tpr': tpr,
    'test_fpr': fpr,
    'test_auc': roc_auc_score(Y_test, Y_pred),
    'test_ap': average_precision_score(Y_test, Y_pred)}
    return metrics

### Bag of Words Evaluation

In [365]:
bow_x_val=bow_vectorizer.transform(valid['review'])

#### BOW unbalanced

In [400]:
#unbalanced
lr=LogisticRegression()
bow_lr_train = lr.fit(bow_x_train,y)
eval_bow_lr=ClassifierMetrics(bow_x_train, y, bow_x_val, val_y, bow_lr_train)

eval_bow_lr

{'train_accuracy': 0.9092691948946483,
 'test_accuracy': 0.8923659446517067,
 'test_tpr': array([0.       , 0.0348136, 1.       ]),
 'test_fpr': array([0.        , 0.01069104, 1.        ]),
 'test_auc': 0.512061276088812,
 'test_ap': 0.10739606117677855}

#### BOW undersampled

In [369]:
#undersampled
bow_lr_under=lr.fit(under_x,under_y)
eval_bow_lr_under=ClassifierMetrics(under_x, under_y, bow_x_val, val_y, bow_lr_under)

eval_bow_lr_under

{'train_accuracy': 0.8340756807002595,
 'test_accuracy': 0.5961913246840025,
 'test_tpr': array([0.        , 0.69490132, 1.        ]),
 'test_fpr': array([0.        , 0.41496746, 1.        ]),
 'test_auc': 0.6399669268752141,
 'test_ap': 0.14159724719651312}

#### BOW oversampled

In [371]:
#oversampled
bow_lr_over=lr.fit(over_x,over_y)
eval_bow_lr_over=ClassifierMetrics(over_x, over_y, bow_x_val, val_y, bow_lr_over)

eval_bow_lr_over

{'train_accuracy': 0.8563617782319878,
 'test_accuracy': 0.7220613619912022,
 'test_tpr': array([0.        , 0.49890351, 1.        ]),
 'test_fpr': array([0.       , 0.2527115, 1.       ]),
 'test_auc': 0.623096006012863,
 'test_ap': 0.1419217109399919}

### Bag of Bi-grams Evaluation

In [379]:
bv2_x_val=bv2.transform(valid['review'])

#### Bi-gram unbalanced

In [382]:
#unbalanced
bv2_lr_train = lr.fit(bv2_x_train,y)
eval_bv2_lr=ClassifierMetrics(bv2_x_train, y, bv2_x_val, val_y, bv2_lr_train)

eval_bv2_lr

{'train_accuracy': 0.9862520627884914,
 'test_accuracy': 0.894955175677933,
 'test_tpr': array([0.        , 0.02796053, 1.        ]),
 'test_fpr': array([0.       , 0.0070344, 1.       ]),
 'test_auc': 0.5104630645213901,
 'test_ap': 0.10739348634448456}

#### Bi-gram undersampled

In [392]:
#undersampled
bv2_lr_under=lr.fit(under_x_bv2,under_y_bv2)
eval_bv2_lr_under=ClassifierMetrics(under_x_bv2, under_y_bv2, 
                                    bv2_x_val, val_y, bv2_lr_under)

eval_bv2_lr_under

{'train_accuracy': 0.9916727991014369,
 'test_accuracy': 0.570271173227908,
 'test_tpr': array([0.        , 0.70038377, 1.        ]),
 'test_fpr': array([0.        , 0.44443756, 1.        ]),
 'test_auc': 0.6279731069131613,
 'test_ap': 0.13633564692448208}

#### Bi-gram oversampled

In [387]:
#oversampled
bv2_lr_over=lr.fit(over_x_bv2,over_y_bv2)
eval_bv2_lr_over=ClassifierMetrics(over_x_bv2, over_y_bv2, bv2_x_val, val_y, bv2_lr_over)

eval_bv2_lr_over

{'train_accuracy': 0.9871675812579147,
 'test_accuracy': 0.7892421627039368,
 'test_tpr': array([0.        , 0.32648026, 1.        ]),
 'test_fpr': array([0.        , 0.15844438, 1.        ]),
 'test_auc': 0.5840179437884299,
 'test_ap': 0.13008696701999206}

### TF-IDF Evaluation

In [393]:
tv_x_val=tv.transform(valid['review'])

#### TF-IDF unbalanced

In [394]:
#unbalanced
tv_lr_train = lr.fit(tv_x_train,y)
eval_tv_lr=ClassifierMetrics(tv_x_train, y, tv_x_val, val_y, tv_lr_train)
eval_tv_lr

{'train_accuracy': 0.8979328268373765,
 'test_accuracy': 0.8974608831226683,
 'test_tpr': array([0.        , 0.01069079, 1.        ]),
 'test_fpr': array([0.        , 0.00229315, 1.        ]),
 'test_auc': 0.504198818969876,
 'test_ap': 0.10416861003364984}

#### TF-IDF undersampled

In [395]:
#undersampled
tv_lr_under=lr.fit(under_x_tv,under_y_tv)
eval_tv_lr_under=ClassifierMetrics(under_x_tv, under_y_tv, 
                                    tv_x_val, val_y, tv_lr_under)

eval_tv_lr_under

{'train_accuracy': 0.7384290638676944,
 'test_accuracy': 0.6579152514059803,
 'test_tpr': array([0.        , 0.66694079, 1.        ]),
 'test_fpr': array([0.        , 0.34310505, 1.        ]),
 'test_auc': 0.6619178691713012,
 'test_ap': 0.15398010054387182}

#### TF-IDF oversampled

In [396]:
#oversampled
tv_lr_over=lr.fit(over_x_tv,over_y_tv)
eval_tv_lr_over=ClassifierMetrics(over_x_tv, over_y_tv, tv_x_val, val_y, tv_lr_over)

eval_tv_lr_over

{'train_accuracy': 0.8223012152584924,
 'test_accuracy': 0.7639345175121109,
 'test_tpr': array([0.        , 0.44161184, 1.        ]),
 'test_fpr': array([0.        , 0.19962814, 1.        ]),
 'test_auc': 0.6209918522580856,
 'test_ap': 0.14505681552449534}

In [402]:
model_names=['Bag of Words (Unbalanced)','Bag of Words (Undersampled)',
             'Bag of Words (Oversampled)', 'Bi-grams (Unbalanced)',
             'Bi-grams (Undersampled)','Bi-grams (Oversampled)',
             'TF-IDF (Unbalanced)','TF-IDF (Undersampled)','TF-IDF (Oversampled)']
test_accs=[eval_bow_lr['test_accuracy'], eval_bow_lr_under['test_accuracy'], 
eval_bow_lr_over['test_accuracy'],eval_bv2_lr['test_accuracy'],
eval_bv2_lr_under['test_accuracy'],eval_bv2_lr_over['test_accuracy'],
eval_tv_lr['test_accuracy'],eval_tv_lr_under['test_accuracy'], 
eval_tv_lr_over['test_accuracy']]

test_tprs=[eval_bow_lr['test_tpr'], eval_bow_lr_under['test_tpr'], eval_bow_lr_over['test_tpr'],
eval_bv2_lr['test_tpr'],eval_bv2_lr_under['test_tpr'],eval_bv2_lr_over['test_tpr'],
eval_tv_lr['test_tpr'],eval_tv_lr_under['test_tpr'], eval_tv_lr_over['test_tpr']]

test_fprs=[eval_bow_lr['test_fpr'], eval_bow_lr_under['test_fpr'], eval_bow_lr_over['test_fpr'],
eval_bv2_lr['test_fpr'],eval_bv2_lr_under['test_fpr'],eval_bv2_lr_over['test_fpr'],
eval_tv_lr['test_fpr'],eval_tv_lr_under['test_fpr'], eval_tv_lr_over['test_fpr']]

test_aucs=[eval_bow_lr['test_auc'], eval_bow_lr_under['test_auc'], eval_bow_lr_over['test_auc'],
eval_bv2_lr['test_auc'],eval_bv2_lr_under['test_auc'],eval_bv2_lr_over['test_auc'],
eval_tv_lr['test_auc'],eval_tv_lr_under['test_auc'], eval_tv_lr_over['test_auc']]

test_aps=[eval_bow_lr['test_ap'], eval_bow_lr_under['test_ap'], eval_bow_lr_over['test_ap'],
eval_bv2_lr['test_ap'],eval_bv2_lr_under['test_ap'],eval_bv2_lr_over['test_ap'],
eval_tv_lr['test_ap'],eval_tv_lr_under['test_ap'], eval_tv_lr_over['test_ap']]

data={'Model':model_names,'Test Accuracy':test_accs,'Test TPR':test_tprs,
      'Test FPR':test_fprs,'Test AUC':test_aucs, 'Test AP':test_aps}
evals=pd.DataFrame(data)
evals

Unnamed: 0,Model,Test Accuracy,Test TPR,Test FPR,Test AUC,Test AP
0,Bag of Words (Unbalanced),0.892366,"[0.0, 0.03481359649122807, 1.0]","[0.0, 0.010691044313603966, 1.0]",0.512061,0.107396
1,Bag of Words (Undersampled),0.596191,"[0.0, 0.6949013157894737, 1.0]","[0.0, 0.41496746203904555, 1.0]",0.639967,0.141597
2,Bag of Words (Oversampled),0.722061,"[0.0, 0.49890350877192985, 1.0]","[0.0, 0.2527114967462039, 1.0]",0.623096,0.141922
3,Bi-grams (Unbalanced),0.894955,"[0.0, 0.027960526315789474, 1.0]","[0.0, 0.0070343972730089865, 1.0]",0.510463,0.107393
4,Bi-grams (Undersampled),0.570271,"[0.0, 0.7003837719298246, 1.0]","[0.0, 0.4444375581035017, 1.0]",0.627973,0.136336
5,Bi-grams (Oversampled),0.789242,"[0.0, 0.32648026315789475, 1.0]","[0.0, 0.15844437558103502, 1.0]",0.584018,0.130087
6,TF-IDF (Unbalanced),0.897461,"[0.0, 0.01069078947368421, 1.0]","[0.0, 0.002293151533932445, 1.0]",0.504199,0.104169
7,TF-IDF (Undersampled),0.657915,"[0.0, 0.6669407894736842, 1.0]","[0.0, 0.3431050511310815, 1.0]",0.661918,0.15398
8,TF-IDF (Oversampled),0.763935,"[0.0, 0.44161184210526316, 1.0]","[0.0, 0.19962813758909204, 1.0]",0.620992,0.145057
