#### Libraries & UDFs

In [1]:
from ttictoc import Timer
import pickle
import json
from ast import literal_eval

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score

import matplotlib.pyplot as plt
import seaborn as sns

  import pandas.util.testing as tm


In [2]:
# Directory that the helpers below read from / write to (pickles, JSON files).
# NOTE(review): this is an absolute, machine-specific path, while the CSV reads
# later in the notebook use '../../data/processed/dev/' -- presumably the same
# directory; confirm and consider unifying on one configurable location.
base = '/Users/chuamelia/Google Drive/Spring 2020/Machine Learning/fake-review-detection-project/data/processed/dev/'


def load_obj(fname, base=base):
    """Unpickle and return the object stored at ``base + fname + '.pkl'``.

    Parameters
    ----------
    fname : str
        File name without the ``.pkl`` extension.
    base : str
        Directory prefix, concatenated as-is (so it must end with a path
        separator).

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    pickle_path = base + fname + '.pkl'
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point this at artifacts produced by this project.
    with open(pickle_path, 'rb') as handle:
        obj = pickle.load(handle)
    return obj


def writeJsonFile(fname, data,  base=base):
    """Dump ``data`` as JSON to ``base + fname + '.json'`` and print a confirmation.

    Parameters
    ----------
    fname : str
        File name without the ``.json`` extension.
    data : object
        Anything ``json.dump`` can serialize.
    base : str
        Directory prefix, concatenated as-is.
    """
    target_path = base + fname + '.json'
    with open(target_path, 'w') as sink:
        json.dump(data, sink)
    # The message reports only the bare name, not the full path.
    print('Successfully written to {}'.format(fname))
    
def readJsonFile(fname, base=base):
    """Parse and return the JSON document stored at ``base + fname + '.json'``.

    Parameters
    ----------
    fname : str
        File name without the ``.json`` extension.
    base : str
        Directory prefix, concatenated as-is.

    Returns
    -------
    object
        The decoded JSON content.
    """
    source_path = base + fname + '.json'
    with open(source_path, 'r') as source:
        return json.load(source)

In [3]:
def identity_tokenizer(tokens):
    """Return ``tokens`` unchanged.

    Passed as the ``tokenizer`` of the ``TfidfVectorizer`` built later in this
    notebook, so that the already-tokenized ``token_review`` values are used
    as-is instead of being split again.
    """
    return tokens

In [5]:
def ClassifierMetrics(X_train, Y_train, X_test, Y_test, fitted_model):
    """Collect accuracy, ROC-AUC and average-precision figures for a fitted model.

    Parameters
    ----------
    X_train, Y_train
        Training features / labels, used only for ``train_accuracy``.
    X_test, Y_test
        Held-out features / labels used for every ``test_*`` entry.
    fitted_model
        An already-fitted estimator exposing ``predict``,
        ``decision_function`` and ``score``.

    Returns
    -------
    dict
        Keys ``train_accuracy``, ``test_accuracy``, ``test_auc_pred``,
        ``test_auc_score``, ``test_ap_pred`` and ``test_ap_score``. The
        ``*_pred`` entries are computed from the output of ``predict``; the
        ``*_score`` entries from the output of ``decision_function``.
    """
    hard_labels = fitted_model.predict(X_test)
    decision_values = fitted_model.decision_function(X_test)

    results = {}
    results['train_accuracy'] = fitted_model.score(X_train, Y_train)
    results['test_accuracy'] = fitted_model.score(X_test, Y_test)
    # Ranking metrics, once on the hard labels and once on the decision values.
    results['test_auc_pred'] = roc_auc_score(Y_test, hard_labels)
    results['test_auc_score'] = roc_auc_score(Y_test, decision_values)
    results['test_ap_pred'] = average_precision_score(Y_test, hard_labels)
    results['test_ap_score'] = average_precision_score(Y_test, decision_values)
    return results

#### Reading in Data

In [6]:
# Number of the training-set file to load, and the name of the dev-set file.
i = 3
dev_name = 'ac4119_dev_w_tokens'

train_fname = '../../data/processed/dev/ac4119_train_set_{0}_w_tokens.csv'.format(i)
dev_fname = '../../data/processed/dev/{0}.csv'.format(dev_name)

train, dev = pd.read_csv(train_fname), pd.read_csv(dev_fname)
Y_train, Y_dev = train['label'], dev['label']

# Pickled vectorizer that matches the chosen training set.
# NOTE(review): `tfidf_vectorizer` is rebound to a freshly constructed
# TfidfVectorizer further down, so this loaded object ends up unused there.
tfidf_vectorizer_fname = 'ac4119_X_train_set_{0}_tfidf_vectorizer'.format(i)
tfidf_vectorizer = load_obj(tfidf_vectorizer_fname)

# The CSV stores token_review as text; parse each cell back into a Python
# literal (presumably a list of tokens -- confirm against the CSV).
train['token_review'] = train['token_review'].apply(literal_eval)
dev['token_review'] = dev['token_review'].apply(literal_eval)

##### Joining Lookup Tables

In [7]:
# Per-user and per-product review-count lookup tables, read from `base`.
num_reviews_by_user, num_reviews_by_prod = (
    pd.read_csv(base + lookup_csv)
    for lookup_csv in ('num_reviews_by_user.csv', 'num_reviews_by_prod.csv')
)

In [8]:
# Rationale:
# While training, only the counts observed in the training data are visible,
# so the training frame gets `train_num_reviews`.
train_num_reviews_by_user = (
    num_reviews_by_user[['user_id', 'train_num_reviews']]
    .rename(columns={'train_num_reviews': 'num_user_reviews'})
)

# At dev/test time the cumulative counts are available, but as INPUT ONLY:
# they must not be used to fit the model, yet it is realistic to feed them
# to the fitted model when scoring dev/test rows.
dev_num_reviews_by_user = (
    num_reviews_by_user[['user_id', 'cumulative_total_train_dev_test_reviews']]
    .rename(columns={'cumulative_total_train_dev_test_reviews': 'num_user_reviews'})
)

In [9]:
# Same split as for users: training-only counts for the training frame,
# cumulative counts for the dev frame. Both expose `num_prod_reviews`.
train_num_reviews_by_prod = (
    num_reviews_by_prod[['prod_id', 'train_num_reviews']]
    .rename(columns={'train_num_reviews': 'num_prod_reviews'})
)

dev_num_reviews_by_prod = (
    num_reviews_by_prod[['prod_id', 'cumulative_total_train_dev_test_reviews']]
    .rename(columns={'cumulative_total_train_dev_test_reviews': 'num_prod_reviews'})
)

In [10]:
# Attach the user and product review counts to every review row. Left joins
# keep reviews whose user_id / prod_id is missing from a lookup table (their
# count columns are then NaN).
# NOTE(review): this rebinds `train` / `dev`, so re-running the cell without
# reloading the data merges the count columns in a second time.
train = (
    train
    .merge(train_num_reviews_by_user, on='user_id', how='left')
    .merge(train_num_reviews_by_prod, on='prod_id', how='left')
)

dev = (
    dev
    .merge(dev_num_reviews_by_user, on='user_id', how='left')
    .merge(dev_num_reviews_by_prod, on='prod_id', how='left')
)

#### Setting Pipeline

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

# Columns fed to the pipeline; missing values in any of them are replaced by 0.
feature_cols = ['rating', 'token_review', 'num_user_reviews', 'num_prod_reviews']
X_train, X_dev = (
    frame[feature_cols].fillna(0)
    for frame in (train, dev)
)

In [18]:
# Labels taken from the current `train` / `dev` frames (i.e. after the
# lookup-table merges when the notebook is run top to bottom).
Y_train, Y_dev = train['label'], dev['label']

In [22]:
# TF-IDF settings. The reviews are already tokenized, so identity_tokenizer
# passes each token sequence through untouched and lowercasing is disabled.
# NOTE(review): this rebinds `tfidf_vectorizer`, replacing the pickled
# vectorizer loaded in the data-reading cell.
# NOTE(review): stop_words='english' together with a custom tokenizer looks
# like the source of the stop_words warning printed when the pipeline is
# fitted -- confirm the stop-word list matches how the tokens were produced.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=identity_tokenizer,
    decode_error='ignore',
    stop_words='english',
    lowercase=False,
    binary=True,
    min_df=0.01,
)

# Keyword arguments forwarded to SGDClassifier when the pipeline is built.
params = {
    'alpha': 0.0001,
    'class_weight': 'balanced',
    'loss': 'log',
    'penalty': 'l2',
    'random_state': 519,
}

In [27]:
# setting remainder to passthrough so that the remaining columns (i.e. rating) get included as-is 
# Column-wise preprocessing followed by the linear classifier:
#   * the two review-count columns are standardized,
#   * token_review goes through the TF-IDF vectorizer,
#   * remainder='passthrough' forwards the columns not listed here unchanged.
# NOTE(review): the last step is named 'fitted_svm' although `params` selects
# loss='log'; the name is kept so existing lookups by step name still work.
pipeline = Pipeline(steps=[
    (
        'transformer',
        make_column_transformer(
            (StandardScaler(), ['num_user_reviews', 'num_prod_reviews']),
            (tfidf_vectorizer, 'token_review'),
            remainder='passthrough',
        ),
    ),
    ('fitted_svm', SGDClassifier(**params)),
])

fitted_model = pipeline.fit(X_train, Y_train)

  'stop_words.' % sorted(inconsistent))


In [28]:
# Score the fitted pipeline: accuracy on train, and accuracy / AUC / AP on dev.
metrics = ClassifierMetrics(
    X_train=X_train,
    Y_train=Y_train,
    X_test=X_dev,
    Y_test=Y_dev,
    fitted_model=fitted_model,
)

In [29]:
# Bare last expression: shows the metrics dict as the cell output.
metrics 

{'test_accuracy': 0.6988418063366557,
 'test_ap_pred': 0.17485633426942299,
 'test_ap_score': 0.24497013129895262,
 'test_auc_pred': 0.6969727226145626,
 'test_auc_score': 0.7632626849118458,
 'train_accuracy': 0.7032534320265095}