#### Libraries & UDFs

In [1]:
from ttictoc import Timer
import pickle
import json
from ast import literal_eval

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score

In [2]:
# Directory prefix for processed data files; it is the default `base` of the
# I/O helpers below and is concatenated directly with file names, so it must
# keep its trailing slash.
# NOTE(review): absolute, machine-specific path, while other cells read from
# the relative '../../data/processed/dev/' -- consider unifying the two.
base = '/Users/chuamelia/Google Drive/Spring 2020/Machine Learning/fake-review-detection-project/data/processed/dev/'

def load_obj(fname,  base=base):
    """Unpickle and return the object stored at ``base + fname + '.pkl'``.

    Parameters
    ----------
    fname : str
        File name without the ``.pkl`` extension.
    base : str
        Prefix concatenated directly in front of ``fname`` (no separator is
        inserted).

    Returns
    -------
    object
        Whatever object the pickle file contains.
    """
    pickle_path = base + fname + '.pkl'
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded


def writeJsonFile(fname, data,  base=base):
    """Dump ``data`` as JSON to ``base + fname + '.json'`` and print a confirmation.

    Parameters
    ----------
    fname : str
        File name without the ``.json`` extension.
    data : object
        Any JSON-serializable object.
    base : str
        Prefix concatenated directly in front of ``fname``.
    """
    json_path = base + fname +'.json'
    with open(json_path, 'w') as sink:
        json.dump(data, sink)
    # Only the bare file name is reported, not the full path.
    print('Successfully written to {}'.format(fname))
    
def readJsonFile(fname, base=base):
    """Parse and return the JSON document stored at ``base + fname + '.json'``.

    Parameters
    ----------
    fname : str
        File name without the ``.json`` extension.
    base : str
        Prefix concatenated directly in front of ``fname``.

    Returns
    -------
    object
        The decoded JSON content.
    """
    json_path = base + fname + '.json'
    with open(json_path, 'r') as source:
        return json.load(source)

In [3]:
def identity_tokenizer(tokens):
    """Return ``tokens`` unchanged.

    Passed as the ``tokenizer`` of the TF-IDF vectorizer so that input which
    is already tokenized is used as-is.
    """
    already_tokenized = tokens
    return already_tokenized

In [4]:
def ClassifierMetrics (X_train, Y_train, X_test, Y_test, fitted_model):
    """Evaluate an already-fitted model on its training data and a held-out set.

    Parameters
    ----------
    X_train, Y_train
        Features and labels the model was fitted on (used for
        ``train_accuracy`` only).
    X_test, Y_test
        Held-out features and labels.
    fitted_model
        Object exposing ``predict``, ``decision_function`` and ``score``.

    Returns
    -------
    dict
        ``train_accuracy`` / ``test_accuracy`` come from ``fitted_model.score``.
        The ``*_pred`` entries are ROC-AUC / average precision computed from
        the hard predictions, and the ``*_score`` entries are the same metrics
        computed from the ``decision_function`` output.
    """
    hard_labels = fitted_model.predict(X_test)
    margins = fitted_model.decision_function(X_test)

    metrics = {}
    metrics['train_accuracy'] = fitted_model.score(X_train, Y_train)
    metrics['test_accuracy'] = fitted_model.score(X_test, Y_test)
    metrics['test_auc_pred'] = roc_auc_score(Y_test, hard_labels)
    metrics['test_auc_score'] = roc_auc_score(Y_test, margins)
    metrics['test_ap_pred'] = average_precision_score(Y_test, hard_labels)
    metrics['test_ap_score'] = average_precision_score(Y_test, margins)
    return metrics

#### Reading in Data

##### Lookup Tables

In [5]:
# Review-count lookup tables read from the `base` directory. Later cells
# select 'user_id' / 'prod_id' plus the 'train_num_reviews' and
# 'cumulative_total_train_dev_test_reviews' columns from them.
num_reviews_by_user = pd.read_csv(base + 'num_reviews_by_user.csv')
num_reviews_by_prod = pd.read_csv(base + 'num_reviews_by_prod.csv')

##### Construct Dev Set

In [6]:
dev_fname = '../../data/processed/dev/ac4119_dev_w_tokens.csv'
dev = pd.read_csv(dev_fname)

# 'token_review' is stored as text in the CSV; parse every cell back into the
# Python literal it represents (presumably a list of tokens).
dev['token_review'] = dev['token_review'].apply(literal_eval)

# Why the cumulative counts are used for dev:
#   * While training, only the training-split counts are visible to the model.
#   * At dev/test time the cumulative counts are available, but strictly as
#     INPUT features -- they must never be used to fit the model.
#   * Supplying them as inputs at dev/test time is therefore realistic.
dev_num_reviews_by_user = (
    num_reviews_by_user[['user_id', 'cumulative_total_train_dev_test_reviews']]
    .rename(columns={'cumulative_total_train_dev_test_reviews': 'num_user_reviews'})
)

In [7]:
# Per-product lookup for the dev set: cumulative review count, exposed under
# the feature name 'num_prod_reviews'.
dev_num_reviews_by_prod = (
    num_reviews_by_prod[['prod_id', 'cumulative_total_train_dev_test_reviews']]
    .rename(columns={'cumulative_total_train_dev_test_reviews': 'num_prod_reviews'})
)

In [8]:
# Attach the per-user and per-product review counts to every dev row
# (left joins keep all dev rows; unmatched ids get NaN counts).
#
# Any count columns left over from a previous run of this cell are dropped
# first. Without this, re-running the cell would merge the counts a second
# time, pandas would suffix the clashing columns with _x/_y, and the later
# dev[feature_cols] lookup would fail. On a first run the drop is a no-op.
dev = dev.drop(columns=['num_user_reviews', 'num_prod_reviews'], errors='ignore')
dev = pd.merge(dev, dev_num_reviews_by_user, on='user_id', how='left')
dev = pd.merge(dev, dev_num_reviews_by_prod, on='prod_id', how='left')

In [9]:
def getTrainSet(i):
    """Load training split ``i`` and attach the train-only review counts.

    Parameters
    ----------
    i
        Value substituted into the file name
        ``ac4119_train_set_{i}_w_tokens.csv``.

    Returns
    -------
    pandas.DataFrame
        The training rows with 'token_review' parsed via ``literal_eval`` and
        two left-joined columns, 'num_user_reviews' and 'num_prod_reviews',
        taken from the 'train_num_reviews' column of the module-level lookup
        tables ``num_reviews_by_user`` / ``num_reviews_by_prod``.
    """
    train_fname = '../../data/processed/dev/ac4119_train_set_{0}_w_tokens.csv'.format(i)
    train = pd.read_csv(train_fname)
    train['token_review'] = train['token_review'].apply(literal_eval)

    # Only training-split counts are used here, never the cumulative ones.
    user_counts = num_reviews_by_user[['user_id', 'train_num_reviews']].rename(
        columns={'train_num_reviews': 'num_user_reviews'})
    prod_counts = num_reviews_by_prod[['prod_id', 'train_num_reviews']].rename(
        columns={'train_num_reviews': 'num_prod_reviews'})

    return (
        train
        .merge(user_counts, on='user_id', how='left')
        .merge(prod_counts, on='prod_id', how='left')
    )

#### Setting Pipeline

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

In [11]:
# Columns handed to the model pipeline; missing values (e.g. counts with no
# match in the left joins) are replaced by 0.
feature_cols = ['rating', 'token_review', 'num_user_reviews', 'num_prod_reviews']
X_dev = dev[feature_cols].fillna(0)
# Target labels for the dev set.
Y_dev = dev['label']

In [12]:
def trainModel(params, X_train, Y_train):
    """Build the preprocessing + SGD pipeline and fit it on the training data.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded verbatim to ``SGDClassifier``.
    X_train
        Frame containing 'num_user_reviews', 'num_prod_reviews' and
        'token_review'; any other columns are passed through untouched.
    Y_train
        Training labels.

    Returns
    -------
    The fitted ``Pipeline`` with steps 'transformer' and 'fitted_svm'.
    """
    # TF-IDF over input that is already tokenized, so the tokenizer is the
    # identity function and no lowercasing is applied.
    # NOTE(review): the repeated "stop_words" warnings in the grid-search
    # output look like they come from combining a custom tokenizer with
    # stop_words='english' -- confirm.
    token_tfidf = TfidfVectorizer(
        tokenizer=identity_tokenizer,
        decode_error='ignore',
        stop_words='english',
        lowercase=False,
        binary=True,
        min_df=0.01,
    )

    count_columns = ['num_user_reviews', 'num_prod_reviews']
    # remainder='passthrough' keeps the remaining columns (i.e. rating) as-is.
    column_transformer = make_column_transformer(
        (StandardScaler(), count_columns),
        (token_tfidf, 'token_review'),
        remainder='passthrough',
    )

    classifier = SGDClassifier(**params)
    pipeline = Pipeline([
        ('transformer', column_transformer),
        ('fitted_svm', classifier),
    ])

    return pipeline.fit(X_train, Y_train)

### Grid Search

In [13]:
# Accumulates one result record per fitted model during the grid search.
# Re-running this cell discards everything collected so far.
all_attempts = []

In [14]:
# Hyperparameter grid for the SGD search.
# Indices of the training splits passed to getTrainSet.
sets = [1,3]
# NOTE(review): newer scikit-learn releases spell the logistic loss
# 'log_loss' instead of 'log' -- confirm against the installed version.
losses = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
# Values passed as SGDClassifier's `alpha`.
alphas = [0.00001, 0.000001, 0.1, 1, 10]
# Every (alpha, loss, training-set index) triple; alpha varies slowest and
# the set index fastest.
sgd_params_combos = [(a,l,i) for a in alphas for l in losses for i in sets]

In [15]:
# Grid search: fit one pipeline per (alpha, loss, training-set index) triple,
# score it on the dev set, and append the result to `all_attempts`.
# Re-running this cell without resetting `all_attempts` appends duplicates.
feature_cols = ['rating', 'token_review', 'num_user_reviews', 'num_prod_reviews']

# Each training split is read and parsed only once and then reused for every
# (alpha, loss) pair, instead of being reloaded from CSV on every iteration.
train_xy_by_set = {}

for p in sgd_params_combos:
    a,l,i = p
    if i not in train_xy_by_set:
        train = getTrainSet(i)
        train_xy_by_set[i] = (train[feature_cols].fillna(0), train['label'])
    X_train, Y_train = train_xy_by_set[i]

    # Defining model params
    params = {'alpha': a,
              'class_weight': 'balanced',
              'loss': l,
              'penalty': 'l2',
              'random_state': 519}

    fitted_model = trainModel(params, X_train, Y_train)
    metrics = ClassifierMetrics(X_train, Y_train, X_dev, Y_dev, fitted_model)
    # 'train_set' records which training split produced this result; without
    # it two records with identical params cannot be told apart.
    model_attempt_details = {'params': params, 'metrics': metrics, 'train_set': i}

    all_attempts.append(model_attempt_details)

  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))


  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))


  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))


In [16]:
# File name of the model attempts/results.
# writeJsonFile appends '.json' and prefixes its default `base` directory.
fname = 'sgd_attempts_ac4119_202005119b'
writeJsonFile(fname, all_attempts)

Successfully written to sgd_attempts_ac4119_202005119b


In [20]:
# Display every collected grid-search record as the cell output.
all_attempts

[{'test_accuracy': 0.7536054346010357,
  'test_ap_pred': 0.17080233911090023,
  'test_ap_score': 0.23832333162772845,
  'test_auc_pred': 0.6731094360766341,
  'test_auc_score': 0.7591436138475255,
  'train_accuracy': 0.6969918664199337},
 {'test_accuracy': 0.7132078623531377,
  'test_ap_pred': 0.17235727048091248,
  'test_ap_score': 0.24665313890511958,
  'test_auc_pred': 0.6881914996275941,
  'test_auc_score': 0.7601918808681141,
  'train_accuracy': 0.7032319146189268},
 {'test_accuracy': 0.6531544072609834,
  'test_ap_pred': 0.1677031606244063,
  'test_ap_score': 0.23594724337006595,
  'test_auc_pred': 0.6931855805457244,
  'test_auc_score': 0.7570887971012129,
  'train_accuracy': 0.6947540560313293},
 {'test_accuracy': 0.8046383428921432,
  'test_ap_pred': 0.16282474902893246,
  'test_ap_score': 0.23901825466984344,
  'test_auc_pred': 0.6406052923795388,
  'test_auc_score': 0.7561975454498503,
  'train_accuracy': 0.6822309248181779},
 {'test_accuracy': 0.38754941811904897,
  'test_a

### Apply to test set

In [None]:
test_fname = '../../data/processed/dev/ac4119_test_set_w_tokens.csv'
test = pd.read_csv(test_fname)

# As for dev, the test set uses the cumulative review counts as input features.
test_num_reviews_by_prod = (
    num_reviews_by_prod[['prod_id', 'cumulative_total_train_dev_test_reviews']]
    .rename(columns={'cumulative_total_train_dev_test_reviews': 'num_prod_reviews'})
)
test_num_reviews_by_user = (
    num_reviews_by_user[['user_id', 'cumulative_total_train_dev_test_reviews']]
    .rename(columns={'cumulative_total_train_dev_test_reviews': 'num_user_reviews'})
)

# Left joins keep every test row; the user counts are attached first, then
# the product counts.
test = (
    test
    .merge(test_num_reviews_by_user, on='user_id', how='left')
    .merge(test_num_reviews_by_prod, on='prod_id', how='left')
)

In [None]:
# Parse the stringified 'token_review' cells back into Python objects.
# Cells that are already lists are left alone so this cell can be re-run:
# `test` is loaded in a different cell, and calling literal_eval on an
# already-parsed list raises. Every other value is still handed to
# literal_eval as before.
test['token_review'] = test['token_review'].apply(
    lambda x: x if isinstance(x, list) else literal_eval(x))

# Same feature columns as used for train/dev; missing values become 0.
feature_cols = ['rating', 'token_review', 'num_user_reviews', 'num_prod_reviews']
X_test = test[feature_cols].fillna(0)

Y_test = test['label']

In [None]:
# NOTE(review): in the cells shown, `fitted_model` is only assigned inside the
# grid-search loop, so at this point it is the model fitted for the LAST entry
# of sgd_params_combos -- not one selected by dev-set metrics. Confirm this is
# the intended model before using these scores.
Y_pred = fitted_model.predict(X_test)
Y_score = fitted_model.decision_function(X_test)

In [None]:
# Sanity check: number of decision scores produced.
len(Y_score)

In [None]:
# Store the decision scores on the test frame (adds or overwrites the
# 'Y_score' column in place).
test['Y_score'] = Y_score 

In [None]:
# Preview the first five rows of the test frame.
test.head(5)

In [None]:
# Keep only the example id and its score, ordered by ascending ex_id.
predictions = test[['ex_id', 'Y_score']].sort_values(by='ex_id', ascending=True)

In [None]:
# Preview the first five rows of the predictions frame.
predictions.head(5)

In [None]:
# Shell escape: print the kernel's current working directory.
!pwd

In [None]:
# Write the scores (already ordered by ex_id) as a single column with no
# header row and no index.
# NOTE(review): absolute, machine-specific output path; this also re-binds
# `fname`, which earlier held the JSON results file name.
fname = '/Users/chuamelia/Google Drive/Spring 2020/Machine Learning/fake-review-detection-project/predictions.csv'
predictions['Y_score'].to_csv(fname, header=False, index=False)