In [1]:
import json
import os
import pickle
from ast import literal_eval

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.svm import SVC, LinearSVC
from ttictoc import Timer

  import pandas.util.testing as tm


In [2]:
# Directory prefix for the pickle/JSON helpers below. It is concatenated
# directly with file names, so it must end with a path separator.
# Set FAKE_REVIEW_DATA_DIR to point at another checkout; when unset, the
# original hard-coded location is used. os.path.join(..., '') appends a
# trailing separator only if one is missing.
base = os.path.join(
    os.environ.get(
        'FAKE_REVIEW_DATA_DIR',
        '/Users/chuamelia/Google Drive/Spring 2020/Machine Learning/fake-review-detection-project/data/processed/dev/',
    ),
    '',
)

def load_obj(fname,  base=base):
    """Unpickle and return the object stored at ``base + fname + '.pkl'``.

    Parameters
    ----------
    fname : str
        File name without the ``.pkl`` extension.
    base : str
        Directory prefix, concatenated directly with ``fname``.

    Returns
    -------
    object
        Whatever object was pickled in the file.
    """
    pickle_path = base + fname + '.pkl'
    # NOTE: pickle.load can execute arbitrary code during deserialisation;
    # only point this at files you created or otherwise trust.
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded


def writeJsonFile(fname, data,  base=base):
    """Dump ``data`` as JSON to ``base + fname + '.json'`` and print a confirmation.

    Parameters
    ----------
    fname : str
        File name without the ``.json`` extension.
    data : object
        Any value ``json.dump`` can serialise.
    base : str
        Directory prefix, concatenated directly with ``fname``.
    """
    json_path = base + fname + '.json'
    with open(json_path, 'w') as handle:
        json.dump(data, handle)
    # The message reports only the bare name, not the full path.
    print('Successfully written to {}'.format(fname))
    
def readJsonFile(fname, base=base):
    """Parse and return the JSON document stored at ``base + fname + '.json'``.

    Parameters
    ----------
    fname : str
        File name without the ``.json`` extension.
    base : str
        Directory prefix, concatenated directly with ``fname``.

    Returns
    -------
    object
        The decoded JSON content.
    """
    json_path = base + fname + '.json'
    with open(json_path, 'r') as handle:
        return json.load(handle)

In [3]:
def ClassifierMetrics(X_train, Y_train, X_test, Y_test, fitted_model):
    """Collect train/test evaluation metrics for an already-fitted classifier.

    Parameters
    ----------
    X_train, Y_train
        Training features and labels, used only for ``train_accuracy``.
    X_test, Y_test
        Held-out features and labels used for every ``test_*`` metric.
    fitted_model
        Estimator exposing ``predict``, ``decision_function`` and ``score``.

    Returns
    -------
    dict
        Keys, in order: ``train_accuracy``, ``test_accuracy``,
        ``test_auc_pred``, ``test_auc_score``, ``test_ap_pred``,
        ``test_ap_score``. The ``*_pred`` variants are computed from the
        output of ``predict``; the ``*_score`` variants from the output of
        ``decision_function``.
    """
    predicted_labels = fitted_model.predict(X_test)
    decision_values = fitted_model.decision_function(X_test)

    results = {}
    results['train_accuracy'] = fitted_model.score(X_train, Y_train)
    results['test_accuracy'] = fitted_model.score(X_test, Y_test)
    # ROC AUC, first on predicted labels and then on decision values.
    results['test_auc_pred'] = roc_auc_score(Y_test, predicted_labels)
    results['test_auc_score'] = roc_auc_score(Y_test, decision_values)
    # Average precision, same two variants.
    results['test_ap_pred'] = average_precision_score(Y_test, predicted_labels)
    results['test_ap_score'] = average_precision_score(Y_test, decision_values)
    return results

In [4]:
def fitted_lsvm(params, X_train, Y_train):
    """Build a ``LinearSVC`` from ``params`` and fit it on the training data.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to the ``LinearSVC`` constructor.
    X_train, Y_train
        Training features and labels passed to ``fit``.

    Returns
    -------
    The value returned by ``LinearSVC.fit``.
    """
    return LinearSVC(**params).fit(X_train, Y_train)


def fitted_lsvm_sgd(params, X_train, Y_train):
    """Build an ``SGDClassifier`` from ``params`` and fit it on the training data.

    ``SGDClassifier`` comes from ``sklearn.linear_model`` and must be imported
    in the notebook's import cell; without that import this function raises
    ``NameError`` on its first call.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to the ``SGDClassifier`` constructor.
    X_train, Y_train
        Training features and labels passed to ``fit``.

    Returns
    -------
    The value returned by ``SGDClassifier.fit``.
    """
    sgd_model = SGDClassifier(**params)
    return sgd_model.fit(X_train, Y_train)

In [5]:
def identity_tokenizer(tokens):
    """Return ``tokens`` unchanged (a no-op tokenizer for pre-tokenized input).

    NOTE(review): presumably the pickled TF-IDF vectorizer loaded in
    ``tfidf_lsvm_metrics`` was built with this function as its tokenizer, in
    which case it must stay defined under this exact name for unpickling to
    succeed; confirm against the notebook that created the vectorizer.
    """
    return tokens

def tfidf_lsvm_metrics(params, dev_name, train_num):
    """Fit a LinearSVC on one TF-IDF-encoded training split and score it on a dev set.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``LinearSVC`` (forwarded through ``fitted_lsvm``).
    dev_name : str
        Stem of the evaluation CSV under ``../../data/processed/dev/``.
    train_num : int
        Index of the training split; selects both the training CSV and the
        matching pickled TF-IDF vectorizer.

    Returns
    -------
    dict
        The metrics dictionary produced by ``ClassifierMetrics``.

    Notes
    -----
    The CSVs are read from a path relative to the working directory, while
    the vectorizer is loaded through ``load_obj`` and its ``base`` prefix.
    NOTE(review): both presumably point at the same directory; confirm.
    """
    train_path = '../../data/processed/dev/ac4119_train_set_{0}_w_tokens.csv'.format(train_num)
    dev_path = '../../data/processed/dev/{0}.csv'.format(dev_name)

    train_df = pd.read_csv(train_path)
    dev_df = pd.read_csv(dev_path)

    train_labels = train_df['label']
    dev_labels = dev_df['label']

    vectorizer = load_obj('ac4119_X_train_set_{0}_tfidf_vectorizer'.format(train_num))

    # The token_review column holds Python literals serialised as text
    # (presumably token lists); literal_eval parses them back into objects.
    train_tokens = train_df['token_review'].apply(literal_eval)
    dev_tokens = dev_df['token_review'].apply(literal_eval)

    # The vectorizer is already fitted, so only transform() is called here.
    train_features = vectorizer.transform(train_tokens)
    dev_features = vectorizer.transform(dev_tokens)

    model = fitted_lsvm(params, train_features, train_labels)
    return ClassifierMetrics(train_features, train_labels, dev_features, dev_labels, model)

In [10]:
# Accumulates one result record (dict) per model attempt; filled by the
# grid-search cell and later written to JSON. Re-running this cell clears it.
all_attempts = list()

In [None]:
# Hyper-parameter grid for LinearSVC.
cs = [0.01, 0.001]
losses = ['hinge', 'squared_hinge']
penalty = ['l2', 'l1']
# LinearSVC does not support penalty='l1' together with loss='hinge', so
# that pair is left out of the grid instead of failing mid-run.
lsvm_params_combos = [(c, l, p)
                      for c in cs for l in losses for p in penalty
                      if not (p == 'l1' and l == 'hinge')]

for train_num in range(8):
    # Evaluate each training split against the dev set and every other split.
    other_train_sets = ['ac4119_train_set_{0}_w_tokens'.format(i) for i in range(8) if i != train_num]
    dev_sets = ['ac4119_dev_w_tokens'] + other_train_sets

    for dev_name in dev_sets:
        # Unpack one (C, loss, penalty) combination per attempt. The names
        # used inside the comprehension above are not visible here, and
        # `penalty` is the whole list, so neither can go into `params`.
        for c, loss, pen in lsvm_params_combos:
            params = {'random_state': 519,
                      'C': c,
                      'loss': loss,
                      'penalty': pen,
                      # l1 + squared_hinge is only solvable in the primal
                      # (dual=False); l2 + hinge only in the dual (dual=True).
                      'dual': pen == 'l2',
                      'class_weight': 'balanced'}
            t = Timer()
            t.start()

            metrics = tfidf_lsvm_metrics(params, dev_name, train_num)
            elapsed = t.stop()

            print('Elapsed time:', elapsed)

            model_attempt_details = {'elapsed_time': elapsed,
                                     'train_set': train_num,
                                     'dev_set': dev_name,
                                     'vectorizer': 'tfidf',
                                     'model': 'LinearSVC',
                                     'params': params, 'metrics': metrics}

            all_attempts.append(model_attempt_details)

In [15]:
# Persist every recorded attempt as <base>/<fname>.json; the trailing date
# in the name tags this particular run.
fname = 'all_attempts_ac4119_20200516'
writeJsonFile(fname=fname, data=all_attempts)

Successfully written to all_attempts_ac4119
