In [1]:
import ast

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#taken from Amelia!

def ClassifierMetrics (X_train, Y_train, X_test, Y_test, fitted_model):
    """
    X_train: training set features
    Y_train: training set labels
    X_test: dev/test set features
    Y_test: dev/test set labels
    fitted_model: 
    """
    Y_pred = fitted_model.predict(X_test)
    # If your classifier doesn't output decision_function, use predict_proba. 
    # Make sure it is taking the prob of the '1' class
    Y_score = fitted_model.predict_proba(X_test)[:,1]
    metrics = {'train_accuracy': fitted_model.score(X_train, Y_train),
               'test_accuracy': fitted_model.score(X_test, Y_test),
               'test_auc_pred': roc_auc_score(Y_test, Y_pred),
               'test_auc_score': roc_auc_score(Y_test, Y_score),
               'test_ap_pred': average_precision_score(Y_test, Y_pred),
               'test_ap_score': average_precision_score(Y_test, Y_score)}
    return metrics

## Loading Data

In [3]:
fake=pd.read_csv('fake_tokens.csv')
fake=fake.rename(columns={'Unnamed: 0':'index'})
real=pd.read_csv('real_tokens.csv')
real=real.rename(columns={'Unnamed: 0':'index'})
valid=pd.read_csv('valid_tokens.csv')

In [4]:
real['filtered_tokens']=real['filtered_tokens'].apply(lambda x: ast.literal_eval(x))
real['filtered_tokens']=real['filtered_tokens'].apply(lambda x: ' '.join(x).split())

fake['filtered_tokens']=fake['filtered_tokens'].apply(lambda x: ast.literal_eval(x))
fake['filtered_tokens']=fake['filtered_tokens'].apply(lambda x: ' '.join(x).split())

valid['filtered_tokens']=valid['filtered_tokens'].apply(lambda x: ast.literal_eval(x))
valid['filtered_tokens']=valid['filtered_tokens'].apply(lambda x: ' '.join(x).split())

val_x=valid.drop(['label'],axis=1)
val_y=valid['label']

In [5]:
full_df=real.append(fake,sort=False)
x=full_df.drop(['label'],axis=1)
y=full_df['label']

In [15]:
full_df

Unnamed: 0,index,ex_id,user_id,prod_id,rating,label,date,review,tokens,filtered_tokens
0,11,15,938,0,5.0,0,2014-10-10,I had the braised lamb sandwich and was one of...,"['I', 'had', 'the', 'braised', 'lamb', 'sandwi...","[braised, lamb, sandwich, best, sandwiches, li..."
1,12,16,939,0,3.0,0,2014-10-03,This spot is close to my job so I decided to c...,"['This', 'spot', 'is', 'close', 'to', 'my', 'j...","[spot, close, job, decided, check, lunch, pric..."
2,13,18,941,0,5.0,0,2014-08-10,"Needed a quick, healthy light ""lunch"" and this...","['Needed', 'a', 'quick', ',', 'healthy', 'ligh...","[needed, quick, healthy, light, lunch, place, ..."
3,14,19,942,0,4.0,0,2014-06-17,Their artichoke chicken salad is good and plenty.,"['Their', 'artichoke', 'chicken', 'salad', 'is...","[artichoke, chicken, salad, good, plenty]"
4,15,21,944,0,4.0,0,2014-05-21,Needed a quick bite and stopped here because o...,"['Needed', 'a', 'quick', 'bite', 'and', 'stopp...","[needed, quick, bite, stopped, reviews, really..."
5,16,26,949,0,4.0,0,2014-03-01,"Quick, delicious and filling. After a few hour...","['Quick', ',', 'delicious', 'and', 'filling', ...","[quick, delicious, filling, hours, shopping, s..."
6,17,27,950,0,4.0,0,2014-01-24,"Delicious, consistent, well-priced. Food taste...","['Delicious', ',', 'consistent', ',', 'well-pr...","[delicious, consistent, wellpriced, food, tast..."
7,18,28,951,0,5.0,0,2014-01-18,"Really quaint little ""hole in the wall"" kind o...","['Really', 'quaint', 'little', '``', 'hole', '...","[really, quaint, little, hole, wall, kind, sto..."
8,19,29,952,0,5.0,0,2014-01-16,Delicious lamb sandwich,"['Delicious', 'lamb', 'sandwich']","[delicious, lamb, sandwich]"
9,20,32,955,0,2.0,0,2013-09-10,Been meaning to try this place for a while-hig...,"['Been', 'meaning', 'to', 'try', 'this', 'plac...","[meaning, try, place, whilehighly, recommended..."


In [6]:
num_reviews_user=full_df[['user_id','index']]
num_reviews_user=num_reviews_user.groupby('user_id').count()
num_reviews_user=num_reviews_user.reset_index()
num_reviews_user=num_reviews_user.rename(columns={'index':'train_num_reviews_u'})
num_reviews_user2=val_x[['user_id','review']]
num_reviews_user2=num_reviews_user2.groupby('user_id').count()
num_reviews_user2=num_reviews_user2.reset_index()
num_reviews_user2=num_reviews_user2.rename(columns={'review':'val_num_reviews_u'})

In [7]:
reviews_user = pd.merge(num_reviews_user, num_reviews_user2, on='user_id', how='outer')
reviews_user['val_num_reviews_u']=reviews_user['val_num_reviews_u'].fillna(0)
reviews_user['train_num_reviews_u']=reviews_user['train_num_reviews_u'].fillna(0)
reviews_user['cumul_num_reviews_u']=reviews_user['train_num_reviews_u']+reviews_user['val_num_reviews_u']

In [8]:
num_reviews_prod=full_df[['prod_id','index']]
num_reviews_prod=num_reviews_prod.groupby('prod_id').count()
num_reviews_prod=num_reviews_prod.reset_index()
num_reviews_prod=num_reviews_prod.rename(columns={'index':'train_num_reviews_p'})
num_reviews_prod2=val_x[['prod_id','review']]
num_reviews_prod2=num_reviews_prod2.groupby('prod_id').count()
num_reviews_prod2=num_reviews_prod2.reset_index()
num_reviews_prod2=num_reviews_prod2.rename(columns={'review':'val_num_reviews_p'})

In [9]:
reviews_prod = pd.merge(num_reviews_prod, num_reviews_prod2, on='prod_id', how='outer')
reviews_prod['val_num_reviews_p']=reviews_prod['val_num_reviews_p'].fillna(0)
reviews_prod['train_num_reviews_p']=reviews_prod['train_num_reviews_p'].fillna(0)
reviews_prod['cumul_num_reviews_p']=reviews_prod['train_num_reviews_p']+reviews_prod['val_num_reviews_p']

In [10]:
reviews_user_train=reviews_user[['user_id','train_num_reviews_u']]
reviews_prod_train=reviews_prod[['prod_id','train_num_reviews_p']]

reviews_user_val=reviews_user[['user_id','cumul_num_reviews_u']]
reviews_prod_val=reviews_prod[['prod_id','cumul_num_reviews_p']]

In [11]:
train = pd.merge(full_df, reviews_user_train, on='user_id', how='left')
train = pd.merge(train, reviews_prod_train, on='prod_id', how='left')

val = pd.merge(valid, reviews_user_val , on='user_id', how='left')
val = pd.merge(val, reviews_prod_val , on='prod_id', how='left')

In [12]:
#random undersampling
train_0=train[train['label']==0]
train_1=train[train['label']==1]
train_0= train_0.sample(len(train_1))
train = pd.concat([train_0, train_1], axis=0)

In [22]:
train=train.rename(columns={'train_num_reviews_u':'num_reviews_u','train_num_reviews_p':'num_reviews_p'})
val=val.rename(columns={'cumul_num_reviews_u':'num_reviews_u','cumul_num_reviews_p':'num_reviews_p'})

In [33]:
val['filtered_tokens']=val['filtered_tokens'].apply(lambda x: " ".join(i for i in x))
train['filtered_tokens']=train['filtered_tokens'].apply(lambda x: " ".join(i for i in x))

In [30]:
val['filtered_tokens'][0]

['around',
 'good',
 'place',
 'cozy',
 'came',
 'nt',
 'huge',
 'appetite',
 'stuck',
 'appetizers',
 'friend',
 'combo',
 'platter',
 'full']

In [34]:
feature_cols = ['rating', 'filtered_tokens', 'num_reviews_u', 'num_reviews_p']
train_x=train[feature_cols]
train_y=train['label']
val_x=val[feature_cols]
val_y=val['label']

In [35]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

bow_vectorizer = CountVectorizer(stop_words='english', binary=True)


params = {'max_depth':20,'n_estimators': 100,
  'min_samples_split':0.056, 
  'random_state': 519}

In [37]:
pipeline = Pipeline([
    ('transformer',  make_column_transformer((StandardScaler(), ['num_reviews_u', 'num_reviews_p']),
                                            (bow_vectorizer, 'filtered_tokens'),
                                            remainder = 'passthrough')),
    ('fitted_randomforest', RandomForestClassifier(**params)),
])

fitted_model = pipeline.fit(train_x, train_y)

In [38]:
metrics = ClassifierMetrics(train_x, train_y, val_x, val_y, fitted_model)

In [39]:
metrics

{'train_accuracy': 0.6749486812037647,
 'test_accuracy': 0.635419566791024,
 'test_auc_pred': 0.6650806619314011,
 'test_auc_score': 0.7228826412900473,
 'test_ap_pred': 0.15372049580975386,
 'test_ap_score': 0.21893688958109522}