In [28]:
import numpy as np
import pandas as pd
from itertools import chain
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score


In [8]:
#load data

train_data = pd.read_csv('train.csv')
val_data = pd.read_csv('dev.csv')

In [9]:
train_data

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
0,0,923,0,3.0,1,2014-12-08,The food at snack is a selection of popular Gr...
1,1,924,0,3.0,1,2013-05-16,This little place in Soho is wonderful. I had ...
2,2,925,0,4.0,1,2013-07-01,ordered lunch for 15 from Snack last Friday. ...
3,3,926,0,4.0,1,2011-07-28,This is a beautiful quaint little restaurant o...
4,4,927,0,4.0,1,2010-11-01,Snack is great place for a casual sit down lu...
...,...,...,...,...,...,...,...
250869,358950,14671,349,5.0,0,2014-02-08,Made a reservation for an early dinner Saturda...
250870,358951,3356,349,5.0,0,2014-02-07,"Emily is like Franny's Marco, but with warmth ..."
250871,358953,116424,349,5.0,0,2014-01-31,Can't say enough good things about this place....
250872,358954,161147,349,5.0,0,2014-01-30,"Had a great dinner here- fantastic pizza, the ..."


In [15]:
train_data[:22]

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
0,0,923,0,3.0,1,2014-12-08,The food at snack is a selection of popular Gr...
1,1,924,0,3.0,1,2013-05-16,This little place in Soho is wonderful. I had ...
2,2,925,0,4.0,1,2013-07-01,ordered lunch for 15 from Snack last Friday. ...
3,3,926,0,4.0,1,2011-07-28,This is a beautiful quaint little restaurant o...
4,4,927,0,4.0,1,2010-11-01,Snack is great place for a casual sit down lu...
5,5,928,0,4.0,1,2009-09-02,A solid 4 stars for this greek food spot. If ...
6,7,930,0,4.0,1,2007-05-20,Love this place! Try the Chicken sandwich or ...
7,8,931,0,4.0,1,2005-12-27,My friend and I were intrigued by the nightly ...
8,10,933,0,5.0,1,2014-01-21,pretty cool place...good food...good people
9,12,935,0,5.0,1,2011-01-31,Fabulous Authentic Greek Food!!! This little s...


In [16]:
val_data

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
0,11,934,0,5.0,1,2014-01-20,"all around good place, cozy, I came in and did..."
1,17,940,0,4.0,0,2014-09-16,"For lunch, my friend and I had: -Lamb sandwich..."
2,20,943,0,5.0,0,2014-05-24,Some good Big Greek cooking!! Came to City on ...
3,30,953,0,4.0,0,2013-10-17,So... as you may notice from some of my other ...
4,43,966,0,3.0,0,2012-12-19,"I don't understand the whole ""You can't order ..."
...,...,...,...,...,...,...,...
35913,358855,161115,349,3.0,0,2014-12-15,"Okay, so I gotta repeat the chorus here and st..."
35914,358859,161116,349,2.0,0,2014-12-10,"The pizza is delicious, but it's SO loud here,..."
35915,358884,161125,349,5.0,0,2014-09-05,Emily has hands down the best pizza I've had i...
35916,358894,1423,349,5.0,0,2014-07-13,I'm not entirely sure who came up with the nam...


In [11]:
train_data.describe()

Unnamed: 0,ex_id,user_id,prod_id,rating,label
count,250874.0,250874.0,250874.0,250874.0,250874.0
mean,179413.488815,53970.730446,459.778211,4.023717,0.102916
std,103640.791253,45803.665418,259.854178,1.056995,0.30385
min,0.0,923.0,0.0,1.0,0.0
25%,89526.5,13820.0,247.0,4.0,0.0
50%,179345.5,40485.5,468.0,4.0,0.0
75%,269196.75,87298.0,672.0,5.0,0.0
max,358956.0,161147.0,922.0,5.0,1.0


In [19]:
train_data.groupby('user_id').filter(lambda x: x['user_id'].count()>1)

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
0,0,923,0,3.0,1,2014-12-08,The food at snack is a selection of popular Gr...
2,2,925,0,4.0,1,2013-07-01,ordered lunch for 15 from Snack last Friday. ...
4,4,927,0,4.0,1,2010-11-01,Snack is great place for a casual sit down lu...
6,7,930,0,4.0,1,2007-05-20,Love this place! Try the Chicken sandwich or ...
7,8,931,0,4.0,1,2005-12-27,My friend and I were intrigued by the nightly ...
...,...,...,...,...,...,...,...
250865,358946,102178,349,5.0,0,2014-02-11,"Great Brooklyn vibe, excellent pizza, and a de..."
250867,358948,21152,349,5.0,0,2014-02-10,Believe the hype. This place is true and real....
250869,358950,14671,349,5.0,0,2014-02-08,Made a reservation for an early dinner Saturda...
250870,358951,3356,349,5.0,0,2014-02-07,"Emily is like Franny's Marco, but with warmth ..."


Split data for Fake Reviews & True Reviews to collect common words betweeen both to use as stop words

In [21]:
notfake = train_data[train_data['label']== 1]
fake = train_data[train_data['label'] == 0]
true_word_list = []

In [23]:
for i in range(0,notfake['review'].count()):
    true_word_temp = notfake['review'].iloc[i].split()
    true_word_list.append(true_word_temp)
from itertools import chain
list1 = list(chain.from_iterable(true_word_list))
from collections import Counter 
true_mc = Counter(list1).most_common()
 
fake_word_list= fake['review'].str.split(expand=True).stack()
from collections import Counter 
fake_mc = Counter(fake_word_list).most_common()

collects top 20 common words -- > stored in df_common

In [24]:
df_true = pd.DataFrame(true_mc)
df_false = pd.DataFrame(fake_mc)
df_false1 = df_false[0:23]
df_true1 = df_true[0:23]
df_common = list(set(df_true1.iloc[:,0]) & set(df_false1.iloc[:,0]))
df_common

['we',
 'the',
 'but',
 'had',
 'The',
 'a',
 'for',
 'you',
 'of',
 'this',
 'with',
 'and',
 'was',
 'that',
 'is',
 'on',
 'my',
 'it',
 'to',
 'I',
 'in']

Using Count Vectorizer only on review column of train data set to generate features also using ngram = 3 and removing stop words (df_common)

In [27]:
vectorizer = CountVectorizer(stop_words= df_common, ngram_range=(2,3), max_features= 15000)
vectorizer.fit(train_data.review)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=15000, min_df=1,
                ngram_range=(2, 3), preprocessor=None,
                stop_words=['we', 'the', 'but', 'had', 'The', 'a', 'for', 'you',
                            'of', 'this', 'with', 'and', 'was', 'that', 'is',
                            'on', 'my', 'it', 'to', 'I', 'in'],
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

Transforming text values to numeric using above vectorizer on train and test dataset

In [30]:
x_train = vectorizer.transform(train_data.review)
x_test  = vectorizer.transform(val_data.review)

Using SMOTE to handle imbalanced dataset

In [33]:
x_train

<250874x15000 sparse matrix of type '<class 'numpy.int64'>'
	with 6872922 stored elements in Compressed Sparse Row format>

In [36]:
dee_LR = LogisticRegression()
dee_LR.fit(x_train, train_data['label'])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

model evaluation (copied from Amelia's notebook)

In [29]:
def ClassifierMetrics (X_train, Y_train, X_test, Y_test, fitted_model):
    Y_pred = fitted_model.predict(X_test)
    metrics = {'train_accuracy': fitted_model.score(X_train, Y_train),
    'test_accuracy': fitted_model.score(X_test, Y_test),
    'test_auc': roc_auc_score(Y_test, Y_pred),
    'test_ap': average_precision_score(Y_test, Y_pred)}
    return metrics

In [38]:
dee_logistic = ClassifierMetrics (x_train, train_data['label'], x_test, val_data['label'], dee_LR)

In [39]:
dee_logistic

{'train_accuracy': 0.900364326315202,
 'test_accuracy': 0.8909182025725263,
 'test_auc': 0.5088242314707593,
 'test_ap': 0.1050700310260274}