In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
from pandas import DataFrame
from collections import OrderedDict 
import warnings

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('./fake reviews dataset.csv')

In [3]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

def preprocess(sentence):
    sentence=str(sentence)
    
    # lower case
    sentence = sentence.lower()
    
    # remove special characters
    sentence=sentence.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    
    # tokenization
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)  
    
    # remove stopwords
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    #filtered_words = [w for w in tokens if len(w) > 2]
    
    # stemming and lemmatization
    #stem_words=[stemmer.stem(w) for w in filtered_words]
    stem_words = filtered_words
    lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    
    return " ".join(lemma_words)


# clean text
df['cleanText']=df['text_'].map(lambda s:preprocess(s)) 

In [4]:
def create_y(row):
    if row['label'] == 'CG':
        return 1
    else:
        return 0

df['new_label'] = df.apply(lambda row: create_y(row), axis=1)
df['cleanText'] = df['cleanText'].astype('str')
categories = ['Home_and_Kitchen_5', 'Electronics_5', 'Sports_and_Outdoors_5', 
              'Clothing_Shoes_and_Jewelry_5', 'Movies_and_TV_5']

In [5]:
transformers = []
models = []

# depth = 1, n_est = 60
for category in categories:
    
    # take each category data
    y = df[df['category']  == category]['new_label']
    x = df[df['category']  == category]['cleanText']
    x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.20)
    
    # perform tfidf transformation
    tfidf_vect = TfidfVectorizer()
    tfidf_train = tfidf_vect.fit_transform(x_train)
    tfidf_test = tfidf_vect.transform(x_test)
    
    # model training
    Adab = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=70, learning_rate=0.3, random_state=1)
    Adab.fit(tfidf_train, y_train)
    y_pred3 = Adab.predict(tfidf_test)
    
    
    # print the performance of the model
    print(category)
    acc_score = metrics.accuracy_score(y_test,y_pred3)
    auc_score = metrics.roc_auc_score(y_test,y_pred3)
    precision = metrics.precision_score(y_test, y_pred3)
    print(acc_score)
    print(auc_score)
    print(precision)
    print(' ')
    
    
    # make a transformer that fits on all data
    tfidf_transformer = TfidfVectorizer().fit(x)
    tfidf_all = tfidf_transformer.transform(x)
    y_all = y
    
    # make a model that is trained on all data
    Adab_final = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=70, learning_rate=0.3, random_state=1)
    Adab_final.fit(tfidf_all, y_all)
    
    # append to list
    transformers.append(tfidf_transformer)
    models.append(Adab_final)

Home_and_Kitchen_5
0.8214285714285714
0.8218717197274453
0.85
 
Electronics_5
0.7957393483709273
0.7961627381012213
0.8213333333333334
 
Sports_and_Outdoors_5
0.8227848101265823
0.8235690235690235
0.8514588859416445
 
Clothing_Shoes_and_Jewelry_5
0.8194805194805195
0.8212426099690839
0.856396866840731
 
Movies_and_TV_5
0.7799442896935933
0.7794812053432743
0.7777777777777778
 


In [59]:
import json

final_df_lst = []


categories = ['Home_and_Kitchen_2017', 'Electronics_2017', 'Sports_and_Outdoors_2017', 
              'Clothing_Shoes_and_Jewelry_2017', 'Movies_and_TV_2017']


for category in categories:
    
    idx = categories.index(category)
    
    path = './' + category + '.json'
    #file = open(path, 'r', encoding='utf-8')
    #reviews = []
    
    #for line in file.readlines():
    #    dic = json.loads(line)
    #    if dic['reviewTime'][6:11] == '2018':
    #        reviews.append(dic['reviewText'])
    
    # open json and take corresponding field
    reviews = []
    reviewTimes = []
    reviewRatings = []
    
    with open(path, 'r') as file:
        contents = json.loads(file.read())
    file.close()

    cnt = 0
    for element in contents:
        try:
            reviews.append(element['reviewText'])
            reviewTimes.append(element['reviewTime'])
            reviewRatings.append(element['overall'])
        except:
            cnt += 1
    
    #print(len(reviews))
    # convert to dataframe
    df_a = pd.DataFrame([reviews, reviewTimes, reviewRatings]).T
    df_a.columns = ['review_text', 'review_time', 'review_rating']
    df_a['category'] = category
    df_a['month'] = df_a['review_time'].str[:2].astype('int')
    
    # get count of each month
    #df_b = pd.DataFrame(df_a.groupby(['month'])['category'].count()).reset_index()
    #df_b.columns = ['month_', 'count']
    #df_a = pd.merge(df_a, df_b, how='left', left_on=['month'], right_on=['month_'])
    #df_a = df_a.drop(columns = ['month_'])
    
    
    # randomly sample 3000 rows and clean text
    #df_a = df_a.groupby("review_rating").sample(n=100, random_state=123, replace=True)
    df_a = df_a.sample(n = 2000, replace=False)
    
    # for each month take 1000 samples
    #df_a = df_a.groupby("month").sample(n=100, replace=False)
    
    
    df_a['cleanText']=df_a['review_text'].map(lambda s:preprocess(s)) 

    
    # transform the data and get prediction
    transformer_a = transformers[idx]
    x_a = df_a['cleanText']
    tfidf_a = transformer_a.transform(x_a)
    
    model_a = models[idx]
    pred_a = model_a.predict(tfidf_a)
    df_a['label'] = pred_a
    
    # see the distribution of fake review in each category
    dict_ = df_a['label'].value_counts().to_dict()
    percent = dict_[1] / (dict_[0] + dict_[1])
    print(category)
    print('{:.1%}'.format(percent))
    print(' ')
    
    final_df_lst.append(df_a)

Home_and_Kitchen_2017
14.0%
 
Electronics_2017
20.6%
 
Sports_and_Outdoors_2017
18.7%
 
Clothing_Shoes_and_Jewelry_2017
17.5%
 
Movies_and_TV_2017
27.2%
 


In [60]:
df_final = pd.concat(final_df_lst)

In [61]:
name_change = {'Clothing_Shoes_and_Jewelry_2017': 'Clothes',
               'Electronics_2017': 'Phones',
               'Home_and_Kitchen_2017': 'Kitchens',
               'Movies_and_TV_2017': 'Movies',
               'Sports_and_Outdoors_2017': 'Sports'}

In [62]:
df_final['category'] = df_final['category'].map(name_change)

## Creating dataset for plot 1

In [63]:
df1 = pd.DataFrame(df_final.groupby(['category']).apply(lambda x: x['label'].sum()/len(x)))

In [64]:
df1 = df1.reset_index()
df1.columns = ['category', 'fake_prob']

In [65]:
df1

Unnamed: 0,category,fake_prob
0,Clothes,0.1755
1,Kitchens,0.14
2,Movies,0.2715
3,Phones,0.206
4,Sports,0.187


In [66]:
df1.to_csv('./plot_data/cat_vs_prob.csv', index=False)

## Creating dataset for plot2

In [67]:
df2 = pd.DataFrame(df_final.groupby(['review_rating']).apply(lambda x: x['label'].sum()/len(x))).reset_index()

In [68]:
df2['review_rating'] = df2['review_rating'].astype(int)

In [69]:
df2.columns = ['review_rating', 'frac']

In [70]:
df2

Unnamed: 0,review_rating,frac
0,1,0.103486
1,2,0.145129
2,3,0.158621
3,4,0.222464
4,5,0.211616


In [71]:
df2.to_csv('./plot_data/rating_vs_prob.csv', index=False)