## Step 1: Import all packages

In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
from pandas import DataFrame
from collections import OrderedDict 
import warnings

warnings.filterwarnings("ignore")

## Step 2: Read in labelled dataset obtained from iteration1

In [2]:
df = pd.read_csv('./fake reviews dataset.csv')

## Step 3: Preprocess the review text

In [3]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

def preprocess(sentence):
    sentence=str(sentence)
    
    # lower case
    sentence = sentence.lower()
    
    # remove special characters
    sentence=sentence.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    
    # tokenization
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)  
    
    # remove stopwords
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    #filtered_words = [w for w in tokens if len(w) > 2]
    
    # stemming and lemmatization
    stem_words=[stemmer.stem(w) for w in filtered_words]
    #stem_words = filtered_words
    lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    
    return " ".join(lemma_words)


# clean text
df['cleanText']=df['text_'].map(lambda s:preprocess(s)) 

## Step 4: Convert the string label to numerical label

In [4]:
def create_y(row):
    if row['label'] == 'CG':
        return 1
    else:
        return 0

df['new_label'] = df.apply(lambda row: create_y(row), axis=1)
df['cleanText'] = df['cleanText'].astype('str')
categories = ['Home_and_Kitchen_5', 'Electronics_5', 'Sports_and_Outdoors_5', 
              'Clothing_Shoes_and_Jewelry_5', 'Movies_and_TV_5']

## Step 5: Train a model for each category and print out the performance of models

In [6]:
transformers = []
models = []

# depth = 1, n_est = 60
for category in categories:
    
    # take each category data
    y = df[df['category']  == category]['new_label']
    x = df[df['category']  == category]['cleanText']
    x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.20)
    
    # perform tfidf transformation
    tfidf_vect = TfidfVectorizer()
    tfidf_train = tfidf_vect.fit_transform(x_train)
    tfidf_test = tfidf_vect.transform(x_test)
    
    # model training
    Adab = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=70, learning_rate=0.3, random_state=1)
    Adab.fit(tfidf_train, y_train)
    y_pred3 = Adab.predict(tfidf_test)
    
    
    # print the performance of the model
    print(category)
    acc_score = metrics.accuracy_score(y_test,y_pred3)
    auc_score = metrics.roc_auc_score(y_test,y_pred3)
    precision = metrics.precision_score(y_test, y_pred3)
    print('Accuracy is: ' + str(round(acc_score,2)))
    print('AUC score is: ' + str(round(auc_score,2)))
    print('Precision is: ' + str(round(precision,2)))
    print(' ')
    
    
    # make a transformer that fits on all data
    tfidf_transformer = TfidfVectorizer().fit(x)
    tfidf_all = tfidf_transformer.transform(x)
    y_all = y
    
    # make a model that is trained on all data
    Adab_final = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=70, learning_rate=0.3, random_state=1)
    Adab_final.fit(tfidf_all, y_all)
    
    # append to list
    transformers.append(tfidf_transformer)
    models.append(Adab_final)

Home_and_Kitchen_5
Accuracy is: 0.82
AUC score is: 0.82
Precision is: 0.83
 
Electronics_5
Accuracy is: 0.78
AUC score is: 0.77
Precision is: 0.77
 
Sports_and_Outdoors_5
Accuracy is: 0.8
AUC score is: 0.8
Precision is: 0.79
 
Clothing_Shoes_and_Jewelry_5
Accuracy is: 0.83
AUC score is: 0.83
Precision is: 0.86
 
Movies_and_TV_5
Accuracy is: 0.81
AUC score is: 0.81
Precision is: 0.83
 


## Step 6: Save the Models and TfidfVectorizertrs

In [6]:
import pickle

# Save the Modle to file in the current working directory

categories = ['Kitchens', 'Electronics', 'Sports', 'Cloths', 'Movies']

for i in range(5):
    
    Pkl_Filename = './models/' + categories[i] + ".pkl"  
    
    with open(Pkl_Filename, 'wb') as file:  
        pickle.dump(models[i], file)


In [7]:
for i in range(5):
    
    Pkl_Filename = './transformers/' + categories[i] + ".pickle"  
    
    with open(Pkl_Filename, 'wb') as file:  
        pickle.dump(transformers[i], file)