In [None]:
# Import python packages and library that will be used
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Mount Google Drive into the Colab runtime so the dataset CSV read below is
# reachable under /content/drive. This cell only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the cleaned review dataset (written by the separate MSc_project
# notebook) from the mounted Drive folder and preview the first rows.
# The preview shows three columns: `text`, `category` and `label`.
data_clean = pd.read_csv("/content/drive/MyDrive/Msc project/dataset_clean.csv")
data_clean.head()

Unnamed: 0,text,category,label
0,I have bought several of the Vitality canned d...,unhelpful,0
1,This is a confection that has been around a fe...,unhelpful,0
2,If you are looking for the secret ingredient i...,helpful,1
3,Right now I'm mostly just sprouting this so my...,unhelpful,0
4,I don't know if it's the cactus or the tequila...,unhelpful,0


In [None]:
# Class balance: fraction of rows per label value
# (label 1 = "helpful", label 0 = "unhelpful", as shown in the preview above).
data_clean['label'].value_counts(normalize=True)

0    0.582282
1    0.417718
Name: label, dtype: float64

In [None]:
# Stratified 90% / 10% train/test split performed on the row *index* of
# data_clean (not on the text itself). X_train / X_test therefore hold index
# labels and y_train / y_test the matching label values.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data_clean.index.to_numpy(),
    data_clean['label'].to_numpy(),
    test_size=0.10,
    random_state=42,
    stratify=data_clean['label'].to_numpy(),
)

In [None]:
# Tag every row with the split it belongs to. Start from a placeholder value,
# then overwrite it for the rows selected by the index-based split.
# NOTE: X_train / X_test must still be the arrays of index labels here; this
# cell cannot be re-run after those names are rebound to text data further down.
data_clean['data_type'] = 'not_set'
data_clean.loc[X_train, 'data_type'] = 'train'
data_clean.loc[X_test, 'data_type'] = 'test'

In [None]:
# Sanity check: row counts per (category, label, split) combination.
data_clean.groupby(by=['category', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
category,label,data_type,Unnamed: 3_level_1
helpful,1,test,12465
helpful,1,train,112183
unhelpful,0,test,17376
unhelpful,0,train,156378


In [None]:
# Train-val-test split
from sklearn.model_selection import train_test_split

y = data_clean['label']
X = data_clean['text']

# `test_len` was referenced below without being defined anywhere in the
# notebook, so this cell raised NameError on a fresh kernel. Use an absolute
# hold-out size of 10% of the rows (rounded down); this reproduces the
# 29840-row X_test recorded in the output of the next cell.
test_len = int(0.10 * len(data_clean))

# First carve off the test set, then carve a validation set of the same
# absolute size out of what remains. An integer test_size is interpreted by
# train_test_split as a number of samples, not a fraction.
# NOTE: unlike the index-based split above, this split is not stratified.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_len,
                                                    random_state=123)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  test_size=test_len,
                                                  random_state=123)

In [None]:
# Display the held-out review texts (pandas truncates the repr of long Series).
X_test

85185     I didn't know quite what to think about this d...
197937    Simply put this very delightful tea can be con...
208257    I love blueberries, both fresh and dried, but ...
135850    I was not sure if this would leave my food sme...
184659    If you want a truly healthy, filling snack, yo...
                                ...                        
40175     I was looking to change my dog's canned food a...
9176      I was concerned that this would be pasty tasti...
136460    No complaints on this one at all - very mild a...
45049     We purchased this coffee yesterday for the fir...
115722    Since Amazon apparently censors any mention of...
Name: text, Length: 29840, dtype: object

In [None]:
# Check the container type of the test labels at this point in the notebook.
type(y_test)

pandas.core.series.Series

In [None]:
# Materialise the two splits as separate frames, selected through the
# `data_type` tag written earlier.
df_train = data_clean[data_clean["data_type"].eq("train")]
df_test = data_clean[data_clean["data_type"].eq("test")]

In [None]:
# Plain Python lists of training texts and labels for the sklearn pipelines.
# This rebinds X_train / y_train, replacing whatever they held before.
X_train = df_train['text'].tolist()
y_train = df_train['label'].tolist()

In [None]:
# Plain Python lists of test texts and labels for the sklearn pipelines.
# This rebinds X_test / y_test, replacing whatever they held before.
X_test = df_test['text'].tolist()
y_test = df_test['label'].tolist()

In [None]:
# Build the stop-word list handed to every CountVectorizer below.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string

# English stop words, followed by every ASCII punctuation character and a few
# extra tokens (e.g. 'br' and multi-character punctuation runs).
stop_list = (
    stopwords.words('english')
    + list(string.punctuation)
    + ['br', '.<', '..', '...', '``', "''", '--']
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Random Forest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Bag-of-bigrams features (only the 100 most frequent bigrams are kept, stop
# words removed) feeding a 100-tree random forest trained on all CPU cores.
# random_state is fixed so that re-running the notebook yields the same forest
# and the same reported accuracy; without it every run trains a different model.
pipe_bigram_rf = Pipeline([('vectorizer', CountVectorizer(analyzer='word',
                                                          ngram_range=(2, 2),
                                                          stop_words=stop_list,
                                                          max_features=100)),
                           ('forest', RandomForestClassifier(n_estimators=100,
                                                             n_jobs=-1,
                                                             random_state=42))])

pipe_bigram_rf.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 CountVectorizer(max_features=100, ngram_range=(2, 2),
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('forest', RandomForestClassifier(n_jobs=-1))])

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict on the held-out test texts and report accuracy.
# These are *test-set* predictions, so they are stored as `y_rf_test`, in line
# with `y_svm_test` / `y_bayes_test` below. The former name `y_rf_train` is
# kept as an alias in case other cells still refer to it.
y_rf_test = pipe_bigram_rf.predict(X_test)
y_rf_train = y_rf_test
print('test accuracy:', accuracy_score(y_test, y_rf_test))

test accuracy: 0.6137193793773668


In [None]:
# SVM
from sklearn.svm import SVC

# Same bigram count features as the other models, classified by an SVC left
# at its default settings.
pipe_bigram_svm = Pipeline(steps=[
    ('vectorizer', CountVectorizer(
        analyzer='word',
        ngram_range=(2, 2),
        stop_words=stop_list,
        max_features=100,
    )),
    ('Support Vector Machine', SVC()),
])

pipe_bigram_svm.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 CountVectorizer(max_features=100, ngram_range=(2, 2),
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('Support Vector Machine', SVC())])

In [None]:
# Score the SVM pipeline on the held-out test texts.
y_svm_test = pipe_bigram_svm.predict(X_test)
print('Test accuracy:', accuracy_score(y_true=y_test, y_pred=y_svm_test))

Test accuracy: 0.6110385040715793


In [None]:
# Naive Bayes
from sklearn import naive_bayes

# Same bigram count features as the other models, classified by a multinomial
# naive Bayes model at its default settings.
# NOTE(review): the step label below is kept unchanged so lookups by step name
# keep working, but the estimator is MultinomialNB ("multinomial naive Bayes").
pipe_bigram_bayes = Pipeline(steps=[
    ('vectorizer', CountVectorizer(
        analyzer='word',
        ngram_range=(2, 2),
        stop_words=stop_list,
        max_features=100,
    )),
    ('Polynomial Parsimonious Bayes', naive_bayes.MultinomialNB()),
])

pipe_bigram_bayes.fit(X_train, y_train)

In [None]:
# Score the naive Bayes pipeline on the held-out test texts.
y_bayes_test = pipe_bigram_bayes.predict(X_test)
print('Test accuracy:', accuracy_score(y_true=y_test, y_pred=y_bayes_test))

Test accuracy: 0.5884186186789987
