In [1]:
# Mount Google Drive into the Colab runtime so files under /content/drive
# (the CSV splits and the saved word2vec model) can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd 
import numpy as np
import sklearn
from sklearn.utils import class_weight
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import decomposition, ensemble
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, recall_score, f1_score
from sklearn.metrics import make_scorer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
import time

In [3]:
import re    # for regular expressions 
import nltk  # for text manipulation 
import string 
import warnings 
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt  
import gensim

# Show up to 200 characters per text cell when pandas objects are displayed.
pd.set_option("display.max_colwidth", 200) 
# Silence DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Folder on the mounted Drive that holds the pre-split CSV files. Keeping it
# in one place means the location only has to be changed here.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks'

# Each split is stored as its own CSV: X_* hold the review text, y_* the labels.
X_train = pd.read_csv(f'{DATA_DIR}/X_train.csv')
X_test = pd.read_csv(f'{DATA_DIR}/X_test.csv')
y_train = pd.read_csv(f'{DATA_DIR}/y_train.csv')
y_test = pd.read_csv(f'{DATA_DIR}/y_test.csv')
X_val = pd.read_csv(f'{DATA_DIR}/X_val.csv')
y_val = pd.read_csv(f'{DATA_DIR}/y_val.csv')

In [5]:
def _first_column(data):
    """Return column 0 of a DataFrame; pass an already-extracted Series through."""
    return data.iloc[:, 0] if isinstance(data, pd.DataFrame) else data


# Every split was read as a DataFrame; keep only its first column as a Series.
# Going through _first_column keeps this cell re-runnable: on a second run the
# names already hold Series, and a bare `.iloc[:, 0]` on a Series would raise.
X_train = _first_column(X_train)
X_test = _first_column(X_test)
y_train = _first_column(y_train)
y_test = _first_column(y_test)
X_val = _first_column(X_val)
y_val = _first_column(y_val)
X_train

0        worse tea fresh green tea indeed green tea time produced poor storing condition sealed enviroment exposed extensively air make tea lost flavor dont know old tea experience least 35 years old loose...
1        ive tasted best real things wonderful dreadfully high fat carbs calories includes dressings dips sauces yes ive added goes salad oil fat content calories soar expect fatfree substitute anything cl...
2        love taste good ginger snap cookie saw offered jumped thinking easy review would wellas taste great true ginger snap flavor opened bag smell made mouth water cookie extremely hard really hard time...
3        absolutely love coachs oatmeal hated oatmeal till tasted coach oatmeal first 2 bags bought costconow costco doesnt carry anymoreso went line find coach oatmeal amazon carry happy found themi oatme...
4        normally dont go instant coffees delicious instructions suggest 4 teaspoons 68oz hot water used 5 teaspoons 12oz water dollop silk soy creamer made delicio

In [6]:
def _labelled_frame(text, label):
    """Pair a text Series with its label Series as columns "text" and "label"."""
    return pd.DataFrame({"text": text, "label": label})


# Stack train, validation and test (in that order) into one frame.
# DataFrame.append is deprecated and was removed in pandas 2.0, so the three
# parts are combined with a single pd.concat call instead. Each part keeps its
# own row labels, exactly as the chained appends did.
df = pd.concat(
    [
        _labelled_frame(X_train, y_train),
        _labelled_frame(X_val, y_val),
        _labelled_frame(X_test, y_test),
    ]
)

In [7]:
# Load a saved gensim Word2Vec model from Drive; its word vectors are used
# below to turn each review into a fixed-length feature vector.
model_w2v = gensim.models.Word2Vec.load("/content/drive/MyDrive/Colab Notebooks/word2vec.model")

In [8]:
# Sanity check of the embedding: words closest to "tea" with their similarity scores.
model_w2v.wv.most_similar(positive="tea")

[('teas', 0.7136606574058533),
 ('chai', 0.604446291923523),
 ('hrefhttpwwwamazoncomgpproductb0012bzghsstash', 0.5814846158027649),
 ('higgins', 0.5766370892524719),
 ('hrefhttpwwwamazoncomgpproductb000f4h5qitwinings', 0.5731595158576965),
 ('lipton', 0.5673545598983765),
 ('rejuvination', 0.5584450364112854),
 ('herbal', 0.5564500689506531),
 ('burke', 0.556394100189209),
 ('asinb000cqe3nm', 0.5519647598266602)]

In [9]:
# Same sanity check for a sentiment word: nearest neighbours of "terrible".
model_w2v.wv.most_similar(positive="terrible")

[('piecebatchbag', 0.5067934393882751),
 ('horrible', 0.5022017359733582),
 ('bad', 0.4886583387851715),
 ('corpoate', 0.46941494941711426),
 ('facialno', 0.4491904377937317),
 ('hypomag', 0.4458191692829132),
 ('instaed', 0.4442039132118225),
 ('immitation', 0.443316251039505),
 ('sucralosejust', 0.4379229247570038),
 ('poor', 0.4314398169517517)]

In [10]:
# Inspect the first row of the combined frame (its text and its label).
df.iloc[0,:]

text     worse tea fresh green tea indeed green tea time produced poor storing condition sealed enviroment exposed extensively air make tea lost flavor dont know old tea experience least 35 years old loose...
label                                                                                                                                                                                                          0
Name: 0, dtype: object

In [11]:
# Whitespace-tokenise every training review into a list of words.
tokenized_reviews = X_train.map(str.split)

In [12]:
# Number of tokenised training reviews.
tokenized_reviews.shape

(96063,)

In [13]:
def word_vector(tokens, size, model=None):
    """Return the mean word embedding of ``tokens`` as a ``(1, size)`` array.

    Parameters
    ----------
    tokens : iterable of str
        The words of one document.
    size : int
        Dimensionality of the embedding vectors.
    model : optional
        Where to look the words up: either an object exposing its keyed
        vectors as ``.wv`` or the keyed vectors themselves. When omitted, the
        notebook-level ``model_w2v`` is used.

    Returns
    -------
    numpy.ndarray
        Shape ``(1, size)``: the average of the vectors of all tokens found in
        the vocabulary, or all zeros when no token is found.
    """
    if model is None:
        model = model_w2v
    # Look words up on the keyed vectors (`.wv`), the same object the
    # most_similar calls in this notebook use. Indexing the model object itself
    # is deprecated in gensim 3.x and is not supported in gensim 4, where the
    # resulting TypeError would not be caught by the KeyError handler below.
    vectors = getattr(model, "wv", model)

    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            vec += vectors[word].reshape((1, size))
        except KeyError:  # token is not in the vocabulary: skip it
            continue
        count += 1
    if count:
        # Turn the running sum into a mean; an all-unknown document stays zero.
        vec /= count
    return vec

In [14]:
# Pre-allocate the feature matrix: one row per training review, 200 columns.
# NOTE(review): 200 is presumably the vector size of the saved word2vec model
# and must match the `size` passed to word_vector — confirm.
wordvec_arrays = np.zeros((len(tokenized_reviews), 200)) 


In [15]:
# Fill one row per review with its averaged embedding. Iterating over the
# Series values (instead of fetching tokenized_reviews[i], which is a lookup by
# index label) keeps rows aligned by position even if the Series does not carry
# the default 0..n-1 index, and avoids a per-element Series lookup.
for row, tokens in enumerate(tokenized_reviews):
    wordvec_arrays[row, :] = word_vector(tokens, 200)

In [16]:
# Check the feature matrix dimensions: (number of reviews, embedding size).
wordvec_arrays.shape

(96063, 200)

In [17]:
# Wrap the training feature matrix in a DataFrame for the classifier below,
# then display its shape.
wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.shape

(96063, 200)

In [18]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees on the averaged word2vec features. The seed is fixed
# so that refitting after a kernel restart gives the same model; verbose=1
# prints the training loss per boosting iteration.
model3 = GradientBoostingClassifier(verbose=1, random_state=42)
model3.fit(wordvec_df, y_train)

      Iter       Train Loss   Remaining Time 
         1           0.8763           13.39m
         2           0.8608           13.26m
         3           0.8478           13.12m
         4           0.8362           13.10m
         5           0.8248           12.92m
         6           0.8141           12.76m
         7           0.8029           12.58m
         8           0.7923           12.42m
         9           0.7836           12.37m
        10           0.7750           12.22m
        20           0.7016           10.80m
        30           0.6512            9.42m
        40           0.6146            8.09m
        50           0.5843            6.73m
        60           0.5602            5.38m
        70           0.5396            4.03m
        80           0.5228            2.69m
        90           0.5072            1.34m
       100           0.4942            0.00s


GradientBoostingClassifier(verbose=1)

In [19]:
# Build the validation feature matrix the same way as for the training set:
# tokenise on whitespace, then average the word vectors of each review.
tokenized_reviews_val = X_val.apply(lambda x: x.split())  # tokenizing
wordvec_arrays_val = np.zeros((len(tokenized_reviews_val), 200))
# Iterate over the values rather than fetching tokenized_reviews_val[i] by
# index label, so rows stay aligned by position whatever the Series index is.
for row, tokens in enumerate(tokenized_reviews_val):
    wordvec_arrays_val[row, :] = word_vector(tokens, 200)
wordvec_df_val = pd.DataFrame(wordvec_arrays_val)
wordvec_df_val.shape

(32021, 200)

In [20]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of model3's configuration on the validation features;
# `scores` holds one score per fold.
# NOTE(review): cross_val_score refits the estimator on folds of the validation
# data (the training log is printed again for each fold), so this does not
# measure the model3 instance that was fitted on the training set — confirm
# this is the intended use of the validation split.
scores = cross_val_score(model3, wordvec_df_val, y_val, cv=5)
scores

      Iter       Train Loss   Remaining Time 
         1           0.8770            3.39m
         2           0.8592            3.33m
         3           0.8449            3.30m
         4           0.8313            3.25m
         5           0.8205            3.22m
         6           0.8087            3.17m
         7           0.7973            3.14m
         8           0.7871            3.10m
         9           0.7775            3.06m
        10           0.7690            3.03m
        20           0.6963            2.67m
        30           0.6421            2.33m
        40           0.6026            2.00m
        50           0.5704            1.66m
        60           0.5431            1.33m
        70           0.5203           59.89s
        80           0.5005           39.96s
        90           0.4833           20.00s
       100           0.4684            0.00s
      Iter       Train Loss   Remaining Time 
         1           0.8780            3.28m
        

array([0.88961749, 0.89506558, 0.89225484, 0.89241099, 0.88897564])

In [21]:
# Whitespace-tokenise every test review into a list of words.
tokenized_reviews_test = X_test.map(str.split)

In [22]:
# Test feature matrix: one averaged 200-dimensional embedding per review.
wordvec_arrays_test = np.zeros((len(tokenized_reviews_test), 200))
# Iterate over the values rather than fetching tokenized_reviews_test[i] by
# index label, so rows stay aligned by position whatever the Series index is.
for row, tokens in enumerate(tokenized_reviews_test):
    wordvec_arrays_test[row, :] = word_vector(tokens, 200)

In [23]:
# Wrap the test feature matrix in a DataFrame, matching the training input
# format, then display its shape.
wordvec_df_test = pd.DataFrame(wordvec_arrays_test)
wordvec_df_test.shape

(32022, 200)

In [24]:
# Predict test-set labels with the model fitted on the training embeddings.
predictions = model3.predict(wordvec_df_test)

In [25]:
# Confusion matrix on the test set: rows are true labels, columns are predictions.
print(confusion_matrix(y_test,predictions))

[[ 2322  2969]
 [  375 26356]]


In [26]:
# Per-class precision, recall and F1 on the test set, plus overall accuracy.
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.86      0.44      0.58      5291
           1       0.90      0.99      0.94     26731

    accuracy                           0.90     32022
   macro avg       0.88      0.71      0.76     32022
weighted avg       0.89      0.90      0.88     32022

