# Loading

In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec

In [2]:
# Load the stemmed review dataset from disk.
# NOTE(review): the preview below shows leftover "Unnamed: 0*" columns, presumably
# from earlier to_csv() calls that wrote the index — confirm and clean upstream.
df = pd.read_csv('df_stem.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,review,sentiment,review_stem
0,0,0,one of the other reviewers has mentioned that ...,1,one review mention watch 1 oz episod youll hoo...
1,1,1,a wonderful little production the filming tech...,1,wonder littl product film techniqu unassum old...
2,2,2,i thought this was a wonderful way to spend ti...,1,thought wonder way spend time hot summer weeke...
3,3,3,basically theres a family where a little boy j...,0,basic there famili littl boy jake think there ...
4,4,4,petter matteis love in the time of money is a ...,1,petter mattei love time money visual stun film...


In [4]:
# Keep only the stemmed text and its sentiment label.
# The explicit .copy() makes `df` an independent frame, so the later in-place column
# assignment (df['review_stem'] = ...) cannot trigger pandas' SettingWithCopyWarning
# or be applied to a temporary slice of the originally loaded data.
df = df[['review_stem', 'sentiment']].copy()

In [5]:
# Confirm that only the two selected columns remain.
df.head()

Unnamed: 0,review_stem,sentiment
0,one review mention watch 1 oz episod youll hoo...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic there famili littl boy jake think there ...,0
4,petter mattei love time money visual stun film...,1


# Word2Vec

In [6]:
# Split each stemmed review on whitespace so Word2Vec receives lists of tokens.
# Rows that are already lists are passed through unchanged: this keeps the cell
# idempotent, whereas re-running a bare str.split over the tokenized column raises
# TypeError. Any other non-string value (e.g. NaN) still raises, as before.
df['review_stem'] = df['review_stem'].apply(
    lambda tokens: tokens if isinstance(tokens, list) else str.split(tokens)
)

In [7]:
# Verify that review_stem now holds token lists instead of raw strings.
df.head()

Unnamed: 0,review_stem,sentiment
0,"[one, review, mention, watch, 1, oz, episod, y...",1
1,"[wonder, littl, product, film, techniqu, unass...",1
2,"[thought, wonder, way, spend, time, hot, summe...",1
3,"[basic, there, famili, littl, boy, jake, think...",0
4,"[petter, mattei, love, time, money, visual, st...",1


In [8]:
# Train word embeddings on the tokenized reviews.
#   vector_size = 100 : length of every word vector (sentence_average relies on it).
#   min_count   = 1   : keep every token, including ones that occur only once.
#   seed / workers    : Word2Vec training is stochastic. gensim documents that a fixed
#                       seed is only fully reproducible with a single worker thread
#                       (and a fixed PYTHONHASHSEED), so both are pinned here. This
#                       trades training speed for results that survive
#                       "Restart Kernel -> Run All".
model = Word2Vec(
    df['review_stem'],
    vector_size = 100,
    min_count = 1,
    seed = 0,
    workers = 1
)

In [9]:
# Vocabulary size: number of keys stored in the trained word-vector table.
len(model.wv.index_to_key)

181885

In [10]:
# Sanity check: nearest neighbours of the stem 'wonder' with their similarity scores.
model.wv.most_similar('wonder')

[('amaz', 0.6838846206665039),
 ('great', 0.6593637466430664),
 ('fantast', 0.639372706413269),
 ('brilliant', 0.6155218482017517),
 ('excel', 0.6026454567909241),
 ('marvel', 0.5858359336853027),
 ('outstand', 0.5734038352966309),
 ('emotionallymov', 0.5700697302818298),
 ('fine', 0.5667689442634583),
 ('terrif', 0.5630647540092468)]

In [11]:
# Sanity check on a more generic stem; glued-together tokens such as 'timeth' or
# 'yourtim' in the output hint at tokenization noise kept by min_count = 1.
model.wv.most_similar('time')

[('twice', 0.5350856781005859),
 ('timeth', 0.5203762054443359),
 ('way', 0.5142894983291626),
 ('day', 0.5107340216636658),
 ('week', 0.5072611570358276),
 ('yourtim', 0.5014492273330688),
 ('timei', 0.5007034540176392),
 ('occas', 0.49366918206214905),
 ('viewingther', 0.4885726273059845),
 ('consciencey', 0.4807792901992798)]

## Sentence Averaging

In [12]:
def sentence_average(keys, wv=None):
    """Return the element-wise mean of the word vectors of a tokenized sentence.

    Parameters
    ----------
    keys : sequence of str
        Tokens of one sentence / review.
    wv : object, optional
        Word-vector lookup exposing ``get_vector(key)`` and ``vector_size``
        (for example a gensim ``KeyedVectors``). Defaults to the notebook-level
        ``model.wv``.

    Returns
    -------
    numpy.ndarray
        float64 array of length ``wv.vector_size``. An empty ``keys`` yields the
        zero vector instead of a NaN vector caused by a 0 / 0 division.

    Raises
    ------
    Whatever ``wv.get_vector`` raises for a token it does not know; no token is
    silently skipped.
    """
    if wv is None:
        wv = model.wv

    # Size the accumulator from the model rather than hard-coding 100, so the
    # function keeps working if vector_size is changed at training time.
    vector_sum = np.zeros(wv.vector_size, dtype=np.float64)

    if len(keys) == 0:
        return vector_sum

    for key in keys:
        # In-place add upcasts the model's vectors into the float64 accumulator;
        # no intermediate Python list is needed.
        vector_sum += wv.get_vector(key)

    return vector_sum / len(keys)

In [13]:
# Spot-check: averaged vector for the review stored under index label 0.
sentence_average(df['review_stem'][0])

array([-0.25114734,  0.56130495,  0.12622223, -0.09484987,  0.11403836,
       -0.85955142, -0.24019168,  0.36160183, -0.43221062, -0.24736694,
       -0.26967891, -0.88656184, -0.10944689,  0.59524837,  0.6140416 ,
       -0.26421048, -0.34738145,  0.22597556, -0.30315009, -0.3500781 ,
       -0.10531182,  0.28648132,  0.32011766, -0.00497991, -0.856269  ,
        0.39994858, -0.03192961, -0.56758975, -0.04820226,  0.23657853,
        0.17174474, -0.45998302,  0.36186432, -0.2337815 , -0.28648373,
        0.62403883,  0.1770442 , -0.05660974, -0.70498997, -0.45916587,
       -0.25734449, -0.40414656, -0.51474091,  0.40341388,  0.04077772,
        0.37760741, -0.05125772, -0.75305606,  0.63317299,  0.27578309,
        0.1275908 , -0.04617969,  0.32193216, -0.24863227, -0.17195917,
       -0.07118574,  0.34046771, -0.13161188, -0.59056102, -0.24107444,
       -0.02031403,  0.12239894,  0.20721275,  0.18943326, -0.25727393,
        0.23768727, -0.10195069,  0.23688781, -1.02580086,  0.17

## Vector Dataframe

In [14]:
# Empty frame that will receive one column per embedding dimension.
dfv = pd.DataFrame()

In [15]:
# Build the feature matrix: one row per review, one column per embedding dimension.
# Only the token column is mapped, replacing a row-wise df.apply that read row[0]:
# positional integer indexing on a label-indexed Series is deprecated in pandas 2.x
# and silently depends on column order. Column labels are the strings '0', '1', ...
# derived from the actual vector length rather than a hard-coded 100.
dfv = pd.DataFrame(
    df['review_stem'].map(sentence_average).tolist(),
    index = df.index
).rename(columns = str)

In [16]:
# Preview the per-review sentence vectors.
dfv.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.251147,0.561305,0.126222,-0.09485,0.114038,-0.859551,-0.240192,0.361602,-0.432211,-0.247367,...,0.321737,0.545179,0.062861,0.241012,-0.00175,0.349589,0.236443,-0.480595,0.375254,-0.430951
1,-0.351663,0.121878,-0.120232,-0.553368,0.182102,-1.230392,0.201216,0.558062,-0.544723,-0.838765,...,0.239608,0.648976,0.118813,0.202644,-0.037746,0.6728,0.460015,-0.634437,0.277175,-0.146046
2,-0.552294,0.300187,-0.094526,-0.310396,-0.018994,-0.820702,0.3539,0.716587,-0.478776,-0.726901,...,0.316577,0.510552,0.079388,0.559151,-0.04721,0.35193,0.53341,-0.288712,0.183992,-0.352972
3,-0.233144,0.494998,0.041341,-0.21172,-0.121327,-0.502161,0.055911,0.394001,-0.394661,-0.845051,...,0.094764,0.917627,0.364555,0.159884,-0.202905,0.805818,0.485097,-0.742107,0.061896,-0.303398
4,-0.37638,0.266668,-0.131451,-0.05347,0.472519,-1.122471,0.230775,0.287217,-0.643272,-0.717788,...,0.239622,0.739376,0.068496,0.188084,0.096164,0.574591,0.168376,-0.36387,0.606393,-0.529876


In [17]:
# Expect (number of reviews, vector_size).
dfv.shape

(50000, 100)

In [18]:
# Per-dimension summary statistics; a count below the row count would reveal NaN vectors.
dfv.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,-0.435858,0.343747,-0.142955,-0.207073,0.039749,-0.851512,0.051781,0.386618,-0.494941,-0.714446,...,0.304288,0.528262,0.024425,0.311987,0.054328,0.568812,0.482529,-0.40334,0.209007,-0.406036
std,0.180976,0.203002,0.200563,0.187675,0.242441,0.21024,0.179665,0.154537,0.164958,0.238769,...,0.151729,0.205215,0.176769,0.185555,0.172128,0.178191,0.229812,0.185536,0.226916,0.205265
min,-1.558287,-2.673465,-1.49829,-1.577211,-1.884527,-1.856055,-1.056473,-0.593067,-1.704187,-2.420685,...,-0.421505,-1.107965,-0.822074,-1.032606,-0.903163,-0.199434,-0.778366,-1.59299,-0.970814,-1.865047
25%,-0.548517,0.21842,-0.268547,-0.325665,-0.113126,-0.989004,-0.063694,0.288857,-0.597188,-0.860015,...,0.206638,0.396973,-0.09249,0.194224,-0.05182,0.447617,0.330823,-0.516836,0.063356,-0.533866
50%,-0.429457,0.347007,-0.134644,-0.202466,0.052184,-0.849065,0.046926,0.385756,-0.492325,-0.69213,...,0.298201,0.528515,0.01645,0.311907,0.061412,0.557183,0.479539,-0.39954,0.217504,-0.398765
75%,-0.316336,0.474143,-0.007037,-0.084294,0.205335,-0.712237,0.162431,0.482914,-0.389817,-0.547208,...,0.397148,0.659299,0.132321,0.429176,0.167243,0.679777,0.631069,-0.284919,0.363578,-0.271518
max,0.48149,1.324433,0.799927,0.670352,1.315728,0.204049,1.154638,1.271893,0.41009,0.398451,...,1.275397,1.705258,1.187145,1.601814,0.983742,1.554354,1.977695,0.761206,1.158904,1.022808


# Free Testing

In [19]:
# Feature matrix: the averaged sentence vectors (same object as dfv, not a copy).
x = dfv

In [20]:
# Target labels, row-aligned with x through the shared index.
# NOTE(review): values are 0/1 in the preview; presumably 1 = positive — confirm.
y = df['sentiment']

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [22]:
# Hold out 30% of the reviews for evaluation; the fixed random_state keeps the
# split identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.3,
    random_state = 0,
)

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
# Baseline classifier. max_iter is raised above scikit-learn's default of 100,
# presumably so the solver converges on the 100-dimensional features.
lrm = LogisticRegression(max_iter = 1000)

In [25]:
# Fit on the training split only; the test split stays unseen until evaluation.
lrm.fit(x_train, y_train)

In [26]:
# Predicted labels for the held-out reviews.
y_pred = lrm.predict(x_test)

In [27]:
# Fraction of held-out reviews classified correctly.
accuracy_score(y_test, y_pred)

0.8584

In [28]:
# Error breakdown; in scikit-learn's convention rows are true labels and columns
# are predicted labels.
confusion_matrix(y_test, y_pred)

array([[6421, 1119],
       [1005, 6455]], dtype=int64)

In [29]:
# Cross-check: the classifier's own score on the test split equals the
# accuracy_score value computed above.
lrm.score(x_test, y_test)

0.8584

In [31]:
# Persist the sentence vectors. index = False stops pandas from writing the row index
# as an extra unnamed column; writing it is what leaves "Unnamed: 0", "Unnamed: 0.1",
# ... columns behind each time a CSV is read back and saved again (as seen in the
# df_stem.csv loaded at the top of this notebook).
# NOTE(review): the sentiment labels are not saved with the vectors — confirm that
# downstream consumers re-join them by row order.
dfv.to_csv('df_vector.csv', index = False)