# Loading

In [23]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec

In [2]:
# Load the pre-stemmed review dataset from the working directory.
# NOTE(review): the recorded preview shows leftover 'Unnamed: ...' columns,
# presumably indexes written by earlier to_csv calls — confirm upstream.
df = pd.read_csv('df_stem.csv')

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,review,sentiment,review_stem
0,0,0,one of the other reviewers has mentioned that ...,1,one review mention watch 1 oz episod youll hoo...
1,1,1,a wonderful little production the filming tech...,1,wonder littl product film techniqu unassum old...
2,2,2,i thought this was a wonderful way to spend ti...,1,thought wonder way spend time hot summer weeke...
3,3,3,basically theres a family where a little boy j...,0,basic there famili littl boy jake think there ...
4,4,4,petter matteis love in the time of money is a ...,1,petter mattei love time money visual stun film...


In [4]:
# Keep only the model inputs: the stemmed text and its label.
# .copy() makes this an independent frame, so the later in-place assignment
# to df['review_stem'] does not write into a slice of the loaded frame
# (which triggers pandas' SettingWithCopyWarning).
df = df[['review_stem', 'sentiment']].copy()

In [5]:
# Confirm that only 'review_stem' and 'sentiment' remain.
df.head()

Unnamed: 0,review_stem,sentiment
0,one review mention watch 1 oz episod youll hoo...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic there famili littl boy jake think there ...,0
4,petter mattei love time money visual stun film...,1


# Word2Vec

In [14]:
def _tokenize(text):
    """Split one stemmed review into a list of whitespace-separated tokens.

    Lists are returned unchanged so this cell can be re-run after the column
    has already been tokenized. Any other non-string value (for example a NaN
    read back from an empty CSV field) becomes an empty token list instead of
    raising TypeError from ``str.split``.
    """
    if isinstance(text, list):
        return text
    if isinstance(text, str):
        return text.split()
    return []


df['review_stem'] = df['review_stem'].apply(_tokenize)

In [15]:
# Check that 'review_stem' now holds token lists instead of strings.
df.head()

Unnamed: 0,review_stem,sentiment
0,"[one, review, mention, watch, 1, oz, episod, y...",1
1,"[wonder, littl, product, film, techniqu, unass...",1
2,"[thought, wonder, way, spend, time, hot, summe...",1
3,"[basic, there, famili, littl, boy, jake, think...",0
4,"[petter, mattei, love, time, money, visual, st...",1


In [16]:
# Fit 100-dimensional word embeddings on the tokenized reviews.
# min_count=1 keeps every token in the vocabulary, so each word seen in the
# corpus has a vector available when reviews are averaged later on.
# NOTE(review): the embeddings are fit on all rows, including the ones that
# train_test_split later holds out — confirm this is acceptable.
model = Word2Vec(sentences=df['review_stem'], vector_size=100, min_count=1)

In [17]:
# Vocabulary size: number of distinct tokens that received a vector.
len(model.wv.index_to_key)

181885

In [18]:
# Spot-check the embeddings: nearest neighbours of the stem 'wonder'.
model.wv.most_similar('wonder')

[('great', 0.6824128031730652),
 ('amaz', 0.6797610521316528),
 ('fantast', 0.6533145904541016),
 ('brilliant', 0.6360771656036377),
 ('excel', 0.6277644038200378),
 ('terrif', 0.600382387638092),
 ('fabul', 0.5761121511459351),
 ('marvel', 0.5758160352706909),
 ('fine', 0.5739214420318604),
 ('astound', 0.5731454491615295)]

In [21]:
# Nearest neighbours of 'time'.
# NOTE(review): neighbours such as 'timeif' and 'squanderedit' look like words
# fused together during upstream cleaning — worth confirming in preprocessing.
model.wv.most_similar('time')

[('twice', 0.5469274520874023),
 ('day', 0.5380311608314514),
 ('way', 0.5305366516113281),
 ('timeif', 0.5304754972457886),
 ('timei', 0.5264162421226501),
 ('occas', 0.5256339907646179),
 ('week', 0.5134643912315369),
 ('timeparodi', 0.5118454098701477),
 ('squanderedit', 0.5093278884887695),
 ('timeth', 0.5043607354164124)]

## Sentence Averaging

In [56]:
def sentence_average(keys, wv=None):
    """Return the mean word vector of a tokenized sentence.

    Parameters
    ----------
    keys : iterable of str
        Tokens to average. Each token is looked up with ``wv.get_vector``.
    wv : optional
        Object exposing ``get_vector(key)`` and ``vector_size`` (for example
        the ``wv`` attribute of a trained gensim model). Defaults to the
        notebook-level ``model.wv``.

    Returns
    -------
    numpy.ndarray
        1-D float64 array of length ``wv.vector_size``. An empty ``keys``
        yields an all-zero vector instead of the NaNs produced by dividing
        a zero sum by zero.

    Raises
    ------
    Whatever ``wv.get_vector`` raises for a token it does not know; the
    lookup error is not caught here.
    """
    if wv is None:
        wv = model.wv  # embeddings trained earlier in this notebook

    # Accumulate in float64; the per-token vectors are upcast on addition,
    # so no list round-trip is needed.
    vector_sum = np.zeros(wv.vector_size, dtype=np.float64)
    count = 0
    for key in keys:
        vector_sum += wv.get_vector(key)
        count += 1

    if count == 0:
        return vector_sum
    return vector_sum / count

In [57]:
# Sanity check: averaged embedding of the review stored under index label 0.
sentence_average(df.loc[0, 'review_stem'])

array([-5.57689330e-02,  5.21418432e-01,  4.91050559e-01, -2.84571879e-01,
       -5.87454927e-03, -6.73630366e-01, -3.53854042e-01,  3.76611589e-01,
       -4.63974146e-01, -3.07149139e-01, -1.12927188e-01, -8.33234988e-01,
        4.27014570e-02,  6.73994701e-01,  5.66837889e-01, -3.36153615e-02,
       -1.46911118e-01, -3.00407871e-02, -1.49899164e-01,  2.47353671e-02,
       -8.59027193e-02, -1.03197207e-01,  4.86804827e-01,  1.10103619e-01,
       -7.04092893e-01,  9.13449667e-02, -3.55593989e-01, -6.39741629e-01,
        3.16284365e-02,  1.95291038e-01,  3.15409029e-01, -4.50379260e-01,
        6.05889986e-01, -3.56425976e-02, -5.50486084e-03,  6.97342235e-01,
        6.41677645e-02,  5.32257961e-02, -8.77184814e-01, -6.96389089e-01,
       -1.89110002e-01, -2.89747734e-01, -3.61083988e-01,  1.10346500e-01,
        3.16985113e-01,  1.89548200e-01, -4.70210468e-01, -8.46225369e-01,
        4.32367741e-01,  1.83061593e-01, -6.95670312e-02,  2.26052588e-01,
        3.33184370e-01, -

## Vector Dataframe

In [58]:
# Start from an empty frame; the embedding columns are added in the next cell.
dfv = pd.DataFrame()

In [65]:
# Average each review's word vectors and spread the components over one
# column per embedding dimension ("0", "1", ...).
# The token column is selected by name rather than by position: integer
# position lookups on a label-indexed row (`row[0]`) are deprecated in
# pandas 2.x and depend on 'review_stem' being the first column.
# The column count comes from the model instead of a hard-coded 100.
dfv[[str(i) for i in range(model.wv.vector_size)]] = pd.DataFrame(
    df['review_stem'].map(sentence_average).tolist(),
    index=df.index,
)

In [66]:
# Preview the per-review embedding features.
dfv.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.055769,0.521418,0.491051,-0.284572,-0.005875,-0.67363,-0.353854,0.376612,-0.463974,-0.307149,...,0.260105,0.361337,0.051308,0.129544,0.093932,0.190868,0.277287,-0.274716,0.239495,-0.5514
1,-0.387298,0.067675,0.107511,-0.311522,0.282367,-0.803792,-0.096303,0.501002,-0.591078,-1.004327,...,0.250374,0.275806,-0.259445,0.161307,0.045556,0.284748,0.345185,-0.730644,0.290081,-0.267813
2,-0.363525,0.477643,0.263717,-0.138138,-0.020126,-0.627118,0.146612,0.464807,-0.623616,-0.767289,...,0.296554,0.225034,-0.172313,0.186274,0.291188,0.233296,0.546296,-0.240926,-0.007414,-0.581818
3,-0.210395,0.459685,0.625348,-0.269713,-0.091613,-0.382779,-0.382343,0.109192,-0.563171,-0.794181,...,0.123221,0.40046,0.031721,0.122218,0.147478,0.681805,0.357045,-0.613093,0.349137,-0.588786
4,-0.198249,0.207593,0.372784,-0.245931,0.417209,-0.632262,-0.022253,0.276544,-0.693726,-0.816664,...,0.084679,0.479013,-0.253804,0.075651,0.312607,0.240662,-0.058127,-0.10536,0.491676,-0.591702


In [67]:
# Expect one row per review and one column per embedding dimension.
dfv.shape

(50000, 100)

In [98]:
# Summary statistics per dimension; a full 'count' row confirms no NaN features.
dfv.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,-0.255079,0.443361,0.29383,-0.220543,0.081928,-0.610497,-0.234926,0.220228,-0.633289,-0.719987,...,0.284044,0.279089,-0.263289,0.13285,0.261319,0.334235,0.32791,-0.336436,0.156604,-0.548328
std,0.18741,0.210143,0.187993,0.189239,0.206248,0.177413,0.182087,0.212106,0.208512,0.242197,...,0.168631,0.168095,0.212122,0.151294,0.150587,0.196508,0.212829,0.202299,0.257008,0.241762
min,-1.380826,-1.72645,-0.938676,-1.519571,-1.435194,-1.990211,-1.466972,-1.044476,-2.023175,-2.421847,...,-0.523581,-0.562852,-1.685461,-1.049153,-0.67757,-0.698739,-0.736302,-1.730731,-1.33951,-1.759691
25%,-0.369404,0.304539,0.174038,-0.343606,-0.045047,-0.720727,-0.352202,0.083868,-0.762608,-0.867559,...,0.168854,0.173383,-0.395897,0.03922,0.165756,0.202178,0.184526,-0.458586,-0.010817,-0.704933
50%,-0.246299,0.433876,0.291964,-0.217645,0.090262,-0.605098,-0.236243,0.227769,-0.625292,-0.699012,...,0.27159,0.279907,-0.25964,0.131871,0.262721,0.327812,0.323333,-0.328856,0.164645,-0.546122
75%,-0.132376,0.573029,0.411877,-0.095506,0.218626,-0.493906,-0.120878,0.363731,-0.49715,-0.551432,...,0.387106,0.386307,-0.125952,0.226648,0.356279,0.463114,0.464668,-0.20347,0.330337,-0.389723
max,0.744371,1.70363,1.428041,0.747193,1.001881,0.236836,0.748628,1.267297,0.305927,0.190495,...,1.276215,1.286585,0.8451,1.244772,1.410685,1.463961,1.369351,0.632063,1.760517,1.032578


# Free Testing

In [68]:
# Feature matrix: one averaged embedding per review.
# x is another name for dfv, not a copy.
x = dfv

In [69]:
# Target labels: the 'sentiment' column of the review frame.
y = df['sentiment']

In [101]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [75]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [80]:
from sklearn.linear_model import LogisticRegression

In [92]:
# Logistic-regression classifier with the solver iteration cap set to 1000.
lrm = LogisticRegression(max_iter = 1000)

In [93]:
# Fit the classifier on the training split only.
lrm.fit(x_train, y_train)

In [102]:
# Predict sentiment labels for the held-out rows.
y_pred = lrm.predict(x_test)

In [103]:
# Accuracy of the held-out predictions against the true labels.
accuracy_score(y_test, y_pred)

0.8548

In [104]:
# Confusion matrix for the held-out rows (true labels passed first,
# predictions second).
confusion_matrix(y_test, y_pred)

array([[6394, 1146],
       [1032, 6428]], dtype=int64)

In [105]:
# Same check through the estimator's own score(); the recorded output matches
# the accuracy_score result above (0.8548).
lrm.score(x_test, y_test)

0.8548