In [1]:
import pandas as pd
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
# Select matplotlib's 'Agg' backend for this session.
# NOTE(review): this runs after `matplotlib.pyplot` has already been imported above;
# on some matplotlib versions the backend must be chosen before that import — confirm.
mp.use('Agg')

In [2]:
# Load the saved Word2Vec model from disk and keep only its `.wv` attribute;
# the full model object is released immediately afterwards.
_loaded_model = Word2Vec.load('word2vec.model')
w2v = _loaded_model.wv
del _loaded_model

In [3]:
# Check what kind of object the `.wv` attribute returned.
type(w2v)

gensim.models.keyedvectors.Word2VecKeyedVectors

In [4]:
# Exploratory: print the built-in help for the loaded vectors object.
# NOTE(review): leftover exploration with very long output; consider removing before sharing.
help(w2v)

Help on Word2VecKeyedVectors in module gensim.models.keyedvectors object:

class Word2VecKeyedVectors(WordEmbeddingsKeyedVectors)
 |  Mapping between words and vectors for the :class:`~gensim.models.Word2Vec` model.
 |  Used to perform operations on the vectors such as vector lookup, distance, similarity etc.
 |  
 |  Method resolution order:
 |      Word2VecKeyedVectors
 |      WordEmbeddingsKeyedVectors
 |      BaseKeyedVectors
 |      gensim.utils.SaveLoad
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  get_keras_embedding(self, train_embeddings=False)
 |      Get a Keras 'Embedding' layer with weights set as the Word2Vec model's learned word embeddings.
 |      
 |      Parameters
 |      ----------
 |      train_embeddings : bool
 |          If False, the weights are frozen and stopped from being updated.
 |          If True, the weights can/will be further trained/updated.
 |      
 |      Returns
 |      -------
 |      `keras.layers.Embedding`
 |          Embedd

In [5]:
# Two-cluster k-means with 50 restarts and up to 1000 iterations per run.
# The seed is written as the integer 1 instead of the boolean `True`: a boolean is
# only accepted because it is an int subclass, and it reads like an on/off flag
# rather than a seed. The seeded random state is the same.
model = KMeans(n_clusters=2, max_iter=1000, random_state=1, n_init=50)

In [6]:
# Fit the k-means model on the raw embedding matrix; the fitted estimator is the
# cell's displayed value.
model.fit(w2v.vectors)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=1000,
       n_clusters=2, n_init=50, n_jobs=None, precompute_distances='auto',
       random_state=True, tol=0.0001, verbose=0)

In [7]:
# List the ten vocabulary entries most similar to the first cluster centre.
# NOTE(review): every similarity in the recorded output is ~0.9997, which suggests
# the word vectors are almost identical to each other — confirm the embedding is
# adequately trained before trusting the clusters.
w2v.similar_by_vector(model.cluster_centers_[0], topn=10, restrict_vocab=None)

[('stuck_with', 0.999708890914917),
 ('wrong_with', 0.999708890914917),
 ('a_secret', 0.9996526837348938),
 ('wont_grow', 0.9996511936187744),
 ('clique_in', 0.999650239944458),
 ('his', 0.9996493458747864),
 ('issues', 0.9996489882469177),
 ('she_slapped', 0.9996485114097595),
 ('girls', 0.9996480941772461),
 ('call', 0.9996480941772461)]

In [8]:
# Name the first two cluster centres.
# NOTE(review): k-means cluster indices carry no inherent meaning, so which centre
# is "positive" is an assumption — confirm by inspecting each centre's nearest words.
positive_cluster_center, negative_cluster_center = model.cluster_centers_[:2]

In [9]:
# Build a single-column frame from the vocabulary keys and display it.
words = pd.DataFrame(list(w2v.vocab))
words

Unnamed: 0,0
0,just
1,like
2,any_other
3,day
4,employees
...,...
2605,23
2606,hat
2607,women_working
2608,good_days


In [10]:
# Give the single column a descriptive name instead of the default label 0.
words.columns = ['words']

In [11]:
# Re-display the frame to check the renamed column.
words

Unnamed: 0,words
0,just
1,like
2,any_other
3,day
4,employees
...,...
2605,23
2606,hat
2607,women_working
2608,good_days


In [12]:
# Attach each word's embedding vector.
# `w2v` is already the keyed-vectors object, so it is indexed directly; going through
# `w2v.wv` again is a deprecated self-reference that triggers a warning. The words
# come from the vocabulary keys, so no string conversion is needed.
words['vectors'] = words['words'].apply(lambda word: w2v[word])

  """Entry point for launching an IPython kernel.


In [13]:
# Inspect the frame with the new 'vectors' column.
words

Unnamed: 0,words,vectors
0,just,"[-0.037347253, 0.03204588, 0.008825552, -0.012..."
1,like,"[-0.037829906, 0.031295884, 0.008807496, -0.01..."
2,any_other,"[-0.038022686, 0.031122405, 0.008552859, -0.01..."
3,day,"[-0.037317507, 0.031464763, 0.008880943, -0.01..."
4,employees,"[-0.036987383, 0.03140659, 0.008221837, -0.012..."
...,...,...
2605,23,"[-0.03814631, 0.032437176, 0.008740556, -0.013..."
2606,hat,"[-0.03751088, 0.031898227, 0.0085191205, -0.01..."
2607,women_working,"[-0.037786897, 0.03131441, 0.009386631, -0.012..."
2608,good_days,"[-0.03704566, 0.032124337, 0.009112182, -0.012..."


In [14]:
# Predict a cluster for every word, one vector at a time. Each prediction is kept as
# the one-element array returned by `predict`; it is unwrapped in a later cell.
words['cluster'] = words['vectors'].apply(
    lambda vec: model.predict(np.asarray(vec).reshape(1, -1))
)

In [15]:
# Inspect the raw predictions; each 'cluster' entry is still a one-element array.
words

Unnamed: 0,words,vectors,cluster
0,just,"[-0.037347253, 0.03204588, 0.008825552, -0.012...",[1]
1,like,"[-0.037829906, 0.031295884, 0.008807496, -0.01...",[1]
2,any_other,"[-0.038022686, 0.031122405, 0.008552859, -0.01...",[1]
3,day,"[-0.037317507, 0.031464763, 0.008880943, -0.01...",[1]
4,employees,"[-0.036987383, 0.03140659, 0.008221837, -0.012...",[1]
...,...,...,...
2605,23,"[-0.03814631, 0.032437176, 0.008740556, -0.013...",[1]
2606,hat,"[-0.03751088, 0.031898227, 0.0085191205, -0.01...",[1]
2607,women_working,"[-0.037786897, 0.03131441, 0.009386631, -0.012...",[1]
2608,good_days,"[-0.03704566, 0.032124337, 0.009112182, -0.012...",[1]


In [16]:
# Unwrap each one-element prediction array into its scalar label.
# NOTE(review): not idempotent — running this cell twice fails because the labels
# are no longer subscriptable.
words['cluster'] = words['cluster'].apply(lambda label: label[0])

In [17]:
# Inspect the frame after unwrapping the cluster labels to scalars.
words

Unnamed: 0,words,vectors,cluster
0,just,"[-0.037347253, 0.03204588, 0.008825552, -0.012...",1
1,like,"[-0.037829906, 0.031295884, 0.008807496, -0.01...",1
2,any_other,"[-0.038022686, 0.031122405, 0.008552859, -0.01...",1
3,day,"[-0.037317507, 0.031464763, 0.008880943, -0.01...",1
4,employees,"[-0.036987383, 0.03140659, 0.008221837, -0.012...",1
...,...,...,...
2605,23,"[-0.03814631, 0.032437176, 0.008740556, -0.013...",1
2606,hat,"[-0.03751088, 0.031898227, 0.0085191205, -0.01...",1
2607,women_working,"[-0.037786897, 0.03131441, 0.009386631, -0.012...",1
2608,good_days,"[-0.03704566, 0.032124337, 0.009112182, -0.012...",1


In [18]:
# Derive 'cluster_value' from each row's own cluster label:
# label 0 -> 0, label 1 -> 1, any other label -> 2.
# A row-wise mapping is required here: assigning a scalar to the column inside a loop
# overwrites every row each time, leaving all rows with the value for the last label.
# NOTE(review): with this mapping, words in cluster 0 get a sentiment_coeff of 0 in
# the later multiplication; if a signed polarity (e.g. +1 / -1) was intended, adjust
# the mapping — confirm.
words['cluster_value'] = words['cluster'].apply(
    lambda label: 0 if label == 0 else (1 if label == 1 else 2)
)

In [19]:
# Inspect the frame with the new 'cluster_value' column.
words

Unnamed: 0,words,vectors,cluster,cluster_value
0,just,"[-0.037347253, 0.03204588, 0.008825552, -0.012...",1,1
1,like,"[-0.037829906, 0.031295884, 0.008807496, -0.01...",1,1
2,any_other,"[-0.038022686, 0.031122405, 0.008552859, -0.01...",1,1
3,day,"[-0.037317507, 0.031464763, 0.008880943, -0.01...",1,1
4,employees,"[-0.036987383, 0.03140659, 0.008221837, -0.012...",1,1
...,...,...,...,...
2605,23,"[-0.03814631, 0.032437176, 0.008740556, -0.013...",1,1
2606,hat,"[-0.03751088, 0.031898227, 0.0085191205, -0.01...",1,1
2607,women_working,"[-0.037786897, 0.03131441, 0.009386631, -0.012...",1,1
2608,good_days,"[-0.03704566, 0.032124337, 0.009112182, -0.012...",1,1


In [20]:
def _closeness(row):
    """Return the reciprocal of the smallest value `model.transform` gives for this row's vector."""
    # NOTE(review): the reciprocal diverges if a vector coincides exactly with a
    # cluster centre (distance 0) — confirm this cannot happen for this data.
    nearest_distance = model.transform([row.vectors]).min()
    return 1 / nearest_distance


words['closeness_score'] = words.apply(_closeness, axis=1)

In [21]:
# Inspect the frame with the new 'closeness_score' column.
words

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score
0,just,"[-0.037347253, 0.03204588, 0.008825552, -0.012...",1,1,193.582539
1,like,"[-0.037829906, 0.031295884, 0.008807496, -0.01...",1,1,247.247108
2,any_other,"[-0.038022686, 0.031122405, 0.008552859, -0.01...",1,1,129.350482
3,day,"[-0.037317507, 0.031464763, 0.008880943, -0.01...",1,1,226.118326
4,employees,"[-0.036987383, 0.03140659, 0.008221837, -0.012...",1,1,155.174718
...,...,...,...,...,...
2605,23,"[-0.03814631, 0.032437176, 0.008740556, -0.013...",1,1,99.823415
2606,hat,"[-0.03751088, 0.031898227, 0.0085191205, -0.01...",1,1,150.167284
2607,women_working,"[-0.037786897, 0.03131441, 0.009386631, -0.012...",1,1,140.775724
2608,good_days,"[-0.03704566, 0.032124337, 0.009112182, -0.012...",1,1,153.723184


In [22]:
# Sentiment coefficient = closeness score scaled by the cluster value, row by row.
closeness = words['closeness_score']
cluster_weight = words['cluster_value']
words['sentiment_coeff'] = closeness * cluster_weight

In [26]:
# Show rows 200 through 399 (by position) of the scored frame.
words.iloc[200:400]

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
200,hand_how,"[-0.037707478, 0.031078933, 0.00926817, -0.012...",1,1,168.861916,168.861916
201,many_migraines,"[-0.03778149, 0.032305326, 0.009484534, -0.012...",1,1,111.033084,111.033084
202,have,"[-0.037750963, 0.03171545, 0.008546702, -0.012...",1,1,304.992715,304.992715
203,since,"[-0.03788494, 0.03150662, 0.008628834, -0.0128...",1,1,186.585495,186.585495
204,when,"[-0.03725183, 0.03125704, 0.008479883, -0.0126...",1,1,273.868109,273.868109
...,...,...,...,...,...,...
395,reported_it,"[-0.037263047, 0.031491153, 0.008666071, -0.01...",1,1,157.454923,157.454923
396,hr,"[-0.037134275, 0.032188576, 0.008572761, -0.01...",1,1,163.692869,163.692869
397,they_said,"[-0.03744472, 0.032116063, 0.008027106, -0.012...",1,1,136.746273,136.746273
398,causing,"[-0.037467968, 0.03231595, 0.009021842, -0.013...",1,1,118.459111,118.459111


In [24]:
# Write the word -> sentiment coefficient lookup to CSV, without the row index.
words.to_csv(
    'sentiment_dictionary.csv',
    columns=['words', 'sentiment_coeff'],
    index=False,
)