In [1]:
import pandas as pd
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
# Select matplotlib's 'Agg' backend.
# NOTE(review): this runs after `matplotlib.pyplot` has already been imported,
# and no later cell in this notebook draws a figure -- confirm it is still needed.
mp.use('Agg')

In [2]:
# Load the saved Word2Vec model from disk and keep only its keyed vectors
# (the word -> embedding lookup); the rest of the model is not retained.
w2v = Word2Vec.load('word2vec.model').wv

In [3]:
# Sanity check: confirm which gensim class `.wv` returned.
type(w2v)

gensim.models.keyedvectors.Word2VecKeyedVectors

In [4]:
# Interactive exploration of the keyed-vectors API; prints the full class help.
help(w2v)

Help on Word2VecKeyedVectors in module gensim.models.keyedvectors object:

class Word2VecKeyedVectors(WordEmbeddingsKeyedVectors)
 |  Mapping between words and vectors for the :class:`~gensim.models.Word2Vec` model.
 |  Used to perform operations on the vectors such as vector lookup, distance, similarity etc.
 |  
 |  Method resolution order:
 |      Word2VecKeyedVectors
 |      WordEmbeddingsKeyedVectors
 |      BaseKeyedVectors
 |      gensim.utils.SaveLoad
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  get_keras_embedding(self, train_embeddings=False)
 |      Get a Keras 'Embedding' layer with weights set as the Word2Vec model's learned word embeddings.
 |      
 |      Parameters
 |      ----------
 |      train_embeddings : bool
 |          If False, the weights are frozen and stopped from being updated.
 |          If True, the weights can/will be further trained/updated.
 |      
 |      Returns
 |      -------
 |      `keras.layers.Embedding`
 |          Embedd

In [5]:
# K-means with two clusters over the word embeddings.
# The seed is written as an explicit integer: a boolean `True` is coerced to
# the seed 1, so the clustering is unchanged while the argument now has the
# type scikit-learn documents for `random_state`.
model = KMeans(n_clusters=2, max_iter=1000, random_state=1, n_init=50)

In [6]:
# Fit the two centroids on the full embedding matrix (one row per vocabulary word).
model.fit(w2v.vectors)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=1000,
       n_clusters=2, n_init=50, n_jobs=None, precompute_distances='auto',
       random_state=True, tol=0.0001, verbose=0)

In [7]:
# List the 10 vocabulary words most similar to the first centroid, searching
# the whole vocabulary (`restrict_vocab=None`), to get a feel for what
# cluster 0 contains.
w2v.similar_by_vector(model.cluster_centers_[0], topn=10, restrict_vocab=None)

[('solution', 0.9999999403953552),
 ('happy', 0.998881459236145),
 ('verbal_abuse', 0.9988809823989868),
 ('ashamed', 0.998880922794342),
 ('pics', 0.9988807439804077),
 ('kiss_me', 0.9988803267478943),
 ('ripped', 0.998879075050354),
 ('the_workplace', 0.9988779425621033),
 ('frozen', 0.998877763748169),
 ('until', 0.9988775849342346)]

In [8]:
# Name the two fitted centroids.
# NOTE(review): treating centroid 0 as "positive" and centroid 1 as "negative"
# is presumably based on the nearest-word inspection -- confirm after any refit.
positive_cluster_center, negative_cluster_center = (
    model.cluster_centers_[0],
    model.cluster_centers_[1],
)

In [9]:
# One row per vocabulary entry, in the vocabulary's own iteration order.
words = pd.DataFrame(list(w2v.vocab))
words

Unnamed: 0,0
0,i
1,was
2,in
3,class
4,11
...,...
2261,if_rakesh
2262,rakesh_said
2263,this_disease
2264,has_caused


In [10]:
# The frame has a single, default-named column at this point; give it a real name.
words.columns = ['words']

In [11]:
# Show the vocabulary frame after naming its column 'words'.
words

Unnamed: 0,words
0,i
1,was
2,in
3,class
4,11
...,...
2261,if_rakesh
2262,rakesh_said
2263,this_disease
2264,has_caused


In [12]:
# Look up the embedding for every vocabulary word.
# `w2v` is already the KeyedVectors object, so it is indexed directly: going
# through its `.wv` attribute is deprecated in gensim 3.x and slated for
# removal in 4.0. The words come from the vocabulary itself, so they are
# passed as-is.
words['vectors'] = words['words'].apply(lambda word: w2v[word])

  """Entry point for launching an IPython kernel.


In [13]:
# Preview the frame now that each word carries its embedding in 'vectors'.
words

Unnamed: 0,words,vectors
0,i,"[-0.075747065, -0.06882905, 0.04576603, -0.057..."
1,was,"[-0.07550424, -0.06823647, 0.045346428, -0.057..."
2,in,"[-0.07571099, -0.06848269, 0.04584541, -0.0574..."
3,class,"[-0.075262405, -0.06871221, 0.045461968, -0.05..."
4,11,"[-0.07570061, -0.06822603, 0.04535558, -0.0581..."
...,...,...
2261,if_rakesh,"[-0.07578841, -0.06932318, 0.045399107, -0.057..."
2262,rakesh_said,"[-0.07554184, -0.06841819, 0.04632408, -0.0576..."
2263,this_disease,"[-0.07523805, -0.06840317, 0.046243936, -0.057..."
2264,has_caused,"[-0.074823126, -0.06899181, 0.046322018, -0.05..."


In [14]:
# Ask the fitted k-means model for the cluster of each word, one word at a time.
# Each prediction is a one-element array because the model is given a batch of one.
words['cluster'] = words['vectors'].apply(
    lambda embedding: model.predict([np.array(embedding)])
)

In [15]:
# Preview the frame with the raw (array-wrapped) cluster predictions.
words

Unnamed: 0,words,vectors,cluster
0,i,"[-0.075747065, -0.06882905, 0.04576603, -0.057...",[1]
1,was,"[-0.07550424, -0.06823647, 0.045346428, -0.057...",[1]
2,in,"[-0.07571099, -0.06848269, 0.04584541, -0.0574...",[1]
3,class,"[-0.075262405, -0.06871221, 0.045461968, -0.05...",[1]
4,11,"[-0.07570061, -0.06822603, 0.04535558, -0.0581...",[1]
...,...,...,...
2261,if_rakesh,"[-0.07578841, -0.06932318, 0.045399107, -0.057...",[1]
2262,rakesh_said,"[-0.07554184, -0.06841819, 0.04632408, -0.0576...",[1]
2263,this_disease,"[-0.07523805, -0.06840317, 0.046243936, -0.057...",[1]
2264,has_caused,"[-0.074823126, -0.06899181, 0.046322018, -0.05...",[1]


In [16]:
# Unwrap the one-element array produced by `model.predict` into a bare label.
# `np.ravel` keeps this cell safe to re-run: a label that is already a scalar
# passes through unchanged instead of failing on `x[0]`.
words['cluster'] = words['cluster'].apply(lambda label: np.ravel(label)[0])

In [17]:
# Preview the frame with 'cluster' reduced to a plain label per word.
words

Unnamed: 0,words,vectors,cluster
0,i,"[-0.075747065, -0.06882905, 0.04576603, -0.057...",1
1,was,"[-0.07550424, -0.06823647, 0.045346428, -0.057...",1
2,in,"[-0.07571099, -0.06848269, 0.04584541, -0.0574...",1
3,class,"[-0.075262405, -0.06871221, 0.045461968, -0.05...",1
4,11,"[-0.07570061, -0.06822603, 0.04535558, -0.0581...",1
...,...,...,...
2261,if_rakesh,"[-0.07578841, -0.06932318, 0.045399107, -0.057...",1
2262,rakesh_said,"[-0.07554184, -0.06841819, 0.04632408, -0.0576...",1
2263,this_disease,"[-0.07523805, -0.06840317, 0.046243936, -0.057...",1
2264,has_caused,"[-0.074823126, -0.06899181, 0.046322018, -0.05...",1


In [18]:
# Derive 'cluster_value' from each word's own cluster label, row by row:
# 0 -> 0, 1 -> 1, anything else -> 2.
# Assigning a scalar to the column inside a loop over the labels overwrites the
# whole column on every iteration, leaving every row with the value that
# belongs to the last word only; the mapping is therefore applied element-wise.
# NOTE(review): with this mapping cluster 0 gets the value 0, which zeroes any
# product computed from it; a signed mapping (e.g. +1 / -1) may have been
# intended -- confirm.
words['cluster_value'] = np.select(
    [words['cluster'] == 0, words['cluster'] == 1],
    [0, 1],
    default=2,
)

In [19]:
# Preview the frame with the added 'cluster_value' column.
words

Unnamed: 0,words,vectors,cluster,cluster_value
0,i,"[-0.075747065, -0.06882905, 0.04576603, -0.057...",1,1
1,was,"[-0.07550424, -0.06823647, 0.045346428, -0.057...",1,1
2,in,"[-0.07571099, -0.06848269, 0.04584541, -0.0574...",1,1
3,class,"[-0.075262405, -0.06871221, 0.045461968, -0.05...",1,1
4,11,"[-0.07570061, -0.06822603, 0.04535558, -0.0581...",1,1
...,...,...,...,...
2261,if_rakesh,"[-0.07578841, -0.06932318, 0.045399107, -0.057...",1,1
2262,rakesh_said,"[-0.07554184, -0.06841819, 0.04632408, -0.0576...",1,1
2263,this_disease,"[-0.07523805, -0.06840317, 0.046243936, -0.057...",1,1
2264,has_caused,"[-0.074823126, -0.06899181, 0.046322018, -0.05...",1,1


In [20]:
# Closeness score = reciprocal of each word's distance to its nearest centroid.
# All distances come from a single batched `transform` call (rows = words,
# columns = centroids) instead of one model call per DataFrame row.
_nearest_centroid_distance = model.transform(
    np.vstack(words['vectors'].tolist())
).min(axis=1)
# A word lying exactly on a centroid has distance 0; its score is deliberately
# left as `inf`, without emitting a divide-by-zero RuntimeWarning.
# NOTE(review): decide whether `inf` scores should be capped before export.
with np.errstate(divide='ignore'):
    words['closeness_score'] = 1 / _nearest_centroid_distance

  """Entry point for launching an IPython kernel.


In [21]:
# Preview the frame with the added 'closeness_score' column.
words

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score
0,i,"[-0.075747065, -0.06882905, 0.04576603, -0.057...",1,1,180.496383
1,was,"[-0.07550424, -0.06823647, 0.045346428, -0.057...",1,1,210.831901
2,in,"[-0.07571099, -0.06848269, 0.04584541, -0.0574...",1,1,340.764600
3,class,"[-0.075262405, -0.06871221, 0.045461968, -0.05...",1,1,245.770675
4,11,"[-0.07570061, -0.06822603, 0.04535558, -0.0581...",1,1,128.665636
...,...,...,...,...,...
2261,if_rakesh,"[-0.07578841, -0.06932318, 0.045399107, -0.057...",1,1,101.455225
2262,rakesh_said,"[-0.07554184, -0.06841819, 0.04632408, -0.0576...",1,1,119.459317
2263,this_disease,"[-0.07523805, -0.06840317, 0.046243936, -0.057...",1,1,137.255893
2264,has_caused,"[-0.074823126, -0.06899181, 0.046322018, -0.05...",1,1,117.103563


In [22]:
# Sentiment coefficient: closeness to the nearest centroid scaled by the cluster value.
words['sentiment_coeff'] = words['closeness_score'].mul(words['cluster_value'])

In [23]:
# Spot-check a slice from the middle of the vocabulary (rows 150 through 199).
words.iloc[150:200]

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
150,who,"[-0.07556464, -0.06873704, 0.04588552, -0.0578...",1,1,238.925753,238.925753
151,once,"[-0.0757213, -0.06869393, 0.045420565, -0.0579...",1,1,218.337019,218.337019
152,very_close,"[-0.074609675, -0.06896993, 0.045199975, -0.05...",1,1,103.731178,103.731178
153,younger_brother,"[-0.07555058, -0.06919016, 0.04644517, -0.0583...",1,1,107.517364,107.517364
154,of_mine,"[-0.07719445, -0.06819945, 0.04667286, -0.0576...",1,1,53.626643,53.626643
155,point,"[-0.075444974, -0.06839889, 0.046304647, -0.05...",1,1,123.013986,123.013986
156,he,"[-0.075445764, -0.06874796, 0.045975536, -0.05...",1,1,301.967154,301.967154
157,to_pull,"[-0.075072676, -0.068783544, 0.04531798, -0.05...",1,1,146.548512,146.548512
158,my_cheeks,"[-0.074997894, -0.06811643, 0.045605626, -0.05...",1,1,218.510706,218.510706
159,at_first,"[-0.07547845, -0.06847387, 0.046001263, -0.057...",1,1,215.47524,215.47524


In [24]:
# Export the word -> sentiment coefficient dictionary, without the index column.
words.to_csv(
    'sentiment_dictionary.csv',
    columns=['words', 'sentiment_coeff'],
    index=False,
)