In [10]:
import pandas as pd
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
mp.use('Agg')

In [2]:
w2v = Word2Vec.load('word2vec.model').wv

In [3]:
type(w2v)

gensim.models.keyedvectors.Word2VecKeyedVectors

In [4]:
help(w2v)

Help on Word2VecKeyedVectors in module gensim.models.keyedvectors object:

class Word2VecKeyedVectors(WordEmbeddingsKeyedVectors)
 |  Mapping between words and vectors for the :class:`~gensim.models.Word2Vec` model.
 |  Used to perform operations on the vectors such as vector lookup, distance, similarity etc.
 |  
 |  Method resolution order:
 |      Word2VecKeyedVectors
 |      WordEmbeddingsKeyedVectors
 |      BaseKeyedVectors
 |      gensim.utils.SaveLoad
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  get_keras_embedding(self, train_embeddings=False)
 |      Get a Keras 'Embedding' layer with weights set as the Word2Vec model's learned word embeddings.
 |      
 |      Parameters
 |      ----------
 |      train_embeddings : bool
 |          If False, the weights are frozen and stopped from being updated.
 |          If True, the weights can/will be further trained/updated.
 |      
 |      Returns
 |      -------
 |      `keras.layers.Embedding`
 |          Embedd

In [5]:
model = KMeans(n_clusters=2, max_iter=1000, random_state=True, n_init=50)

In [6]:
def plot_model_history(model_history):
    """
    Plot Accuracy and Loss curves given the model_history
    """
    fig, axs = plt.subplots(1,2,figsize=(15,5))
    # summarize history for accuracy
    axs[0].plot(range(1,len(model_history.history['acc'])+1),model_history.history['acc'])
    axs[0].plot(range(1,len(model_history.history['val_acc'])+1),model_history.history['val_acc'])
    axs[0].set_title('Model Accuracy')
    axs[0].set_ylabel('Accuracy')
    axs[0].set_xlabel('Epoch')
    axs[0].set_xticks(np.arange(1,len(model_history.history['acc'])+1),len(model_history.history['acc'])/10)
    axs[0].legend(['train', 'val'], loc='best')
    # summarize history for loss
    axs[1].plot(range(1,len(model_history.history['loss'])+1),model_history.history['loss'])
    axs[1].plot(range(1,len(model_history.history['val_loss'])+1),model_history.history['val_loss'])
    axs[1].set_title('Model Loss')
    axs[1].set_ylabel('Loss')
    axs[1].set_xlabel('Epoch')
    axs[1].set_xticks(np.arange(1,len(model_history.history['loss'])+1),len(model_history.history['loss'])/10)
    axs[1].legend(['train', 'val'], loc='best')
    fig.savefig('kmeans.png')
    plt.show()

In [12]:
model.fit(X = w2v.vectors)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=1000,
       n_clusters=2, n_init=50, n_jobs=None, precompute_distances='auto',
       random_state=True, tol=0.0001, verbose=0)

In [13]:
w2v.similar_by_vector(model.cluster_centers_[0], topn=10, restrict_vocab=None)

[('to', 0.996428370475769),
 ('me', 0.99398273229599),
 ('in', 0.9932867288589478),
 ('the', 0.9924371242523193),
 ('and', 0.9920103549957275),
 ('my', 0.9919503927230835),
 ('i', 0.9903295636177063),
 ('asked', 0.989032506942749),
 ('i_was', 0.9860483407974243),
 ('for', 0.984266996383667)]

In [14]:
positive_cluster_center = model.cluster_centers_[0]
negative_cluster_center = model.cluster_centers_[1]

In [15]:
words = pd.DataFrame(w2v.vocab.keys())
words

Unnamed: 0,0
0,just
1,like
2,any
3,other
4,day
...,...
201,say
202,things
203,bully
204,rakesh


In [16]:
words.columns = ['words']

In [17]:
words

Unnamed: 0,words
0,just
1,like
2,any
3,other
4,day
...,...
201,say
202,things
203,bully
204,rakesh


In [19]:
words['vectors'] = words.words.apply(lambda x: w2v.wv[f'{x}'])

  """Entry point for launching an IPython kernel.


In [20]:
words

Unnamed: 0,words,vectors
0,just,"[0.06401228, 0.10244543, -0.12459137, 0.049309..."
1,like,"[0.041160908, 0.076062895, -0.16023546, 0.0556..."
2,any,"[-0.004705759, 0.012123186, -0.056069445, -0.0..."
3,other,"[0.036368866, 0.110916555, -0.108262785, 0.092..."
4,day,"[0.07634358, 0.10929913, -0.15942724, 0.026457..."
...,...,...
201,say,"[0.06571426, 0.08929079, -0.1438337, 0.0303185..."
202,things,"[0.043621585, 0.08232436, -0.16201611, 0.04404..."
203,bully,"[0.035272025, 0.112041526, -0.13689706, 0.0334..."
204,rakesh,"[0.06330622, 0.098207384, -0.15660104, 0.03151..."


In [21]:
words['cluster'] = words.vectors.apply(lambda x: model.predict([np.array(x)]))

In [22]:
words

Unnamed: 0,words,vectors,cluster
0,just,"[0.06401228, 0.10244543, -0.12459137, 0.049309...",[0]
1,like,"[0.041160908, 0.076062895, -0.16023546, 0.0556...",[0]
2,any,"[-0.004705759, 0.012123186, -0.056069445, -0.0...",[0]
3,other,"[0.036368866, 0.110916555, -0.108262785, 0.092...",[0]
4,day,"[0.07634358, 0.10929913, -0.15942724, 0.026457...",[0]
...,...,...,...
201,say,"[0.06571426, 0.08929079, -0.1438337, 0.0303185...",[0]
202,things,"[0.043621585, 0.08232436, -0.16201611, 0.04404...",[0]
203,bully,"[0.035272025, 0.112041526, -0.13689706, 0.0334...",[0]
204,rakesh,"[0.06330622, 0.098207384, -0.15660104, 0.03151...",[0]


In [23]:
words.cluster = words.cluster.apply(lambda x: x[0])

In [24]:
words

Unnamed: 0,words,vectors,cluster
0,just,"[0.06401228, 0.10244543, -0.12459137, 0.049309...",0
1,like,"[0.041160908, 0.076062895, -0.16023546, 0.0556...",0
2,any,"[-0.004705759, 0.012123186, -0.056069445, -0.0...",0
3,other,"[0.036368866, 0.110916555, -0.108262785, 0.092...",0
4,day,"[0.07634358, 0.10929913, -0.15942724, 0.026457...",0
...,...,...,...
201,say,"[0.06571426, 0.08929079, -0.1438337, 0.0303185...",0
202,things,"[0.043621585, 0.08232436, -0.16201611, 0.04404...",0
203,bully,"[0.035272025, 0.112041526, -0.13689706, 0.0334...",0
204,rakesh,"[0.06330622, 0.098207384, -0.15660104, 0.03151...",0


In [28]:
# words['cluster_value'] = [1 if i==0 1 elif i == 1 else 2 for i in words.cluster]
for w in words.cluster:
    if w == 0:
        words['cluster_value'] = 0
    elif w == 1:
        words['cluster_value'] = 1
    else:
        words['cluster_value'] = 2

In [29]:
words

Unnamed: 0,words,vectors,cluster,cluster_value
0,just,"[0.06401228, 0.10244543, -0.12459137, 0.049309...",0,0
1,like,"[0.041160908, 0.076062895, -0.16023546, 0.0556...",0,0
2,any,"[-0.004705759, 0.012123186, -0.056069445, -0.0...",0,0
3,other,"[0.036368866, 0.110916555, -0.108262785, 0.092...",0,0
4,day,"[0.07634358, 0.10929913, -0.15942724, 0.026457...",0,0
...,...,...,...,...
201,say,"[0.06571426, 0.08929079, -0.1438337, 0.0303185...",0,0
202,things,"[0.043621585, 0.08232436, -0.16201611, 0.04404...",0,0
203,bully,"[0.035272025, 0.112041526, -0.13689706, 0.0334...",0,0
204,rakesh,"[0.06330622, 0.098207384, -0.15660104, 0.03151...",0,0


In [30]:
words['closeness_score'] = words.apply(lambda x: 1/(model.transform([x.vectors]).min()), axis=1)

In [31]:
words

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score
0,just,"[0.06401228, 0.10244543, -0.12459137, 0.049309...",0,0,3.027561
1,like,"[0.041160908, 0.076062895, -0.16023546, 0.0556...",0,0,2.220271
2,any,"[-0.004705759, 0.012123186, -0.056069445, -0.0...",0,0,1.401266
3,other,"[0.036368866, 0.110916555, -0.108262785, 0.092...",0,0,1.359943
4,day,"[0.07634358, 0.10929913, -0.15942724, 0.026457...",0,0,2.490357
...,...,...,...,...,...
201,say,"[0.06571426, 0.08929079, -0.1438337, 0.0303185...",0,0,4.410727
202,things,"[0.043621585, 0.08232436, -0.16201611, 0.04404...",0,0,2.759315
203,bully,"[0.035272025, 0.112041526, -0.13689706, 0.0334...",0,0,4.678192
204,rakesh,"[0.06330622, 0.098207384, -0.15660104, 0.03151...",0,0,4.602103


In [32]:
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

In [37]:
words[150:200]

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
150,wasn_t,"[0.077366784, 0.10052142, -0.07932672, 0.06703...",0,0,1.95022,0.0
151,home,"[-0.0022268514, 0.1320761, -0.10145202, 0.0726...",0,0,2.055336,0.0
152,always,"[0.010727334, 0.10077481, -0.13231076, 0.06112...",0,0,2.791127,0.0
153,who,"[0.041929875, 0.06628057, -0.0635532, 0.032363...",0,0,1.556763,0.0
154,said,"[0.073347755, 0.08684462, -0.11043333, 0.01335...",0,0,1.807974,0.0
155,or,"[0.09029762, 0.11544328, -0.0903675, 0.0237198...",0,0,1.973365,0.0
156,some,"[0.06544155, 0.08807347, -0.111523494, 0.04467...",0,0,3.464964,0.0
157,noticed,"[0.0069835912, 0.10363454, -0.14654888, 0.0522...",0,0,2.110968,0.0
158,where,"[0.013180582, 0.040102858, -0.01023495, -0.055...",1,0,1.159982,0.0
159,did,"[0.058951147, 0.12304556, -0.15273815, 0.01902...",0,0,1.892291,0.0


In [38]:
words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)