# Word Embeddings

In [1]:
# Directory the input CSV files are read from.
data_in = "./data_in/"
# Output directory (not referenced by the cells shown in this notebook).
data_out = "./data_out/"
# File-name prefix shared by the corpus CSV files.
data_prefix = "zapatistas"

In [2]:
# Index levels of the token table (TOKENS is indexed by these columns).
OHCO = [
    'text_id',
    'section_num',
    'para_num',
    'sent_num',
    'token_num',
]
# Paragraph-level bag: every OHCO level up to and including 'para_num'.
PARA = OHCO[:OHCO.index('para_num') + 1]
# Grouping levels used to form the documents fed to the model.
BAG = PARA

In [3]:
import pandas as pd
import numpy as np
from gensim.models import word2vec
from sklearn.manifold import TSNE
import plotly_express as px



In [19]:
class W2VExplorer:
    """Train a word2vec model on bags of tokens and explore the result.

    Tokens are grouped into documents by the index levels given in ``bag``.
    A gensim ``Word2Vec`` model is trained on those documents and its word
    vectors are projected with t-SNE so they can be plotted.

    All hyperparameters are class attributes; override them on an instance
    before calling :meth:`generate_model`.
    """

    # word2vec settings, forwarded to gensim's Word2Vec
    w2v_min_count:int = 10
    w2v_workers:int = 4
    w2v_size:int = 246
    w2v_window:int = 2
    # t-SNE settings, forwarded to sklearn's TSNE
    tsne_perplexity:int = 40
    tsne_n_components:int = 2
    tsne_init:str = 'pca'
    tsne_n_iter:int = 2500
    tsne_random_state:int = 23

    def __init__(self, tokens, bag):
        """Store the inputs; no computation happens here.

        Args:
            tokens: DataFrame with a ``term_str`` column that can be grouped
                by the names in ``bag``.
            bag: list of grouping keys defining one document.
        """
        self.TOKENS:pd.DataFrame = tokens
        self.BAG = bag

    def generate_model(self):
        """Build documents, train word2vec, and compute t-SNE coordinates.

        Progress is printed at each stage.

        Returns:
            self, so calls can be chained.
        """
        print("Bag:", self.BAG[-1])
        print('Getting DOCS')
        self._get_docs()
        print("Getting MODEL")
        self._get_model()
        print("Getting tSNE Coords")
        self._get_tsne_coords()
        print(u'\u2713')
        return self

    def _get_docs(self):
        """Set ``self.DOCS`` to one list of ``term_str`` values per bag."""
        terms_by_bag = self.TOKENS.groupby(self.BAG).term_str
        self.DOCS = terms_by_bag.apply(lambda terms: terms.tolist()).tolist()

    def _get_model(self):
        """Train a Word2Vec model on ``self.DOCS`` and store it in ``self.model``."""
        self.model = word2vec.Word2Vec(
            self.DOCS,
            vector_size=self.w2v_size,
            window=self.w2v_window,
            min_count=self.w2v_min_count,
            workers=self.w2v_workers,
        )

    def _get_tsne_coords(self):
        """Project the model's word vectors with t-SNE.

        Sets ``self.coords``, a DataFrame with one row per vocabulary term and
        the columns ``label``, ``vector``, ``x`` and ``y``. Also keeps the
        fitted estimator in ``self.tsne_model`` and the raw projection in
        ``self.tsne_values``.
        """
        wv = self.model.wv
        self.coords = pd.DataFrame({'label': list(wv.index_to_key)})
        self.coords['vector'] = self.coords['label'].apply(wv.get_vector)
        # Use the configurable attribute here; a literal 40 would silently
        # ignore any per-instance override of tsne_perplexity.
        # NOTE(review): newer scikit-learn releases rename `n_iter` to
        # `max_iter` -- confirm against the installed version.
        self.tsne_model = TSNE(perplexity=self.tsne_perplexity,
                               n_components=self.tsne_n_components,
                               init=self.tsne_init,
                               n_iter=self.tsne_n_iter,
                               random_state=self.tsne_random_state)
        self.tsne_values = self.tsne_model.fit_transform(self.coords['vector'].tolist())
        self.coords['x'] = self.tsne_values[:, 0]
        self.coords['y'] = self.tsne_values[:, 1]

    def plot_tsne(self):
        """Show a text-only scatter plot of the t-SNE coordinates."""
        fig = px.scatter(self.coords, x='x', y='y', text='label', height=1000)
        fig.update_traces(mode='text').show()

    def complete_analogy(self, A, B, C, n=2):
        """Answer "A is to B as C is to ?" from the trained vectors.

        Args:
            A, B, C: terms of the analogy; B and C are used as positive
                terms and A as the negative term.
            n: number of candidate terms to return (an integer).

        Returns:
            DataFrame with columns ``term`` and ``sim``, or None (after
            printing the error) when the lookup raises KeyError.
        """
        try:
            # Ask for n results directly so that n above the library default
            # is not silently truncated.
            hits = self.model.wv.most_similar(positive=[B, C], negative=[A], topn=n)
        except KeyError as e:
            print('Error:', e)
            return None
        return pd.DataFrame(hits, columns=['term', 'sim'])

    def get_most_similar(self, positive, negative=None):
        """Return the terms most similar to ``positive`` (minus ``negative``).

        Returns:
            DataFrame with columns ``term`` and ``sim``.
        """
        hits = self.model.wv.most_similar(positive=positive, negative=negative)
        return pd.DataFrame(hits, columns=['term', 'sim'])

In [5]:
# Token table, indexed by the OHCO levels.
TOKENS = pd.read_csv(f'{data_in}/{data_prefix}-TOKEN.csv').set_index(OHCO)
# Library table, indexed by text_id.
LIB = pd.read_csv(f'{data_in}/{data_prefix}-LIB.csv').set_index('text_id')

In [6]:
# Drop tokens whose POS tag starts with NNP / NNPS.
# na=False makes rows with a missing `pos` value evaluate to "no match" (and so
# be kept) instead of putting NaN into the boolean mask, which breaks `~`.
TOKENS = TOKENS[~TOKENS.pos.str.match(r'NNPS?', na=False)]

In [20]:
# Build the explorer on the filtered tokens, grouped at the BAG level.
model = W2VExplorer(tokens=TOKENS, bag=BAG)
# Override the class-level minimum term count for this instance only.
model.w2v_min_count = 60
# Build docs, train word2vec and compute t-SNE coordinates (returns the explorer).
model.generate_model()

Bag: para_num
Getting DOCS
Getting MODEL
Getting tSNE Coords
✓


<__main__.W2VExplorer at 0x1e5fc694ba8>

In [21]:
# Show the vocabulary as text labels at their t-SNE coordinates.
model.plot_tsne()

In [22]:
# Analogy query: 'men' is to 'women' as 'government' is to ? (top 5 candidates)
model.complete_analogy(A='men', B='women', C='government', n=5)

Unnamed: 0,term,sim
0,by,0.987886
1,federal,0.986797
2,political,0.983968
3,governments,0.983539
4,from,0.981715


In [23]:
# Terms whose vectors are closest to 'indigenous'.
model.get_most_similar(positive='indigenous')

Unnamed: 0,term,sim
0,campesinos,0.998468
1,workers,0.998367
2,against,0.998051
3,demand,0.997755
4,forces,0.997585
5,communities,0.997463
6,mexican,0.997413
7,troops,0.997192
8,land,0.997171
9,which,0.99696


In [33]:
# Terms whose vectors are closest to 'democratic'.
model.get_most_similar(positive='democratic')

Unnamed: 0,term,sim
0,party,0.998872
1,movement,0.99882
2,among,0.998817
3,new,0.998804
4,law,0.998769
5,force,0.998714
6,during,0.998675
7,different,0.998598
8,power,0.998575
9,group,0.998508


In [38]:
# Analogy query: 'brothers' is to 'sisters' as 'revolutionary' is to ? (top 5 candidates)
model.complete_analogy(A='brothers', B='sisters', C='revolutionary', n=5)

Unnamed: 0,term,sim
0,armed,0.971952
1,democratic,0.971115
2,during,0.970991
3,words,0.970938
4,same,0.970927
