# Word2Vec
**En este notebook se realizarán unas pruebas simples con el modelo Word2Vec de la libreria gensim y el Spooky Author's dataset

In [1]:
import gensim 
import base64
import nltk
import numpy as np
import pandas as pd
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from collections import Counter
from scipy.misc import imread
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from matplotlib import pyplot as plt
from sklearn.externals import joblib
from sklearn.manifold import TSNE
%matplotlib inline



In [2]:
#Agrego lemmatization
from nltk.stem import WordNetLemmatizer
lemm = WordNetLemmatizer()
class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(LemmaCountVectorizer, self).build_analyzer()
        return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc))

In [3]:
# Cargo datos 
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample = pd.read_csv('sample_submission.csv')

**Gensim pre-processing: Divide las oraciones en tokens, convierte todo en lowercase**

In [None]:
text_train= list(train.text.values)
print(text_train[:3],"\n")
#Tokenizacion para cada oracion: Le paso un string, devuelve una lista de strings (tokens)
for i in range(len(text_train)):
   text_train[i]= gensim.utils.simple_preprocess(text_train[i])
print("Matriz tokenizada de frases:",text_train[:3])

# Genero vocabulario y entreno el modelo
**Parametros relevantes:**

 -Size corresponde al tamaño o dimensionalidad de vector de caracteristicas
 -Window: La cantidad de palabras correspondientes a la ventana de contexto
 -#min_count: palabras con frecuencia menor a este valor se ignoran


In [7]:
modelo = gensim.models.Word2Vec(
        text_train,
        sg=1,
        size=125,
        window=2,
        min_count=5,
        workers=8,
         hs=1,
        negative=10)
modelo.train(text_train, total_examples=len(text_train), epochs=10)
say_vector = modelo['say']  # get vector for word 
print(len(say_vector))

#guardo modelo
#word2vecsaved = 'word2vec_model.sav'
#joblib.dump(modelo, word2vecsaved)

125



Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



### Una vez entrenado el modelo, hacemos Algunas pruebas ###

In [8]:
print(modelo.wv.similarity('man','man'))
print(modelo.wv.similarity('appeared','seemed'))
print(modelo.wv.similarity('cat','dog'))
print(modelo.wv.similarity('god','sun'))
modelo.wv.most_similar(positive='man',topn=5)

1.0
0.5323820025714207
0.30694479761200993
0.14496146023935153


[('gentleman', 0.5479372143745422),
 ('poet', 0.5269320011138916),
 ('woman', 0.5269132852554321),
 ('painter', 0.5195095539093018),
 ('villain', 0.5179144144058228)]

In [9]:
modelo.wv.doesnt_match("big ugly monster cat".split())

'cat'

In [10]:
w2v_wordmatrix=modelo.wv.vectors
print(w2v_wordmatrix.shape)
w2v_wordmatrix

(8465, 125)


array([[-0.1866464 , -0.07258397, -0.02347344, ...,  0.02865851,
         0.11161629, -0.03977592],
       [-0.3784538 ,  0.08856384, -0.14281073, ...,  0.2070911 ,
         0.24291275, -0.18555847],
       [-0.09305538,  0.0163871 ,  0.08069561, ...,  0.04641417,
         0.07374206,  0.1390312 ],
       ...,
       [ 0.10724758,  0.08260011,  0.2057975 , ..., -0.16666321,
        -0.09043068, -0.04211831],
       [-0.21171324, -0.38264614,  0.07061048, ..., -0.54591143,
         0.14919266,  0.03981583],
       [ 0.3579407 ,  0.11960463,  0.14988564, ...,  0.2265596 ,
        -0.04013533,  0.00141258]], dtype=float32)

# Fin