In [9]:
#! pip install gensim  
#! pip install nltk 

In [80]:
import nltk
import gensim
import numpy as np
import random
from sklearn.decomposition import PCA

%matplotlib notebook
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D



### Data
This data is about "Lista de precios" from electrical field

In [13]:
text_file = open("Lista de precios word2vec.txt","r",encoding='utf-8')
corpus = text_file.readlines()
random.shuffle(corpus)


### Word and tokens
We will convert all of the words into lower-case using NLTK.

In [18]:
import nltk
#nltk.download('punkt')
data = []
for doc in corpus:
    t = []
    for word in nltk.tokenize.word_tokenize(doc,language="spanish"):
        t.append(word.lower())
    data.append(t)


In [25]:
data[10]

['unidades', 'de', 'obra', 'ingenieria', 'y', 'obras', 'de', 'distribución']

### The model
We are going to use gensim

### SkipGram
Creating a SikipGra method Word2vec

In [32]:
model = gensim.models.Word2Vec(data,vector_size=100, window=5,min_count=5,sg=1)

### Testing the model
Compare similarity between words

In [42]:
print(model.wv.similarity('poste','madera'))
print(model.wv.similarity('kva','kv'))
print(model.wv.similarity('poste','montaje'))
print(model.wv.similarity('cable','precios'))

0.99276
0.954768
0.99447864
0.8342199


Lets see the vector for a given word

In [43]:
word_vectors = model.wv
word_vectors.word_vec('poste')

  word_vectors.word_vec('poste')


array([-0.20059785,  0.11278027, -0.07441311,  0.1422241 , -0.11879149,
       -0.24864164,  0.15658547,  0.26585075, -0.14554498, -0.27847844,
        0.08352139, -0.12401766,  0.09816351,  0.04716201, -0.04990553,
       -0.11632436,  0.12220698, -0.23120986, -0.15066807, -0.32741946,
       -0.02226929,  0.0207325 ,  0.08792902, -0.06576914,  0.00531124,
       -0.03992412, -0.1199858 , -0.1587133 , -0.12957661, -0.09448987,
        0.07515287,  0.01558147,  0.19144884, -0.13571484, -0.04840292,
        0.19639423, -0.01304033, -0.14905684, -0.11828177, -0.08448145,
        0.05726487, -0.20362283, -0.06399065, -0.02193841,  0.18674952,
       -0.11188079, -0.0160448 ,  0.08549301,  0.01480161,  0.11716761,
        0.22016124, -0.02673917, -0.00819193,  0.11385458, -0.1722633 ,
        0.2084154 ,  0.17727394, -0.00713337, -0.09620961,  0.14633164,
       -0.02572851,  0.00629757,  0.09253932, -0.00404375, -0.24400717,
        0.0558572 ,  0.04975931,  0.17482637, -0.1885932 ,  0.19

List of words similar to a given model

In [75]:
print(word_vectors.most_similar('cámara'))

[('existentes', 0.9979122877120972), (';', 0.9977399110794067), ('civil', 0.9977132678031921), ('acuerdo', 0.9977064728736877), ('6', 0.9976826310157776), ('exterior', 0.9975786209106445), ('río', 0.9975457787513733), ('equipo', 0.9975296258926392), ('campos', 0.9975120425224304), ('linea', 0.9974679946899414)]


### Word cloud
3D graphs of words we are interested in

In [76]:
vocabulario = [
    'poste','montaje','centro','cable','vereda','madera','conductor','transformador','cámara','tablero','existentes'
]


In [77]:
vectors = []

for v in vocabulario:
    vectors.append(word_vectors.word_vec(v))
print(type(vectors))
vectors = np.array(vectors)
print(type(vectors))

<class 'list'>
<class 'numpy.ndarray'>


  vectors.append(word_vectors.word_vec(v))


Turning the 100 dimension vector into a 3 dimension with principal component analysis

In [78]:
pca = PCA(n_components=3)
pca_vectors = pca.fit_transform(vectors)

Creating a 3D plot to see the position of every word

In [84]:
fig = plt.figure(figsize=(10,9))
ax = fig.add_subplot(111,projection='3d')
for i in range(len(pca_vectors)):
    w = pca_vectors[i]
    ax.scatter(w[0],w[1],w[2])
    ax.text(w[0],w[1],w[2],vocabulario[i],fontsize=10)
    

In [83]:
from IPython.core.display import display,HTML
display(HTML("<style>.container:{width:100% !important;}</style>"))

  from IPython.core.display import display,HTML
