<a href="https://colab.research.google.com/github/bilaloumehdi/TP_NLP/blob/master/TP2/TP2_Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



### **Install gensim Library**

In [1]:
!pip install gensim



### **Train and save the Word2Vec model**

In [94]:
from gensim import utils

def read_corpus(filename, encoding='utf-8'):
  """Lazily tokenize a text file, yielding one token list per line.

  Args:
    filename: Path of the text file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale encoding.

  Yields:
    The value returned by ``utils.simple_preprocess`` for each line of the
    file, in file order.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the file content is not valid in ``encoding``.
    Because this is a generator, both are raised when iteration starts, not
    when ``read_corpus`` is called.
  """
  # Explicit encoding: without it, open() falls back to the locale default,
  # which differs between platforms and can mis-decode non-ASCII text.
  with open(filename, 'r', encoding=encoding) as file:
    for line in file:
      yield utils.simple_preprocess(line)


In [95]:
from gensim.models import Word2Vec

# Materialise the generator into a list: the corpus is reused by the training
# and vectorisation cells, and a generator could only be consumed once.
sentences = [tokens for tokens in read_corpus('./text')]
print(sentences)

[['morocco', 'and', 'marrakech', 'tapestry', 'of', 'tradition', 'and', 'modernity', 'morocco', 'located', 'at', 'the', 'crossroads', 'of', 'europe', 'and', 'africa', 'is', 'country', 'drenched', 'in', 'history', 'mystery', 'and', 'cultural', 'richness', 'testament', 'to', 'the', 'ancient', 'civilizations', 'that', 'once', 'flourished', 'here', 'this', 'north', 'african', 'kingdom', 'boasts', 'unique', 'blend', 'of', 'arab', 'berber', 'and', 'european', 'influences', 'at', 'the', 'heart', 'of', 'morocco', 'rich', 'tapestry', 'lies', 'marrakech', 'one', 'of', 'its', 'four', 'imperial', 'cities', 'and', 'vibrant', 'epicenter', 'of', 'tradition', 'and', 'modernity', 'geographical', 'significance', 'morocco', 'is', 'bordered', 'by', 'the', 'atlantic', 'ocean', 'to', 'the', 'west', 'the', 'mediterranean', 'sea', 'to', 'the', 'north', 'algeria', 'to', 'the', 'east', 'and', 'southeast', 'and', 'the', 'vast', 'sahara', 'desert', 'to', 'the', 'south', 'its', 'strategic', 'location', 'has', 'hist

In [96]:
# Train the Word2Vec model on the tokenized corpus.
#   min_count=1     keep every token, even those that occur only once
#   vector_size=100 dimensionality of the learned word vectors
#   seed=1          explicit seed (assumed to equal gensim's default, so
#                   initialisation is unchanged) to make the run repeatable
#   workers=1       a single worker thread; with several threads the update
#                   order depends on OS scheduling and results change from run
#                   to run
# NOTE(review): gensim also documents PYTHONHASHSEED as necessary for
# reproducibility across interpreter launches; set it in the environment if
# bit-identical vectors between sessions are required.
model = Word2Vec(sentences, min_count=1, vector_size=100, seed=1, workers=1)

# Persist the full model so that later cells can reload it from disk.
model.save('./word_to_vec.model')

### **Vectorize words**

In [97]:
# Reload the trained model from the file written by the training cell, so the
# cells below work from the persisted copy.
model = Word2Vec.load('./word_to_vec.model')

In [98]:
# Vectorize the corpus. Each element of `sentences` is a list of tokens, and
# indexing model.wv with such a list gives a 2-D array with one row per token,
# so `words_vector` ends up holding one array per sentence.
words_vector = [
  model.wv[tokens]
  for tokens in sentences
]
print(words_vector)

[array([[-0.00873809,  0.00226957, -0.00084624, ..., -0.00882923,
         0.00301891, -0.00663717],
       [-0.00864786,  0.00394358,  0.0052691 , ..., -0.00258319,
        -0.00938736,  0.00453307],
       [-0.00821526,  0.00950234, -0.00014681, ..., -0.00763057,
        -0.00236962, -0.00548861],
       ...,
       [-0.00452504, -0.00653396,  0.00555252, ..., -0.0029807 ,
         0.00523548, -0.0007321 ],
       [-0.00305008,  0.00378533,  0.00385601, ...,  0.00901304,
        -0.00359052,  0.00411143],
       [-0.00703125, -0.00560686,  0.00971714, ...,  0.00468867,
        -0.00697161,  0.00633496]], dtype=float32)]


In [109]:
# Export only the word vectors (model.wv) in word2vec text format
# (binary=False); the file is read back below through KeyedVectors.
model.wv.save_word2vec_format('words_vectors.text',binary=False)

### **Similarity**

In [113]:
# Load the exported word vectors back as a standalone KeyedVectors object
# (text format, matching the binary=False used when the file was written).
from gensim.models.keyedvectors import KeyedVectors
loaded_word_vectors = KeyedVectors.load_word2vec_format("words_vectors.text", binary=False)


In [115]:
# Similarity query: the five entries (topn=5) ranked closest to "morocco",
# returned as (word, score) pairs.
print('most similarities to morocco : ')
word_morocco = loaded_word_vectors.most_similar("morocco",topn=5)
# Bare last expression so the notebook displays the result list.
word_morocco

most similarities to morocco : 


[('comes', 0.3211769163608551),
 ('snow', 0.3105997145175934),
 ('mountains', 0.2844999432563782),
 ('against', 0.2412889003753662),
 ('berber', 0.20771968364715576)]

In [116]:
# Same similarity query for "europe". topn is not passed here; the recorded
# output lists ten (word, score) pairs.
word_europe = loaded_word_vectors.most_similar('europe')
word_europe

[('west', 0.28858399391174316),
 ('blend', 0.24392668902873993),
 ('testament', 0.2290310263633728),
 ('red', 0.21492047607898712),
 ('many', 0.1961124837398529),
 ('crucial', 0.1894175410270691),
 ('case', 0.18915274739265442),
 ('arab', 0.17835131287574768),
 ('spices', 0.17564605176448822),
 ('heart', 0.1740351915359497)]

In [117]:
# Same similarity query for "marrakech". topn is not passed here; the recorded
# output lists ten (word, score) pairs.
word_marrakech = loaded_word_vectors.most_similar('marrakech')
word_marrakech

[('evening', 0.29856181144714355),
 ('but', 0.2722093164920807),
 ('labyrinthine', 0.2573889493942261),
 ('averse', 0.24176569283008575),
 ('permeate', 0.2268962860107422),
 ('without', 0.22356857359409332),
 ('alleys', 0.21861228346824646),
 ('capped', 0.20676937699317932),
 ('designer', 0.1892811506986618),
 ('sahara', 0.18486753106117249)]