In [1]:
import gensim
from gensim.models import word2vec
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import nltk
from nltk.corpus import stopwords

In [3]:
import warnings
warnings.filterwarnings("ignore")

### Data Preparation

In [4]:
def read_data(path):
    with open(path, encoding = "utf8") as file:
        text = file.read()
    return text

In [5]:
textual_data = read_data("./data.txt")

In [6]:
print(textual_data)

Deepika Padukone and Ranveer Singh wedding was one of the biggest Bollywood events that happened in 2018. The Deepika and Ranveer celebrations not only had all of us hooked on to our phones, waiting for what's to come but also gave enough and more reason to believe just how stylish these two are as a couple. From airport looks to reception parties and everything in between here's an entire timeline of Deepika and Ranveer's wedding style file! 
Not Ambanis, Deepika Ranveer or Priyanka Nick. Man proves his was Wedding of The Year 
This year was the year of big fat lavish and extravagant weddings. From Isha Ambani and Anand Piramal to Deepika Padukone and Ranveer Singh, from Priyanka Chopra and Nick Jonas to Kapil Sharma and Ginni Chatrath, 2018 saw many grand weddings. But nothing beats this man, who just w on the wedding of The Year award from social media.
Priyanka also shared a video featuring Nick Jonas was also celebrating with them 
The family first celebrated Christmas in London
P

####  Sentence Tokenizaiton

In [7]:
sentences = nltk.sent_tokenize(textual_data)
print(len(sentences))

18


#### Word Tokenization

In [8]:
en_stopwords = set(stopwords.words("english"))

In [9]:
print(en_stopwords)

{'all', 'down', 'yourselves', 'having', 'himself', 'for', 'aren', 'its', 'above', 'few', 'too', 'into', 'off', 'doing', 'don', 'or', 'him', 'o', 'your', 'with', 'didn', 'a', 'just', 's', 'that', "mightn't", 'between', 'not', 'them', 'most', 'she', 're', 'whom', 'it', "it's", 'is', 'should', 'myself', 'me', 'being', 'because', 'hasn', 'where', 'both', "you've", 'at', "aren't", "wasn't", 'theirs', "shouldn't", 'they', 'her', "weren't", 'ourselves', 'until', 'd', "you're", 'my', 'themselves', 'ain', 'nor', 'and', 'weren', 'in', 'through', 'during', 'ours', 'than', 'further', 'itself', 'which', 'but', 'own', 'does', 'of', 'what', 'why', 'been', 'hers', 'such', 'be', 'this', "doesn't", 'were', 'y', "you'd", 'there', 'any', 'some', "needn't", 'those', 'again', "mustn't", "wouldn't", 'mightn', 'isn', 'he', "you'll", "hasn't", 'the', "couldn't", 'about', 've', "hadn't", 'are', 'after', "didn't", 'am', 'do', 'while', "she's", "shan't", 'when', 'couldn', "should've", 'before', 'these', 'shouldn'

In [10]:
list_of_words = []

for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    words = [word.lower() for word in words if len(word) > 2 and word not in en_stopwords]
    list_of_words.append(words)

In [11]:
print(list_of_words)

[['deepika', 'padukone', 'ranveer', 'singh', 'wedding', 'one', 'biggest', 'bollywood', 'events', 'happened', '2018'], ['the', 'deepika', 'ranveer', 'celebrations', 'hooked', 'phones', 'waiting', 'come', 'also', 'gave', 'enough', 'reason', 'believe', 'stylish', 'two', 'couple'], ['from', 'airport', 'looks', 'reception', 'parties', 'everything', 'entire', 'timeline', 'deepika', 'ranveer', 'wedding', 'style', 'file'], ['not', 'ambanis', 'deepika', 'ranveer', 'priyanka', 'nick'], ['man', 'proves', 'wedding', 'the', 'year', 'this', 'year', 'year', 'big', 'fat', 'lavish', 'extravagant', 'weddings'], ['from', 'isha', 'ambani', 'anand', 'piramal', 'deepika', 'padukone', 'ranveer', 'singh', 'priyanka', 'chopra', 'nick', 'jonas', 'kapil', 'sharma', 'ginni', 'chatrath', '2018', 'saw', 'many', 'grand', 'weddings'], ['but', 'nothing', 'beats', 'man', 'wedding', 'the', 'year', 'award', 'social', 'media'], ['priyanka', 'also', 'shared', 'video', 'featuring', 'nick', 'jonas', 'also', 'celebrating', 't

#### Modeling

In [12]:
from gensim.models import Word2Vec

In [13]:
model = Word2Vec(list_of_words, size = 300, window = 10, min_count = 1)
print(model)

Word2Vec(vocab=115, size=300, alpha=0.025)


In [14]:
words = list(model.wv.vocab)
print(words)

['deepika', 'padukone', 'ranveer', 'singh', 'wedding', 'one', 'biggest', 'bollywood', 'events', 'happened', '2018', 'the', 'celebrations', 'hooked', 'phones', 'waiting', 'come', 'also', 'gave', 'enough', 'reason', 'believe', 'stylish', 'two', 'couple', 'from', 'airport', 'looks', 'reception', 'parties', 'everything', 'entire', 'timeline', 'style', 'file', 'not', 'ambanis', 'priyanka', 'nick', 'man', 'proves', 'year', 'this', 'big', 'fat', 'lavish', 'extravagant', 'weddings', 'isha', 'ambani', 'anand', 'piramal', 'chopra', 'jonas', 'kapil', 'sharma', 'ginni', 'chatrath', 'saw', 'many', 'grand', 'but', 'nothing', 'beats', 'award', 'social', 'media', 'shared', 'video', 'featuring', 'celebrating', 'family', 'first', 'celebrated', 'christmas', 'london', 'pictures', 'new', 'outstanding', 'glimpses', 'celebration', 'verbier', 'switzerland', 'married', 'december', 'three', 'receptions', 'delhi', 'mumbai', 'jaggo', 'night', 'made', 'even', 'special', 'industry', 'friends', 'long', 'time', 'ther

In [15]:
print(model["deepika"].shape)

(300,)
