In [1]:
'''
Written By: Aadish Joshi
Date: May 04, 2019
Word2Vec model for old scripture Mahabharata
'''

#imports
import multiprocessing
import re
import gensim
import nltk
import pandas as pd
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer



In [2]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
stop_words.add('Chapter')

In [3]:
def preprocess(line):
    line = line.lower()
    line = re.sub("[^a-zA-Z]"," ", line)
    tokens = word_tokenize(line)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens

In [4]:
def read_file(filename):
    tokens = []
    with open(filename) as file:
        for line in file:
            token = preprocess(line)
            tokens.extend(token)
            
    return tokens
tokens = read_file("../input/1-18 books combined.txt")

In [5]:
def NER(tokens,no_tag):
    POS_tokens = nltk.pos_tag(tokens)
    mahabharata_tokens = []
    for i in POS_tokens:
        if i[1] not in no_tag:
            mahabharata_tokens.append(i[0])
    return mahabharata_tokens
            
mahabharata_tokens = NER(tokens,['IN','DT','CD'])

In [6]:
model = gensim.models.Word2Vec(
    [mahabharata_tokens],
    size=108,
    window=3,
    min_count=3,
    workers=10, 
    iter=10)

In [7]:
model.wv.most_similar("krishna")

[('rama', 0.9984303712844849),
 ('king', 0.9983454942703247),
 ('bhishma', 0.9983348250389099),
 ('world', 0.9983336925506592),
 ('time', 0.9983187317848206),
 ('earth', 0.9983071088790894),
 ('life', 0.9982994794845581),
 ('take', 0.9982937574386597),
 ('would', 0.9982885718345642),
 ('lord', 0.9982773065567017)]