In [1]:
import os

import pandas as pd
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# Creating L1

In [2]:
import codecs,string
def is_hindi(character):
    """Tell whether the highest code point of a string is Devanagari.

    Only the maximum character is inspected, so a string that mixes Latin
    and Devanagari characters is still reported as Hindi, while a string
    containing any code point above U+097F is not.

    Parameters
    ----------
    character : str
        Text to classify. Despite the name, callers pass whole strings.

    Returns
    -------
    bool
        True when ``max(character)`` lies in U+0900..U+097F (inclusive);
        False otherwise, including for an empty string.
    """
    if not character:
        # max() raises ValueError on an empty sequence; there is nothing to
        # classify, so report "not Hindi" instead of propagating the error.
        return False
    return u'\u0900' <= max(character) <= u'\u097f'

In [3]:
# Load the English polarity lexicon: tab-separated, no header row, so the two
# column names are supplied explicitly.
english_BingLiu = pd.read_csv(
    'BingLiu.csv',
    sep='\t',
    header=None,
    names=['english', 'polarity'],
)
english_BingLiu.head()

Unnamed: 0,english,polarity
0,a+,positive
1,abound,positive
2,abounds,positive
3,abundance,positive
4,abundant,positive


In [4]:
# Build an English -> Hindi lookup from lines of the form "english ||| hindi".
# The `with` block closes the file even if a line fails to parse.
english_hindi_dictionary = {}
with open('english-hindi-dictionary.txt', 'r', encoding="utf8") as dictionary_file:
    for line in dictionary_file:
        fields = line.strip('\n').split(' ||| ')
        if len(fields) < 2:
            # Blank or malformed line without the ' ||| ' separator.
            continue
        english, hindi = fields[0], fields[1]
        if not hindi:
            # An empty translation cannot be classified: is_hindi() takes
            # max() of its argument, which fails on an empty string.
            continue
        # Keep only the first translation seen for each English word, and
        # only when is_hindi() accepts it.
        if english not in english_hindi_dictionary and is_hindi(hindi):
            english_hindi_dictionary[english] = hindi

In [5]:
# Number of English words that were given a Hindi translation.
len(english_hindi_dictionary)

16499

In [6]:
# Attach the Hindi translation of every lexicon word. Words that are missing
# from the dictionary get NaN, which a later dropna() uses to discard them.
#
# Series.map does the lookup in one vectorised step. Assigning row by row
# through chained indexing (english_BingLiu['hindi'][index] = ...) raises
# SettingWithCopyWarning and has no effect once pandas copy-on-write is on.
english_BingLiu["hindi"] = (
    english_BingLiu["english"].map(english_hindi_dictionary).astype(object)
)

## L1

In [7]:
print("L1:")
# Seed lexicon: the rows of english_BingLiu that have no missing value.
# reset_index() is called without drop=True, so the original row labels are
# kept as an extra "index" column.
L1 = english_BingLiu.dropna().reset_index()
L1

L1:


Unnamed: 0,index,english,polarity,hindi
0,4,abundant,positive,हुस्न
1,6,accessible,positive,सुलभ
2,14,accomplish,positive,पूरा
3,15,accomplished,positive,पूरा हुआ
4,16,accomplishment,positive,उपलब्धि
5,17,accomplishments,positive,उपलब्धियों
6,18,accurate,positive,सटीक
7,19,accurately,positive,यथासंभव
8,21,achievement,positive,उपलब्धि
9,22,achievements,positive,उपलब्धियाँ


# Data preprocessing

In [8]:
# Tokenise the English corpus: one list of tokens per line of english.txt.
# The `with` block closes the file even if tokenisation raises.
# NOTE(review): no encoding is passed, so the platform default is used, while
# the Hindi corpus and the dictionary are opened with encoding="utf8" --
# confirm whether english.txt should be read as utf8 as well.
with open('english.txt', 'r') as corpus_file:
    english_sentences = [word_tokenize(line) for line in corpus_file]

In [9]:
# Normalise every token to lower case, then preview the first sentence.
english_sentences = [
    [token.lower() for token in sentence]
    for sentence in english_sentences
]
english_sentences[:1]

[['judging',
  'from',
  'previous',
  'posts',
  'this',
  'used',
  'to',
  'be',
  'a',
  'good',
  'place',
  ',',
  'but',
  'not',
  'any',
  'longer',
  '.']]

In [10]:
# Tokenise the Hindi corpus: one list of tokens per line of hindi.txt.
# The `with` block closes the file even if tokenisation raises.
with open('hindi.txt', 'r', encoding="utf8") as corpus_file:
    hindi_sentences = [word_tokenize(line) for line in corpus_file]

In [11]:
# Preview the first tokenised Hindi sentence.
hindi_sentences[:1]

[['फेसबुक',
  'का',
  'सिक्योरिटी',
  'चैकअप',
  'फीचर',
  'पॉपअप',
  'की',
  'तरह',
  'यूजर्स',
  'को',
  'दिखाइ',
  'देगा',
  '।']]

# Word2Vec

## Single Word2vec
Training a single English/Hindi Word2Vec model pair results in only 1-3 additions to L1.

In [15]:
# Train 100-dimensional English embeddings on the tokenised corpus with a
# context window of 5; min_count=1 means no token is dropped for being rare.
word2vec_english_model = Word2Vec(
    sentences=english_sentences,
    size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [13]:
# Save the trained English model to disk.
word2vec_english_model.save("word2vec_english_model.model")

In [16]:
# Train 100-dimensional Hindi embeddings on the tokenised corpus with a
# context window of 5; min_count=1 means no token is dropped for being rare.
word2vec_hindi_model = Word2Vec(
    sentences=hindi_sentences,
    size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [15]:
# Save the trained Hindi model to disk.
word2vec_hindi_model.save("word2vec_hindi_model.model")

In [38]:
# Start the Word2Vec-extended lexicon from the rows of english_BingLiu that
# have no missing value; the original row labels are kept.
L1_w2v = english_BingLiu.dropna()
L1_w2v.shape

(2113, 3)

In [35]:
# Grow L1_w2v. For every lexicon entry, take the 5 nearest neighbours of its
# English word and of its Hindi word; whenever the dictionary translates an
# English neighbour into one of the Hindi neighbours, add that pair with the
# entry's polarity. Added pairs are queued as well, so they get expanded too.
#
# New rows are collected and concatenated once at the end: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame for every added row.
# A set of known English words replaces rebuilding a list of the column for
# every candidate pair.
pending = list(
    L1_w2v[['english', 'hindi', 'polarity']].itertuples(index=False, name=None)
)
known_english = set(L1_w2v['english'])
added_rows = []
position = 0
while position < len(pending):
    english_word, hindi_word, polarity = pending[position]
    position += 1

    try:
        english_closest = word2vec_english_model.wv.most_similar(positive=[english_word], topn=5)
        hindi_closest = word2vec_hindi_model.wv.most_similar(positive=[hindi_word], topn=5)
    except KeyError:
        # One of the two words is missing from its model's vocabulary.
        continue

    hindi_neighbours = {elt[0] for elt in hindi_closest}
    for elt in english_closest:
        eng = elt[0]
        hin = english_hindi_dictionary.get(eng)
        if hin is not None and hin in hindi_neighbours and eng not in known_english:
            print("Added new row in L1_w2v")
            known_english.add(eng)
            added_rows.append({'english': eng, 'polarity': polarity, 'hindi': hin})
            pending.append((eng, hin, polarity))

if added_rows:
    # ignore_index=True renumbers the rows 0..n-1, as the per-row append did.
    L1_w2v = pd.concat([L1_w2v, pd.DataFrame(added_rows)], ignore_index=True)

Added new row in L1_w2v


In [36]:
print("Additions in L1_w2v:")
# L1_w2v was seeded with english_BingLiu.dropna(), so every row past that
# length is an addition. Deriving the offset avoids hard-coding the seed size.
L1_w2v.iloc[len(english_BingLiu.dropna()):]

Additions in L1_w2v:


Unnamed: 0,english,polarity,hindi
2113,in,positive,में


## Multiple Word2vec

In [37]:
# Reset L1_w2v to the seed rows (rows of english_BingLiu with no missing
# value), discarding any additions made to it so far.
L1_w2v = english_BingLiu.dropna()
L1_w2v.shape

(2113, 3)

In [39]:
def addToLexicon(eng_model, hind_model, L1, dictionary=None, topn=5):
    """Extend a bilingual polarity lexicon using embedding neighbours.

    For every lexicon entry the ``topn`` nearest neighbours of its English
    word and of its Hindi word are looked up. When the dictionary translates
    an English neighbour into one of the Hindi neighbours, and that English
    word is not in the lexicon yet, the pair is added with the polarity of
    the entry it came from. Added pairs are processed in turn, so the
    expansion continues until no new pair is found.

    Parameters
    ----------
    eng_model, hind_model
        Objects exposing ``.wv.most_similar(positive=[word], topn=...)`` that
        returns ``(word, score)`` pairs and raises ``KeyError`` for words
        outside the vocabulary.
    L1 : pandas.DataFrame
        Lexicon with ``english``, ``hindi`` and ``polarity`` columns. It is
        not modified.
    dictionary : dict, optional
        English -> Hindi mapping. Defaults to the module-level
        ``english_hindi_dictionary``.
    topn : int, optional
        Number of neighbours taken on each side (default 5).

    Returns
    -------
    pandas.DataFrame
        ``L1`` itself when nothing was added; otherwise a new frame with the
        additions appended and the index renumbered from 0.
    """
    if dictionary is None:
        dictionary = english_hindi_dictionary

    # Work queue of (english, hindi, polarity); additions are pushed onto it
    # so that they are expanded as well.
    pending = list(
        L1[['english', 'hindi', 'polarity']].itertuples(index=False, name=None)
    )
    # A set gives O(1) duplicate checks; rebuilding a list of the column for
    # every candidate pair was quadratic.
    known_english = set(L1['english'])
    added_rows = []
    position = 0
    while position < len(pending):
        english_word, hindi_word, polarity = pending[position]
        position += 1

        try:
            english_closest = eng_model.wv.most_similar(positive=[english_word], topn=topn)
            hindi_closest = hind_model.wv.most_similar(positive=[hindi_word], topn=topn)
        except KeyError:
            # One of the two words is missing from its model's vocabulary.
            continue

        hindi_neighbours = {elt[0] for elt in hindi_closest}
        for elt in english_closest:
            eng = elt[0]
            hin = dictionary.get(eng)
            if hin is not None and hin in hindi_neighbours and eng not in known_english:
                known_english.add(eng)
                added_rows.append({'english': eng, 'polarity': polarity, 'hindi': hin})
                pending.append((eng, hin, polarity))

    if not added_rows:
        return L1
    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # copying the whole frame once per added row.
    return pd.concat([L1, pd.DataFrame(added_rows)], ignore_index=True)

In [16]:
# Retrain both models for every context window size from 2 to 18 and let each
# model pair contribute additions to the same, growing L1_w2v.
# The models are saved under models/, so make sure that directory exists.
os.makedirs("models", exist_ok=True)
for i in tqdm(range(2,19)):
    word2vec_english_model = Word2Vec(sentences=english_sentences, size=100, window=i, min_count=1, workers=4)
    word2vec_hindi_model = Word2Vec(sentences=hindi_sentences, size=100, window=i, min_count=1, workers=4)
    word2vec_english_model.save("models/word2vec_english_model_ws_"+str(i)+".model")
    word2vec_hindi_model.save("models/word2vec_hindi_model_ws_"+str(i)+".model")
    rows_before = L1_w2v.shape[0]
    L1_w2v = addToLexicon(word2vec_english_model, word2vec_hindi_model, L1_w2v)
    rows_after = L1_w2v.shape[0]
    print("Window size:", i, "Additions:", rows_after-rows_before)
# Display the lexicon this cell actually extends. `L1` is the untouched seed
# table and never changes inside the loop.
L1_w2v

  6%|▌         | 1/17 [00:05<01:35,  5.99s/it]Window size: 2 Additions: 3
 12%|█▏        | 2/17 [00:11<01:27,  5.82s/it]Window size: 3 Additions: 2
 18%|█▊        | 3/17 [00:16<01:20,  5.73s/it]Window size: 4 Additions: 3
 24%|██▎       | 4/17 [00:22<01:14,  5.70s/it]Window size: 5 Additions: 3
 29%|██▉       | 5/17 [00:27<01:07,  5.60s/it]Window size: 6 Additions: 2
 35%|███▌      | 6/17 [00:33<01:02,  5.65s/it]Window size: 7 Additions: 1
 41%|████      | 7/17 [00:39<00:56,  5.64s/it]Window size: 8 Additions: 3
 47%|████▋     | 8/17 [00:44<00:50,  5.64s/it]Window size: 9 Additions: 3
 53%|█████▎    | 9/17 [00:50<00:45,  5.69s/it]Window size: 10 Additions: 2
 59%|█████▉    | 10/17 [00:56<00:39,  5.70s/it]Window size: 11 Additions: 1
 65%|██████▍   | 11/17 [01:02<00:34,  5.70s/it]Window size: 12 Additions: 0
 71%|███████   | 12/17 [01:07<00:28,  5.69s/it]Window size: 13 Additions: 1
 76%|███████▋  | 13/17 [01:13<00:22,  5.62s/it]Window size: 14 Additions: 3
 82%|████████▏ | 14/17 [01:18

Unnamed: 0,english,polarity,hindi
0,abundant,positive,हुस्न
1,accessible,positive,सुलभ
2,accomplish,positive,पूरा
3,accomplished,positive,पूरा हुआ
4,accomplishment,positive,उपलब्धि
5,accomplishments,positive,उपलब्धियों
6,accurate,positive,सटीक
7,accurately,positive,यथासंभव
8,achievement,positive,उपलब्धि
9,achievements,positive,उपलब्धियाँ


In [39]:
print("Words added to L1:")
# L1_w2v was seeded with english_BingLiu.dropna(), so every row past that
# length is an addition. Deriving the offset avoids hard-coding the seed size.
L1_w2v.iloc[len(english_BingLiu.dropna()):]

Words added to L1:


Unnamed: 0,english,polarity,hindi
2113,too,positive,भी
2114,usb,positive,यूएसबी
2115,much,positive,ज्यादा
2116,here,positive,यहां
2117,or,positive,या
2118,because,positive,क्योंकि
2119,no,negative,कोई
2120,that,positive,वह
2121,from,positive,से
2122,particular,positive,विशेष


In [36]:
# Write the extended lexicon twice: in insertion order and sorted by English
# word. The files go under results/, so make sure that directory exists.
os.makedirs('results', exist_ok=True)
L_save = L1_w2v.sort_values('english')
L1_w2v.to_csv('results/L1_w2v.csv', columns=['english', 'hindi', 'polarity'], index=False)
L_save.to_csv('results/L1_w2v_sorted.csv', columns=['english', 'hindi', 'polarity'], index=False)
L1_w2v.shape

(2143, 3)

# Glove

In [23]:
# Start the GloVe-extended lexicon from the rows of english_BingLiu that have
# no missing value.
L1_glove = english_BingLiu.dropna()
L1_glove.shape

(2113, 3)

In [21]:
# Convert the English GloVe vectors to word2vec text format, then load them.
glove_file = 'english_glove_vectors.txt'
tmp_file = "glove_english_model.txt"

_ = glove2word2vec(glove_file, tmp_file)
# Load from the same path variable the converter wrote to, so the two steps
# cannot drift apart if the file name is ever changed.
glove_english_model = KeyedVectors.load_word2vec_format(tmp_file)

In [26]:
# Save the loaded English GloVe vectors to disk.
glove_english_model.save("glove_english_model.model")

In [22]:
# Convert the Hindi GloVe vectors to word2vec text format, then load them.
glove_file = 'hindi_glove_vectors.txt'
tmp_file = "glove_hindi_model.txt"

_ = glove2word2vec(glove_file, tmp_file)
# Load from the same path variable the converter wrote to, so the two steps
# cannot drift apart if the file name is ever changed.
glove_hindi_model = KeyedVectors.load_word2vec_format(tmp_file)

In [27]:
# Save the loaded Hindi GloVe vectors to disk.
glove_hindi_model.save("glove_hindi_model.model")

In [24]:
# Grow L1_glove. For every lexicon entry, take the 5 nearest neighbours of its
# English word and of its Hindi word in the GloVe vectors; whenever the
# dictionary translates an English neighbour into one of the Hindi neighbours,
# add that pair with the entry's polarity. Added pairs are queued as well, so
# they get expanded too.
#
# New rows are collected and concatenated once at the end: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame for every added row.
# A set of known English words replaces rebuilding a list of the column for
# every candidate pair.
pending = list(
    L1_glove[['english', 'hindi', 'polarity']].itertuples(index=False, name=None)
)
known_english = set(L1_glove['english'])
added_rows = []
position = 0
while position < len(pending):
    english_word, hindi_word, polarity = pending[position]
    position += 1

    try:
        english_closest = glove_english_model.most_similar(positive=[english_word], topn=5)
        hindi_closest = glove_hindi_model.most_similar(positive=[hindi_word], topn=5)
    except KeyError:
        # One of the two words is missing from its model's vocabulary.
        continue

    hindi_neighbours = {elt[0] for elt in hindi_closest}
    for elt in english_closest:
        eng = elt[0]
        hin = english_hindi_dictionary.get(eng)
        if hin is not None and hin in hindi_neighbours and eng not in known_english:
            known_english.add(eng)
            added_rows.append({'english': eng, 'polarity': polarity, 'hindi': hin})
            pending.append((eng, hin, polarity))
            print("Addition to L1")

if added_rows:
    # ignore_index=True renumbers the rows 0..n-1, as the per-row append did.
    L1_glove = pd.concat([L1_glove, pd.DataFrame(added_rows)], ignore_index=True)

Addition to L1


In [25]:
print("Words added to L1:")
# L1_glove was seeded with english_BingLiu.dropna(), so every row past that
# length is an addition. Deriving the offset avoids hard-coding the seed size.
L1_glove.iloc[len(english_BingLiu.dropna()):]

Words added to L1:


Unnamed: 0,english,polarity,hindi
2113,on,positive,पर


In [28]:
# Write the GloVe-extended lexicon twice: in insertion order and sorted by
# English word. The files go under results/, so make sure that directory exists.
os.makedirs('results', exist_ok=True)
L_save = L1_glove.sort_values('english')
L1_glove.to_csv('results/L1_glove.csv', columns=['english', 'hindi', 'polarity'], index=False)
L_save.to_csv('results/L1_glove_sorted.csv', columns=['english', 'hindi', 'polarity'], index=False)
L1_glove.shape

(2114, 3)