In [1]:
import numpy as np
import pandas as pd
from numpy.linalg import norm
from gensim.models import Word2Vec
from gensim.models import FastText
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

In [2]:
def cosine_similarity(vec1,vec2):
    """Return the cosine of the angle between two vectors.

    Both arguments are converted to NumPy arrays, so lists, tuples and
    arrays are all accepted.  The result is the dot product divided by the
    product of the two norms; a zero vector therefore produces a division
    by zero inside NumPy (``nan``/``inf`` with a runtime warning) rather
    than an exception.
    """
    left=np.asarray(vec1)
    right=np.asarray(vec2)
    numerator=np.dot(left,right)
    denominator=norm(left)*norm(right)
    return numerator/denominator

In [3]:
def o2h(text):
    """Transliterate ``text`` from the ORIYA scheme to the DEVANAGARI scheme.

    Thin wrapper around ``indic_transliteration``'s ``transliterate``; the
    similarity cells apply it to every token before the embedding lookup.
    """
    source_scheme=sanscript.ORIYA
    target_scheme=sanscript.DEVANAGARI
    return transliterate(text,source_scheme,target_scheme)

In [4]:
def accuracy(y_pred,y_true):
    """Return the fraction of positions at which ``y_pred`` equals ``y_true``.

    Parameters:
        y_pred: predicted labels (list, tuple, NumPy array, ...).
        y_true: reference labels with the same shape as ``y_pred``.

    Returns:
        A Python float: number of matching positions divided by
        ``len(y_true)``.

    Raises:
        ValueError: if the two inputs do not have the same shape.
        ZeroDivisionError: if ``y_true`` is empty.
    """
    # Coerce first: boolean-mask indexing only works on arrays, so plain
    # Python lists used to fail with a TypeError.
    pred=np.asarray(y_pred)
    true=np.asarray(y_true)
    if pred.shape!=true.shape:
        raise ValueError(
            'y_pred and y_true must have the same shape, got '
            +str(pred.shape)+' and '+str(true.shape)
        )
    matches=int(np.count_nonzero(pred==true))
    return matches/len(true)

In [5]:
def thresholding(sim,threshold):
    """Turn similarity scores into 0/1 labels.

    A score strictly greater than ``threshold`` maps to 1, anything else
    (including a score equal to the threshold) maps to 0.  The labels are
    returned as a NumPy array in the same order as ``sim``.
    """
    return np.array([1 if score>threshold else 0 for score in sim])

In [6]:
# Base directory (relative) for the saved embeddings; the GloVe, CBOW,
# skip-gram and FastText load cells all build their file paths from it.
path='or/50/'

In [7]:
# Word-pair table, read with pandas' default CSV settings.  Later cells use
# its 'Token1', 'Token2' and 'Similar' columns and add result columns to it.
file=pd.read_csv('sim.txt')

In [8]:
# Word pairs as a two-column array: row[0] is 'Token1', row[1] is 'Token2'.
tokens=file[['Token1','Token2']].values
# Reference labels passed to accuracy(); presumably 0/1 flags, since they are
# compared against thresholding() output — verify against sim.txt.
truth=file['Similar'].values

# GloVe

In [9]:
# Build `glove`: a dict mapping each word to its vector (a list of floats).
# Every line of the text file is split on single spaces; the first field is
# the word and the remaining fields are the vector components.
glove={}
# The context manager closes the handle as soon as parsing finishes.
with open(path+"glove/or-d50-glove.txt","r",encoding='utf-8') as gloveFile:
    for line in gloveFile:
        # Strip the newline / trailing blanks so they cannot end up inside a
        # key or as an unparsable empty component.
        fields=line.rstrip().split(" ")
        if len(fields)<2:
            # Blank line or a word without any vector: nothing to store.
            continue
        glove[fields[0]]=list(map(float,fields[1:]))

In [10]:
# Cosine similarity of every word pair using the GloVe table.  Both words
# are transliterated with o2h() before the lookup; a word missing from the
# `glove` dict raises KeyError.
simGlove=[
    cosine_similarity(glove[o2h(pair[0])],glove[o2h(pair[1])])
    for pair in tokens
]

In [11]:
# Store each GloVe cosine similarity multiplied by 10; the CSV export cell
# writes this column out under the header 'Similarity Score'.
file['Glove Similarity']=[i*10 for i in simGlove]

In [12]:
# One 0/1 label column per cutoff: 'Glove 4' holds the labels for a cutoff
# of 0.4 on the raw (unscaled) similarity, 'Glove 5' for 0.5, and so on.
for suffix,cutoff in ((4,0.4),(5,0.5),(6,0.6),(7,0.7),(8,0.8)):
    file['Glove '+str(suffix)]=thresholding(simGlove,cutoff)

# CBOW

In [13]:
# Load a saved gensim Word2Vec model from <path>cbow/.  Presumably trained
# with the CBOW objective, judging by the file name — not verifiable here.
cbow = Word2Vec.load(path+"cbow/or-d50-m2-cbow.model")

In [14]:
# Cosine similarity of every word pair using the vectors in `cbow.wv`;
# both words are transliterated with o2h() before the lookup.
simCbow=[
    cosine_similarity(cbow.wv[o2h(pair[0])],cbow.wv[o2h(pair[1])])
    for pair in tokens
]

In [15]:
# Store each CBOW cosine similarity multiplied by 10; the CSV export cell
# writes this column out under the header 'Similarity Score'.
file['Cbow Similarity']=[i*10 for i in simCbow]

In [16]:
# One 0/1 label column per cutoff: 'Cbow 4' holds the labels for a cutoff
# of 0.4 on the raw (unscaled) similarity, 'Cbow 5' for 0.5, and so on.
for suffix,cutoff in ((4,0.4),(5,0.5),(6,0.6),(7,0.7),(8,0.8)):
    file['Cbow '+str(suffix)]=thresholding(simCbow,cutoff)

# Skip-gram

In [17]:
# Load a saved gensim Word2Vec model from <path>sg/.  Presumably trained
# with the skip-gram objective, judging by the file name — not verifiable here.
sg = Word2Vec.load(path+"sg/or-d50-m2-sg.model")

In [18]:
# Cosine similarity of every word pair using the vectors in `sg.wv`;
# both words are transliterated with o2h() before the lookup.
simSg=[
    cosine_similarity(sg.wv[o2h(pair[0])],sg.wv[o2h(pair[1])])
    for pair in tokens
]

In [19]:
# Skip-gram result columns: the similarity multiplied by 10, followed by one
# 0/1 label column per cutoff ('Sg 4' <-> 0.4 on the raw similarity, ...).
file['Sg Similarity']=[score*10 for score in simSg]
for suffix,cutoff in ((4,0.4),(5,0.5),(6,0.6),(7,0.7),(8,0.8)):
    file['Sg '+str(suffix)]=thresholding(simSg,cutoff)

# FastText

In [20]:
# Load a saved gensim FastText model from <path>fasttext/.
ft = FastText.load(path+"fasttext/or-d50-m2-fasttext.model")

In [21]:
# Cosine similarity of every word pair using the vectors in `ft.wv`;
# both words are transliterated with o2h() before the lookup.
simFt=[
    cosine_similarity(ft.wv[o2h(pair[0])],ft.wv[o2h(pair[1])])
    for pair in tokens
]

In [22]:
# FastText result columns: the similarity multiplied by 10, followed by one
# 0/1 label column per cutoff ('Ft 4' <-> 0.4 on the raw similarity, ...).
file['Ft Similarity']=[score*10 for score in simFt]
for suffix,cutoff in ((4,0.4),(5,0.5),(6,0.6),(7,0.7),(8,0.8)):
    file['Ft '+str(suffix)]=thresholding(simFt,cutoff)

# Testing

In [23]:
# Write the GloVe accuracy table: one row per cutoff, giving the share of
# word pairs whose thresholded label equals the reference label in `truth`.
thresholds=[0.4,0.5,0.6,0.7,0.8]
# `with` closes (and flushes) the report even if scoring raises mid-loop.
# The header text is kept byte-for-byte, including the 'Threhold' spelling,
# so anything that reads the existing reports is unaffected.
with open('output/50/accuracy/glove.txt','w') as result:
    result.write('---------------GLOVE------------------\n')
    result.write('Threhold          Accuracy\n')
    for threshold in thresholds:
        score=accuracy(truth,thresholding(simGlove,threshold))
        result.write(str(threshold)+'          '+str(score)+'\n')

In [24]:
# Write the CBOW accuracy table: one row per cutoff, giving the share of
# word pairs whose thresholded label equals the reference label in `truth`.
thresholds=[0.4,0.5,0.6,0.7,0.8]
# `with` closes (and flushes) the report even if scoring raises mid-loop.
# The header text is kept byte-for-byte, including the 'Threhold' spelling,
# so anything that reads the existing reports is unaffected.
with open('output/50/accuracy/Cbow.txt','w') as result:
    result.write('---------------Cbow------------------\n')
    result.write('Threhold          Accuracy\n')
    for threshold in thresholds:
        score=accuracy(truth,thresholding(simCbow,threshold))
        result.write(str(threshold)+'          '+str(score)+'\n')

In [25]:
# Write the skip-gram accuracy table: one row per cutoff, giving the share of
# word pairs whose thresholded label equals the reference label in `truth`.
thresholds=[0.4,0.5,0.6,0.7,0.8]
# `with` closes (and flushes) the report even if scoring raises mid-loop.
# The header text is kept byte-for-byte, including the 'Threhold' spelling,
# so anything that reads the existing reports is unaffected.
with open('output/50/accuracy/Sg.txt','w') as result:
    result.write('---------------Skip Gram------------------\n')
    result.write('Threhold          Accuracy\n')
    for threshold in thresholds:
        score=accuracy(truth,thresholding(simSg,threshold))
        result.write(str(threshold)+'          '+str(score)+'\n')

In [26]:
# Write the FastText accuracy table: one row per cutoff, giving the share of
# word pairs whose thresholded label equals the reference label in `truth`.
thresholds=[0.4,0.5,0.6,0.7,0.8]
# `with` closes (and flushes) the report even if scoring raises mid-loop.
# The header text is kept byte-for-byte, including the 'Threhold' spelling,
# so anything that reads the existing reports is unaffected.
with open('output/50/accuracy/Ft.txt','w') as result:
    result.write('---------------FastText------------------\n')
    result.write('Threhold          Accuracy\n')
    for threshold in thresholds:
        score=accuracy(truth,thresholding(simFt,threshold))
        result.write(str(threshold)+'          '+str(score)+'\n')

In [27]:
# Export one CSV per cutoff for GloVe: the word pair, the similarity column
# (cosine multiplied by 10), the reference label and that cutoff's 0/1 label,
# written under presentation headers and without the index.
thresholds=[4,5,6,7,8]
for level in thresholds:
    label_column='Glove '+str(level)
    export=file[['Token1','Token2','Glove Similarity','Similar',label_column]].rename(
        columns={
            'Token1':'Word 1',
            'Token2':'Word 2',
            'Glove Similarity':'Similarity Score',
            'Similar':'Ground Truth',
            label_column:'Label',
        }
    )
    export.to_csv('output/50/Q1_Glove50_'+str(level)+'.csv',index=False)

In [28]:
# Export one CSV per cutoff for CBOW: the word pair, the similarity column
# (cosine multiplied by 10), the reference label and that cutoff's 0/1 label,
# written under presentation headers and without the index.
thresholds=[4,5,6,7,8]
for level in thresholds:
    label_column='Cbow '+str(level)
    export=file[['Token1','Token2','Cbow Similarity','Similar',label_column]].rename(
        columns={
            'Token1':'Word 1',
            'Token2':'Word 2',
            'Cbow Similarity':'Similarity Score',
            'Similar':'Ground Truth',
            label_column:'Label',
        }
    )
    export.to_csv('output/50/Q1_Cbow50_'+str(level)+'.csv',index=False)

In [29]:
# Export one CSV per cutoff for skip-gram: the word pair, the similarity
# column (cosine multiplied by 10), the reference label and that cutoff's
# 0/1 label, written under presentation headers and without the index.
thresholds=[4,5,6,7,8]
for level in thresholds:
    label_column='Sg '+str(level)
    export=file[['Token1','Token2','Sg Similarity','Similar',label_column]].rename(
        columns={
            'Token1':'Word 1',
            'Token2':'Word 2',
            'Sg Similarity':'Similarity Score',
            'Similar':'Ground Truth',
            label_column:'Label',
        }
    )
    export.to_csv('output/50/Q1_SkipGram50_'+str(level)+'.csv',index=False)

In [30]:
# Export one CSV per cutoff for FastText: the word pair, the similarity
# column (cosine multiplied by 10), the reference label and that cutoff's
# 0/1 label, written under presentation headers and without the index.
thresholds=[4,5,6,7,8]
for level in thresholds:
    label_column='Ft '+str(level)
    export=file[['Token1','Token2','Ft Similarity','Similar',label_column]].rename(
        columns={
            'Token1':'Word 1',
            'Token2':'Word 2',
            'Ft Similarity':'Similarity Score',
            'Similar':'Ground Truth',
            label_column:'Label',
        }
    )
    export.to_csv('output/50/Q1_FastText50_'+str(level)+'.csv',index=False)