# Stan Models

In [None]:
import os
import numpy as np
import cmdstanpy
cmdstanpy.install_cmdstan()
from cmdstanpy import cmdstan_path, CmdStanModel
from sklearn.manifold import TSNE
import pickle
import matplotlib.pyplot as plt
!gdown https://drive.google.com/uc?id=1G-FORBywVm4dt7XqlQaf9s2dlvlmCt7f&export=download #downloading data json

In [None]:
#Some necessary files
!gdown https://drive.google.com/uc?id=1dwJMGK1wjG6539qt3-8LXCgKrbUlNaev&export=download
!gdown https://drive.google.com/uc?id=14nSL_wJHt97PmtI92-QQalpNGYwzBy6V&export=download
!gdown https://drive.google.com/uc?id=1q4JXp1HsazO_4vgl4hgiPC2jWVl9wHoE&export=download
!gdown https://drive.google.com/uc?id=16qkjRKtDGzolywt1SNS6OsnR7rNQqdjn&export=download

word_list=pickle.load(open("word_list.pkl", "rb"))
X=pickle.load(open("embedding_matrix.pkl", "rb"))

word_pair = pickle.load(open("word_pairs.pkl", "rb"))
embd_pairs = pickle.load(open("pairs.pkl", "rb"))

**Important**: To run a model from scratch, set the corresponding `run_model_x` flag to `True`; the Stan file will then be downloaded and inference will be performed from scratch. Otherwise, the previously learnt parameters are downloaded instead.

In [None]:
run_model_1 = True
run_model_2 = False
run_model_3 = True #takes about 30 mins to run

if run_model_1 :
    !gdown https://drive.google.com/uc?id=10vXwK-G5TWrU7N67kpNRoDHkCUL4x5XT&export=download
    stan = 'model1.stan'
    model = CmdStanModel(stan_file=stan)
    model.name
    model.stan_file
    model.exe_file
    model.code()

    data = 'data.json'
    variational_vb = model.variational(data=data, output_dir='.',save_diagnostics=True)

    a=[]
    for i in range(1,101):
        a.append(variational_vb.variational_params_dict['a.'+str(i)])
    np.save('a.npy',a)

else:
    !gdown https://drive.google.com/uc?id=1hSCN73ruVDKRRKl0suz4J5W_ECZN1l3b&export=download
a=np.load('a.npy')
    

if run_model_3 :
    !gdown https://drive.google.com/uc?id=1zz2LvKkOud7TrttjNnI4M9051ZjCgmEP&export=download
    stan = 'model3.stan'
    model = CmdStanModel(stan_file=stan)
    model.name
    model.stan_file
    model.exe_file
    model.code()

    data = 'data.json'
    variational_vb = model.variational(data=data, output_dir='.',save_diagnostics=True)

    a = np.zeros((100,100))
    for i in range(1,101):
        for j in range(1,101):
            a[i-1,j-1]=variational_vb.variational_params_dict['a.'+str(i)+'.'+str(j)]
    np.save('a_model3.npy',a)

else:
    !gdown https://drive.google.com/uc?id=1lSGFlhjSLK_vopHCH8dGCiDxEjbqX7IX&export=download
A=np.load('a_model3.npy')

# Threshold-based classification

In [None]:
# Split the pair matrix column-wise: the first 200 columns become the
# features X, and column 200 becomes the target y.
X, y = embd_pairs[:, :200], embd_pairs[:, 200]
# Transpose the model-3 matrix in place of the loaded one.
# NOTE: re-running this cell transposes A again.
A = np.transpose(A)

In [None]:
def _cosine_of_halves(M):
    """Return, per row of M, the cosine similarity between its left and right column halves."""
    half = M.shape[1] // 2
    left, right = M[:, :half], M[:, half:]
    dot_products = (left * right).sum(axis=1)
    norm_products = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
    return dot_products / norm_products


# GloVe: classify a pair as 1 when the cosine similarity of its two halves
# is non-negative, using the untransformed features.
y_ = (_cosine_of_halves(X) >= 0).astype(int)
print('Glove gives an accuracy of', np.sum(y==y_)/len(y))

# Model 1: rescale every coordinate by `a`. The vector is repeated twice
# so that both halves of each row receive the same per-dimension weights.
a_double = np.hstack((a, a))[:, np.newaxis].T
X_new = X * a_double
y__ = (_cosine_of_halves(X_new) >= 0).astype(int)
print('Model 1 gives an accuracy of', np.sum(y==y__)/len(y))

# Model 3: multiply each 100-column half by the matrix A, then re-join
# the two halves side by side.
X_new = np.hstack((np.matmul(X[:, :100], A), np.matmul(X[:, 100:], A)))
y__ = (_cosine_of_halves(X_new) >= 0).astype(int)
print('Model 3 gives an accuracy of', np.sum(y==y__)/len(y))

# t-SNE

In [None]:
# Switch for the t-SNE step: True recomputes the 2-D projections and saves
# them to .npy files; False downloads files instead of recomputing.
run_tsne = False # download already run embeddings if False

In [None]:
def transform_model1(W):
    """Scale the columns of W by the weights stored in 'a.npy'.

    The saved array is read from the current directory on every call,
    turned into a single row, and broadcast-multiplied against W, so
    column j of W is multiplied by entry j of the saved array.
    """
    weights = np.load('a.npy')
    row_of_weights = weights[:, np.newaxis].T
    return W * row_of_weights

def transfrom_model2(W, a_path='a_model2.npy', s_path='s_model2.npy'):
    """Scale the columns of W by the two model-2 weight vectors.

    The original body called ``np.load()`` with no file argument, which
    always raises ``TypeError``; the files to read are now parameters.

    Args:
        W: 2-D array whose columns are rescaled.
        a_path: Path of the .npy file holding the first weight vector.
        s_path: Path of the .npy file holding the second weight vector.
            NOTE(review): both default names are placeholders chosen to
            resemble the other models' file names; no model-2 file name
            appears in this notebook, so confirm or pass explicit paths.

    Returns:
        A tuple ``(W_a, W_s)``: W with its columns multiplied by the first
        and by the second weight vector, respectively.
    """
    a = np.load(a_path)
    s = np.load(s_path)

    # Reshape each 1-D vector into a single row so it broadcasts over the
    # rows of W.
    W_a = W * np.expand_dims(a, axis=1).T
    W_s = W * np.expand_dims(s, axis=1).T

    return W_a, W_s

def transform_model3(W):
    """Right-multiply W by the transpose of the matrix stored in 'a_model3.npy'.

    The matrix is read from the current directory on every call.
    """
    stored = np.load('a_model3.npy')
    return np.matmul(W, stored.T)



if run_tsne:
    np.save('glove_X_embedding.npy',TSNE(n_components=2, learning_rate='auto', init='random').fit_transform(X))
    np.save('model1_X_embedding.npy',TSNE(n_components=2, learning_rate='auto', init='random').fit_transform(transform_model1(X)))
    np.save('model3_X_embedding.npy',TSNE(n_components=2, learning_rate='auto', init='random').fit_transform(transform_model1(X)))
else:
    !gdown https://drive.google.com/uc?id=12a_310_fv73muAfIabFR3vK63YG84PhX&export=download
    !gdown https://drive.google.com/uc?id=1sWX_nyRI66DLWIbPz4WbCeLQhEsY1uif&export=download
    !gdown https://drive.google.com/uc?id=1RMo2q_lETRqjZlEJetJXZU3G3GV-JZbn&export=download

# Read the three saved 2-D projections back from disk.
X_embedded_glove = np.load('glove_X_embedding.npy')
X_embedded_1 = np.load('model1_X_embedding.npy')
X_embedded_3 = np.load('model3_X_embedding.npy')

# Plot order: GloVe first, then model 1, then model 3.
X_embedded_list = [X_embedded_glove, X_embedded_1, X_embedded_3]

indices = [4, 6, 9, 13, 17]  # Indices of pairs to plot

# The selected words do not depend on which projection is drawn, so they
# are resolved once: first members of the chosen pairs, then second
# members, each mapped to its row position via `word_list`.
select_word_pairs = [word_pair[k][0] for k in indices] + [word_pair[k][1] for k in indices]
select_word_indices = [word_list.index(w) for w in select_word_pairs]

for X_embedded in X_embedded_list:
    # Rows of the current projection that belong to the selected words.
    select_word_embeddings = X_embedded[select_word_indices, :]
    xs = select_word_embeddings[:, 0]
    ys = select_word_embeddings[:, 1]

    plt.scatter(xs, ys, linewidths=1, color='blue')
    plt.grid()
    # NOTE(review): the axes are labelled "PC1"/"PC2" although the
    # coordinates are read from the *_X_embedding.npy projection files.
    plt.xlabel("PC1", size=15)
    plt.ylabel("PC2", size=15)
    plt.title("Word Embedding Space", size=20)

    # Label every point with its word, then draw an arrow from the origin
    # to every point.
    for word, point in zip(select_word_pairs, select_word_embeddings):
        plt.annotate(word, xy=(point[0], point[1]))
    for point in select_word_embeddings:
        plt.arrow(0, 0, point[0], point[1])

    plt.show()