In [1]:
import numpy as np
import pandas as pd
import os
import re

import torch
import nltk
import pickle
import itertools

from sklearn.neighbors import NearestNeighbors
from collections import Counter

from gensim.models import KeyedVectors
# import gensim.downloader

from utils.functions import compute_scores

In [2]:
model_class = "glove"
num_nearest_neighbours = 3

if model_class == "skip-gram":
    model = KeyedVectors.load("/home/amitgajbhiye/cardiff_work/word2vec-word2vec-google-news-300/w2v_google_news_300")
elif model_class == "glove":
    model = KeyedVectors.load("/home/amitgajbhiye/cardiff_work/glove_word_vectors/glove-wiki-gigaword-300")
    
print (f"Working with ** {model_class} ** vectors...")

Working with ** glove ** vectors...


In [3]:
mcrae_train_df = pd.read_csv("data/evaluation_data/extended_mcrae/train_mcrae.tsv", sep="\t", names=["concept", "property", "label"])
mcrae_test_df = pd.read_csv("data/evaluation_data/extended_mcrae/test_mcrae.tsv", sep="\t", names=["concept", "property", "label"])

print ("McRae Train Df size : ", mcrae_train_df.shape)
print (mcrae_train_df.head())

print ()

print ("McRae Test Df size : ", mcrae_test_df.shape)
print (mcrae_test_df)


McRae Train Df size :  (19258, 3)
      concept          property  label
0      wrench           squishy      0
1  teddy_bear           fragile      0
2  microscope  used by children      1
3      shovel  used for killing      0
4        wall    found on walls      1

McRae Test Df size :  (4813, 3)
         concept          property  label
0         onions    used for music      0
1          pizza             shiny      0
2     motorcycle   eaten in summer      0
3       sailboat              cold      0
4           lime      worn on feet      0
...          ...               ...    ...
4808   snowboard              tall      0
4809        veil           squishy      0
4810    sailboat      light weight      0
4811  skateboard  used for cooking      0
4812    mandarin             a toy      0

[4813 rows x 3 columns]


In [4]:
def create_word_2_vector_dict(df_series):
    
    name_2_vector = {}

    for name in df_series.unique():
        splited_name = re.split('\s|_', name.strip())
        l = []
        for word in splited_name:
            try :
                l.append(model[word])
            except KeyError:
                print (f"{word} not found in {model_class}... initialsing with zeros")
                l.append(np.zeros(300,))
        
        avg = np.average(l, axis=0) # avergaing the vectors of the words in the name
        avg = avg / np.linalg.norm(avg) # normalising the vector 
        
        name_2_vector[name] = np.average(l, axis=0) 
    
    return name_2_vector



In [5]:
train_test_df = pd.concat([mcrae_train_df, mcrae_test_df])

print (train_test_df.shape)
train_test_df.head()

(24071, 3)


Unnamed: 0,concept,property,label
0,wrench,squishy,0
1,teddy_bear,fragile,0
2,microscope,used by children,1
3,shovel,used for killing,0
4,wall,found on walls,1


In [6]:
con2vec = create_word_2_vector_dict(train_test_df["concept"])

In [7]:
prop2vec = create_word_2_vector_dict(train_test_df["property"])

In [8]:
print (len(con2vec.keys()))
# print (con2vec.keys())

514


In [9]:
print (len(prop2vec.keys()))
# print (prop2vec.keys())

50


In [10]:
def predict_label(train_cons_similar_to_test, train_props_similar_to_test, train_df, test_df):

    preds = []

    for index, row in test_df.iterrows():
        
        # print ()
        # print ("Index :", index)
        test_con, test_prop, test_label = row["concept"], row["property"], row["label"]
        
        train_similar_props = train_props_similar_to_test.get(test_prop)
        
        assert train_similar_props is not None, "No Train Similar Properties for the Test Property"

#         print ("Test Data :", index, test_con, test_prop, test_label)
        
#         print ("Properties Similar to test property in Train File")
#         print (train_similar_props)
        
        # print ("***************** Concept Processing Starts ***************** ")

        train_similar_concepts = train_cons_similar_to_test.get(test_con)
        
        assert train_similar_concepts is not None, "No Train Similar Concepts for the Test Concept"

        # print ("Concepts Similar to test concept in Train File")
        # print (train_similar_concepts)
        
        combination = list(itertools.product(train_similar_concepts, train_similar_props))
        
        # print ("Combination of Similar Concept and Similar Property")
        # print (combination)
        
        label_list = []
        
        for con, prop in combination:
            df = train_df.loc[(train_df["concept"] == con) & (train_df["property"] == prop) & (train_df["label"] == 1)]
            # print (f"Dataframe {con} : {prop} is Empty {df.empty}")
        
            if df.empty:
                label_list.append(0)
            else:
                label_list.append(1)
                
        # print ("label_list")
        # print (label_list)
        
        if num_nearest_neighbours == 3:
            threshold = 5
        elif num_nearest_neighbours == 1:
            threshold = 1
            
        label_sum = sum(label_list)
        
        # print ("label_sum :", label_sum)
        # print ("threshold :", threshold)
        
        if label_sum >= threshold:
            test_pred = 1
        else:
            test_pred = 0
        
        # print ("test_pred :", test_pred)
        
        preds.append(test_pred)
    
    return preds


In [11]:

# local path for Con Prop split:
local_base_path = "/home/amitgajbhiye/cardiff_work/dot_product_model_nn_analysis/mcrae_train_test_embeddings/mcrae_con_prop_split_train_test_files"

all_gold_labels, all_preds  = [], []

for x in range(9):
    
    print (f"For Fold {x}")
    train_file_path = os.path.join(local_base_path, f"{x}_train_con_prop_split_con_prop.pkl")
    test_file_path = os.path.join(local_base_path, f"{x}_test_con_prop_split_con_prop.pkl")
    
    print (train_file_path)
    print (test_file_path)
    print ()
    
    with open (train_file_path, "rb") as train_file, open (test_file_path, "rb") as test_file:
        train_df = pickle.load(train_file)
        test_df = pickle.load(test_file)
    
    print (f"Train Df shape : {train_df.shape}, {train_df.columns}")
    print (f"Test Df shape : {test_df.shape}, {test_df.columns}")
    
    train_concept = train_df["concept"].unique()
    
    name_train_concept = train_concept
    vectors_train_concept = [con2vec[con] for con in name_train_concept]
    
    print (f"num train concept : {len(name_train_concept)}")
    print (f"vector train concept : {len(vectors_train_concept)}")
    
    train_prop = train_df["property"].unique()
    
    name_train_prop = train_prop
    vectors_train_prop = [prop2vec[prop] for prop in name_train_prop]
    
    print (f"num train property : {len(name_train_prop)}")
    print (f"vector train property : {len(vectors_train_prop)}")

    test_concept = test_df["concept"].unique()
    
    name_test_concept = test_concept
    vectors_test_concept = [con2vec[con] for con in name_test_concept]
    
    print (f"num test concept : {len(name_test_concept)}")
    print (f"vector test concept : {len(vectors_test_concept)}")
    
    test_prop = test_df["property"].unique()
    
    name_test_prop = test_prop
    vectors_test_prop = [prop2vec[prop] for prop in name_test_prop]
    
    print (f"num test property : {len(name_test_prop)}")
    print (f"vector test property : {len(vectors_test_prop)}")
    
    print ()
    
    print (f"#Unique Train Concepts : {len(train_concept)}")
    print (f"#Unique Train Property : {len(train_prop)}")
    
    print ()
    print (f"#Unique Test Concepts : {len(test_concept)}")
    print (f"#Unique Test Property : {len(test_prop)}")
    
    print()
    print (f"Concept Intersection : {len(set(train_concept).intersection(test_concept))}")
    print (f"Property Intersection : {len(set(train_prop).intersection(test_prop))}")
    print ()

    # print (" **************************** Concept Nearest Neighbours ****************************")
    
    train_con_nbrs = NearestNeighbors(n_neighbors=num_nearest_neighbours, algorithm='brute', metric='euclidean').fit(np.array(vectors_train_concept))
    con_test_distances, con_test_indices = train_con_nbrs.kneighbors(np.array(vectors_test_concept))
    
    train_cons_similar_to_test = {}
    
    for idx, con in zip(con_test_indices, name_test_concept):

        train_cons_similar_to_test[con] = [name_train_concept[con_id] for con_id in idx]
        # print (f"{con} : {train_cons_similar_to_test[con]}")
    
    
    # print (" **************************** property Nearest Neighbours ****************************")
    
    
    train_prop_nbrs = NearestNeighbors(n_neighbors=num_nearest_neighbours, algorithm='brute', metric='euclidean').fit(np.array(vectors_train_prop))
    prop_test_distances, prop_test_indices = train_prop_nbrs.kneighbors(np.array(vectors_test_prop))
    
    train_props_similar_to_test = {}
    
    for idx, prop in zip(prop_test_indices, name_test_prop):
        
        train_props_similar_to_test[prop] = [name_train_prop[prop_id] for prop_id in idx]
        # print (f"{prop} : {train_props_similar_to_test[prop]}")
    
    
    gold_label_for_fold = test_df["label"].values
    pred_for_fold = predict_label(train_cons_similar_to_test, train_props_similar_to_test, train_df, test_df)
    
    all_gold_labels.extend(gold_label_for_fold)
    all_preds.extend(pred_for_fold)
    
    
all_gold_labels = np.array(all_gold_labels)
all_preds = np.array(all_preds)

print ("Finished")


For Fold 0
/home/amitgajbhiye/cardiff_work/dot_product_model_nn_analysis/mcrae_train_test_embeddings/mcrae_con_prop_split_train_test_files/0_train_con_prop_split_con_prop.pkl
/home/amitgajbhiye/cardiff_work/dot_product_model_nn_analysis/mcrae_train_test_embeddings/mcrae_con_prop_split_train_test_files/0_test_con_prop_split_con_prop.pkl

Train Df shape : (10609, 3), Index(['concept', 'property', 'label'], dtype='object')
Test Df shape : (2713, 3), Index(['concept', 'property', 'label'], dtype='object')
num train concept : 342
vector train concept : 342
num train property : 33
vector train property : 33
num test concept : 172
vector test concept : 172
num test property : 17
vector test property : 17

#Unique Train Concepts : 342
#Unique Train Property : 33

#Unique Test Concepts : 172
#Unique Test Property : 17

Concept Intersection : 0
Property Intersection : 0

For Fold 1
/home/amitgajbhiye/cardiff_work/dot_product_model_nn_analysis/mcrae_train_test_embeddings/mcrae_con_prop_split_trai

In [12]:
len(all_preds)

24071

In [13]:
len(all_gold_labels)

24071

In [14]:
assert len(all_gold_labels) == len(all_preds)

In [15]:
print (Counter(all_preds))
print (Counter(all_gold_labels))

Counter({0: 23400, 1: 671})
Counter({0: 19829, 1: 4242})


In [16]:
results = compute_scores(all_gold_labels, all_preds)

In [17]:
print ()
print ("Concept Property Split")
print (f"NN Classifier with {model_class}")
print (f"Nearest Neighbours Considered : {num_nearest_neighbours}")
print ()

for key, value in results.items():
    print (key, value)


Concept Property Split
NN Classifier with glove
Nearest Neighbours Considered : 3

binary_f1 0.0493
micro_f1 0.8059
macro_f1 0.4706
weighted_f1 0.7434
accuracy 0.8059
classification report               precision    recall  f1-score   support

           0       0.82      0.97      0.89     19829
           1       0.18      0.03      0.05      4242

    accuracy                           0.81     24071
   macro avg       0.50      0.50      0.47     24071
weighted avg       0.71      0.81      0.74     24071

confusion matrix [[19279   550]
 [ 4121   121]]
