In [1]:
import numpy as np
import pandas as pd
import os
import re

import torch
import nltk
import pickle

from sklearn.neighbors import NearestNeighbors
from collections import Counter

from gensim.models import KeyedVectors
# import gensim.downloader

from utils.functions import compute_scores

In [2]:
model_class = "skip-gram"
num_nearest_neighbours = 3

if model_class == "skip-gram":
    model = KeyedVectors.load("/home/amitgajbhiye/cardiff_work/word2vec-word2vec-google-news-300/w2v_google_news_300")
elif model_class == "glove":
    model = KeyedVectors.load("/home/amitgajbhiye/cardiff_work/glove_word_vectors/glove-wiki-gigaword-300")
    
print (f"Working with ** {model_class} ** vectors...")

Working with ** skip-gram ** vectors...


In [3]:
mcrae_train_df = pd.read_csv("data/evaluation_data/extended_mcrae/train_mcrae.tsv", sep="\t", names=["concept", "property", "label"])
mcrae_test_df = pd.read_csv("data/evaluation_data/extended_mcrae/test_mcrae.tsv", sep="\t", names=["concept", "property", "label"])

print ("McRae Train Df size : ", mcrae_train_df.shape)
print (mcrae_train_df.head())

print ()

print ("McRae Test Df size : ", mcrae_test_df.shape)
print (mcrae_test_df)


McRae Train Df size :  (19258, 3)
      concept          property  label
0      wrench           squishy      0
1  teddy_bear           fragile      0
2  microscope  used by children      1
3      shovel  used for killing      0
4        wall    found on walls      1

McRae Test Df size :  (4813, 3)
         concept          property  label
0         onions    used for music      0
1          pizza             shiny      0
2     motorcycle   eaten in summer      0
3       sailboat              cold      0
4           lime      worn on feet      0
...          ...               ...    ...
4808   snowboard              tall      0
4809        veil           squishy      0
4810    sailboat      light weight      0
4811  skateboard  used for cooking      0
4812    mandarin             a toy      0

[4813 rows x 3 columns]


In [4]:
def create_word_2_vector_dict(df_series):
    
    name_2_vector = {}

    for name in df_series.unique():
        splited_name = re.split('\s|_', name.strip())
        l = []
        for word in splited_name:
            try :
                l.append(model[word])
            except KeyError:
                print (f"{word} not found in {model_class}... initialsing with zeros")
                l.append(np.zeros(300,))
        
        avg = np.average(l, axis=0) # avergaing the vectors of the words in the name
        avg = avg / np.linalg.norm(avg) # normalising the vector 
        
        name_2_vector[name] = np.average(l, axis=0) 
    
    return name_2_vector



In [5]:
train_test_df = pd.concat([mcrae_train_df, mcrae_test_df])

print (train_test_df.shape)
train_test_df.head()

(24071, 3)


Unnamed: 0,concept,property,label
0,wrench,squishy,0
1,teddy_bear,fragile,0
2,microscope,used by children,1
3,shovel,used for killing,0
4,wall,found on walls,1


In [6]:
con2vec = create_word_2_vector_dict(train_test_df["concept"])

axe not found in skip-gram... initialsing with zeros
armour not found in skip-gram... initialsing with zeros


  avg = avg / np.linalg.norm(avg) # normalising the vector


In [7]:
prop2vec = create_word_2_vector_dict(train_test_df["property"])

a not found in skip-gram... initialsing with zeros
a not found in skip-gram... initialsing with zeros


In [8]:
print (len(con2vec.keys()))
# print (con2vec.keys())

514


In [9]:
print (len(prop2vec.keys()))
# print (prop2vec.keys())

50


In [10]:
def predict_label(train_props_similar_to_test, train_df, test_df):

    preds = []

    for index, row in test_df.iterrows():
        
        # print ()
        # print ("Index :", index)
        test_con, test_prop, test_label = row["concept"], row["property"], row["label"]
        
        train_similar_props = train_props_similar_to_test.get(test_prop)
        
        assert train_similar_props is not None, "No Train Similar Properties for the Test Property"

#         print ("Test Data :", index, test_con, test_prop, test_label)
        
#         print ("Properties Similar to test property in Train File")
#         print (train_similar_props)

        positive_con = []

        for train_prop in train_similar_props:
            
            positive_concept = train_df.loc[(train_df["property"] == train_prop) & (train_df["label"] == 1)]["concept"].tolist()
            positive_con.extend(positive_concept)
            
            # print (f"Positive Concept for the similar property: {train_prop}")
            # print (positive_concept)
            
        # print ("All positive Concepts")
        # print (positive_con)

        con_dict  = dict(Counter(positive_con))
        max_con_count = max(con_dict.values())

        # print (con_dict)
        # print (max_con_count)

        con_with_max_count = [] 
        
        for con, count in con_dict.items():
            if count == max_con_count:
                con_with_max_count.append(con)

        
#         print ("con_with_max_count :", con_with_max_count)

#         print ("Test Con :", test_con)
#         print ("con_with_max_count :", con_with_max_count)
#         print ("test_con in con_with_max_count :", test_con in con_with_max_count)

        if test_con in con_with_max_count:
            test_pred = 1
        else:
            test_pred = 0

        preds.append(test_pred)
        
        # print ("test_pred :", test_pred)
        # print()
    
    print (f"All Preds for fold : {len(preds)}")
    # print (f"All Preds for fold : {preds}")
    return preds


In [11]:

# local path for property split:
local_base_path = "/home/amitgajbhiye/cardiff_work/dot_product_model_nn_analysis/mcrae_train_test_embeddings/mcrae_prop_split_train_test_files"


all_gold_labels, all_preds  = [], []

for x in range(5):
    
    print (f"For Fold {x}")
    train_file_path = os.path.join(local_base_path, f"{x}_train_prop_split_con_prop.pkl")
    test_file_path = os.path.join(local_base_path, f"{x}_test_prop_split_con_prop.pkl")
    
    print (train_file_path)
    print (test_file_path)
    print ()
    
    with open (train_file_path, "rb") as train_file, open (test_file_path, "rb") as test_file:
        train_df = pickle.load(train_file)
        test_df = pickle.load(test_file)
    
    print (f"Train Df shape : {train_df.shape}, {train_df.columns}")
    print (f"Test Df shape : {test_df.shape}, {test_df.columns}")
    
    train_concept = train_df["concept"].unique()
    
    name_train_concept = train_concept
    vectors_train_concept = [con2vec[con] for con in name_train_concept]
    
    print (f"num train concept : {len(name_train_concept)}")
    print (f"vector train concept : {len(vectors_train_concept)}")
    
    train_prop = train_df["property"].unique()
    
    name_train_prop = train_prop
    vectors_train_prop = [prop2vec[prop] for prop in name_train_prop]
    
    print (f"num train property : {len(name_train_prop)}")
    print (f"vector train property : {len(vectors_train_prop)}")

    test_concept = test_df["concept"].unique()
    
    name_test_concept = test_concept
    vectors_test_concept = [con2vec[con] for con in name_test_concept]
    
    print (f"num test concept : {len(name_test_concept)}")
    print (f"vector test concept : {len(vectors_test_concept)}")
    
    test_prop = test_df["property"].unique()
    
    name_test_prop = test_prop
    vectors_test_prop = [prop2vec[prop] for prop in name_test_prop]
    
    print (f"num test property : {len(name_test_prop)}")
    print (f"vector test property : {len(vectors_test_prop)}")
    
    print ()
    
    print (f"#Unique Train Concepts : {len(train_concept)}")
    print (f"#Unique Train Property : {len(train_prop)}")
    
    print ()
    print (f"#Unique Test Concepts : {len(test_concept)}")
    print (f"#Unique Test Property : {len(test_prop)}")
    
    print()
    print (f"Concept Intersection : {len(set(train_concept).intersection(test_concept))}")
    print (f"Property Intersection : {len(set(train_prop).intersection(test_prop))}")
    print ()
            
    
    train_prop_nbrs = NearestNeighbors(n_neighbors=num_nearest_neighbours, algorithm='brute', metric='euclidean').fit(np.array(vectors_train_prop))
    prop_test_distances, prop_test_indices = train_prop_nbrs.kneighbors(np.array(vectors_test_prop))
    
    train_props_similar_to_test = {}
    
    for idx, prop in zip(prop_test_indices, name_test_prop):
        
        train_props_similar_to_test[prop] = [name_train_prop[prop_id] for prop_id in idx]
        print (f"{prop} : {train_props_similar_to_test[prop]}")
    
    
    gold_label_for_fold = test_df["label"].values
    pred_for_fold = predict_label(train_props_similar_to_test, train_df, test_df)
    
    all_gold_labels.extend(gold_label_for_fold)
    all_preds.extend(pred_for_fold)
    
    
all_gold_labels = np.array(all_gold_labels)
all_preds = np.array(all_preds)

print ("Finished")


For Fold 0
/home/amitgajbhiye/cardiff_work/dot_product_model_nn_analysis/mcrae_train_test_embeddings/mcrae_prop_split_train_test_files/0_train_prop_split_con_prop.pkl
/home/amitgajbhiye/cardiff_work/dot_product_model_nn_analysis/mcrae_train_test_embeddings/mcrae_prop_split_train_test_files/0_test_prop_split_con_prop.pkl

Train Df shape : (19273, 3), Index(['concept', 'property', 'label'], dtype='object')
Test Df shape : (4798, 3), Index(['concept', 'property', 'label'], dtype='object')
num train concept : 514
vector train concept : 514
num train property : 40
vector train property : 40
num test concept : 514
vector test concept : 514
num test property : 10
vector test property : 10

#Unique Train Concepts : 514
#Unique Train Property : 40

#Unique Test Concepts : 514
#Unique Test Property : 10

Concept Intersection : 514
Property Intersection : 0

used for eating : ['used for cooking', 'used for holding things', 'used for cleaning']
has peel : ['used for holding things', 'used for cook

In [12]:
len(all_preds)

24071

In [13]:
len(all_gold_labels)

24071

In [14]:
assert len(all_gold_labels) == len(all_preds)

In [15]:
print (Counter(all_preds))
print (Counter(all_gold_labels))

Counter({0: 23297, 1: 774})
Counter({0: 19829, 1: 4242})


In [16]:
results = compute_scores(all_gold_labels, all_preds)

In [17]:
print ()
print ("Property Split")
print (f"NN Classifier with {model_class}")
print (f"Nearest Neighbours Considered : {num_nearest_neighbours}")
print ()

for key, value in results.items():
    print (key, value)


Property Split
NN Classifier with skip-gram
Nearest Neighbours Considered : 3

binary_f1 0.0953
micro_f1 0.8115
macro_f1 0.495
weighted_f1 0.7539
accuracy 0.8115
classification report               precision    recall  f1-score   support

           0       0.83      0.97      0.89     19829
           1       0.31      0.06      0.10      4242

    accuracy                           0.81     24071
   macro avg       0.57      0.51      0.50     24071
weighted avg       0.74      0.81      0.75     24071

confusion matrix [[19294   535]
 [ 4003   239]]
