In [None]:
import numpy as np
import pandas as pd
import os

import torch
import nltk
import pickle

from model.concept_property_model import ConceptPropertyModel
from utils.functions import create_model
from utils.functions import load_pretrained_model
from utils.functions import read_config
from utils.functions import mcrae_dataset_and_dataloader
from utils.functions import compute_scores

from sklearn.neighbors import NearestNeighbors
from collections import Counter

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# assert os.environ["CONDA_DEFAULT_ENV"] == "gvenv", "Activate 'gvenv' conda environment"

print(f"Device Name : {device}")
# CONDA_DEFAULT_ENV is absent outside an activated conda environment (venv,
# plain kernel, CI); report that instead of failing the cell with a KeyError.
print(f"Conda Environment Name : {os.environ.get('CONDA_DEFAULT_ENV', '<not set>')}")

In [None]:
# Column names are supplied explicitly to read_csv, so the first line of each
# TSV file is read as data rather than as a header row.
mcrae_columns = ["concept", "property", "label"]

mcrae_train_df = pd.read_csv("data/evaluation_data/extended_mcrae/train_mcrae.tsv", sep="\t", names=mcrae_columns)
mcrae_test_df = pd.read_csv("data/evaluation_data/extended_mcrae/test_mcrae.tsv", sep="\t", names=mcrae_columns)

print("McRae Train Df size : ", mcrae_train_df.shape)
print(mcrae_train_df.head())

print()

# Preview the test split the same way as the train split instead of printing
# the whole frame.
print("McRae Test Df size : ", mcrae_test_df.shape)
print(mcrae_test_df.head())


In [None]:

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and is closed again before returning.

    NOTE(review): ``pickle.load`` can run arbitrary code while unpickling, so
    only point this at embedding files produced by a trusted pipeline.
    """
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


# Locations of the pickled train/test concept and property embeddings.
train_con_file = "/home/amitgajbhiye/cardiff_work/dot_product_model_nn_analysis/mcrae_train_test_embeddings/mcrae_bert_base_train_cons_embeds.pkl"
train_prop_file = "/home/amitgajbhiye/cardiff_work/dot_product_model_nn_analysis/mcrae_train_test_embeddings/mcrae_bert_base_train_prop_embeds.pkl"

test_con_file = "/home/amitgajbhiye/cardiff_work/dot_product_model_nn_analysis/mcrae_train_test_embeddings/mcrae_bert_base_test_cons_embeds.pkl"
test_prop_file = "/home/amitgajbhiye/cardiff_work/dot_product_model_nn_analysis/mcrae_train_test_embeddings/mcrae_bert_base_test_prop_embeds.pkl"

# Each name is bound exactly once, to the loaded object; the file handle lives
# only inside _load_pickle and is never confused with the data.
train_con_emb = _load_pickle(train_con_file)
train_prop_emb = _load_pickle(train_prop_file)

test_con_emb = _load_pickle(test_con_file)
test_prop_emb = _load_pickle(test_prop_file)


# hawk_train_con_file = "data/evaluation_data/nn_analysis/mcrae_train_concept_embedding.pkl"
# hawk_train_prop_file = "data/evaluation_data/nn_analysis/mcrae_train_properties_embedding.pkl"

# hawk_test_con_file = "data/evaluation_data/nn_analysis/mcrae_test_concept_embedding.pkl"
# hawk_test_prop_file = "data/evaluation_data/nn_analysis/mcrae_test_properties_embedding.pkl"

# with open(hawk_train_con_file, "rb") as train_con_emb, \
#     open(hawk_train_prop_file, "rb") as train_prop_emb, \
#     open(hawk_test_con_file, "rb") as test_con_emb, \
#     open(hawk_test_prop_file, "rb") as test_prop_emb:
    
#     train_con_emb = pickle.load(train_con_emb)
#     train_prop_emb = pickle.load(train_prop_emb)
    
#     test_con_emb = pickle.load(test_con_emb)
#     test_prop_emb = pickle.load(test_prop_emb)


In [None]:
# Print the keys of each embedding mapping and the property names it carries,
# first for the train split and then for the test split. The concept names
# live under the "name_list_con" key and are not printed here.
embedding_splits = (
    ("Train Properties :", train_con_emb, train_prop_emb),
    ("Test Properties :", test_con_emb, test_prop_emb),
)

for split_position, (prop_label, con_embeds, prop_embeds) in enumerate(embedding_splits):
    # Blank line between the two splits only.
    if split_position > 0:
        print()

    print(con_embeds.keys())
    print(prop_embeds.keys())

    prop_names = prop_embeds.get("name_list_prop")
    print(prop_label, len(prop_names))
    print(prop_label, prop_names)


In [None]:
# Properties that occur in both the train and the test embedding name lists.
train_prop_names = set(train_prop_emb.get("name_list_prop"))
test_prop_names = set(test_prop_emb.get("name_list_prop"))
inter = train_prop_names & test_prop_names

print(len(inter))
print(inter)

In [None]:
# Number of concept embeddings in the train split, then in the test split.
for con_embeds in (train_con_emb, test_con_emb):
    print(len(con_embeds.get("transformed_con_emb")))

In [None]:
# Fit a brute-force Euclidean nearest-neighbour index on the train concept
# embeddings, then look up the closest train concepts for every test concept.
num_nearest_neighbours = 3

train_con_vectors = np.array(train_con_emb.get("transformed_con_emb"))
train_con_nbrs = NearestNeighbors(
    n_neighbors=num_nearest_neighbours,
    algorithm="brute",
    metric="euclidean",
)
train_con_nbrs.fit(train_con_vectors)

test_con_vectors = np.array(test_con_emb.get("transformed_con_emb"))
con_test_distances, con_test_indices = train_con_nbrs.kneighbors(test_con_vectors)

In [None]:
# Show the keys available in the train concept-embedding mapping (rendered as
# the cell's output).
train_con_emb.keys()

In [None]:
# print (con_test_indices.shape)
# print (con_test_indices)

In [None]:
# Map every test concept name to the names of its nearest train concepts.
# Each row of con_test_indices holds positions into the train concept name
# list, paired in order with the test concept names.
train_con_names = train_con_emb.get("name_list_con")

train_cons_similar_to_test = {
    test_con_name: [train_con_names[train_position] for train_position in neighbour_positions]
    for neighbour_positions, test_con_name in zip(con_test_indices, test_con_emb.get("name_list_con"))
}
    

In [None]:
# How many test concepts received neighbours, and which ones.
test_concepts_with_neighbours = train_cons_similar_to_test.keys()
print(len(test_concepts_with_neighbours))
print(test_concepts_with_neighbours)

In [None]:
def _predict_from_neighbours(test_prop, neighbour_concepts, positive_props_by_concept):
    """Predict the label of a test property by voting over neighbour concepts.

    Args:
        test_prop: Property of the test pair being classified.
        neighbour_concepts: Train concepts nearest to the test concept.
        positive_props_by_concept: Maps a train concept to the list of
            properties labelled 1 for it in the train split.

    Returns:
        1 if ``test_prop`` is among the properties with the highest count
        across the neighbours' positive properties, otherwise 0. Also 0 when
        the neighbours have no positive properties at all.
    """
    votes = Counter()
    for concept in neighbour_concepts:
        votes.update(positive_props_by_concept.get(concept, ()))

    # With no positive property there is nothing to vote on; max() over the
    # empty counter would raise ValueError.
    if not votes:
        return 0

    # Counter returns 0 for a missing key, and the maximum is at least 1 here,
    # so an unseen property can never match.
    return int(votes[test_prop] == max(votes.values()))


# Index the positive (label == 1) train pairs by concept once, instead of
# filtering the whole train frame for every neighbour of every test row.
positive_train_pairs = mcrae_train_df.loc[mcrae_train_df["label"] == 1, ["concept", "property"]]
positive_props_by_concept = {}
for train_con, train_prop in zip(positive_train_pairs["concept"], positive_train_pairs["property"]):
    positive_props_by_concept.setdefault(train_con, []).append(train_prop)

preds = []

for index, test_con, test_prop in zip(mcrae_test_df.index, mcrae_test_df["concept"], mcrae_test_df["property"]):
    print("Index :", index)
    train_similar_concepts = train_cons_similar_to_test.get(test_con)
    assert train_similar_concepts is not None, "No Train Similar Concepts for the Test Concept"

    preds.append(_predict_from_neighbours(test_prop, train_similar_concepts, positive_props_by_concept))

In [None]:
# Gold labels of the test split as an array of values, in the row order of
# mcrae_test_df.
gold_labels = mcrae_test_df["label"].values

In [None]:
# Display the gold labels (rendered as the cell's output).
gold_labels

In [None]:
# Every test row must have received exactly one prediction before scoring.
# len() works on the list directly; no array conversion is needed.
assert len(gold_labels) == len(preds), (
    f"Expected {len(gold_labels)} predictions, got {len(preds)}"
)

In [None]:
# Class distribution of the predictions, followed by that of the gold labels.
for label_values in (preds, gold_labels):
    print(Counter(label_values))

In [None]:
# Score the predictions against the gold labels with the project's
# compute_scores helper.
# NOTE(review): which metrics it computes is not visible here; the result is
# consumed through .items() when it is printed.
results = compute_scores(gold_labels, preds)

In [None]:
# Header describing the experiment, surrounded by blank lines.
report_header = (
    "",
    "Concept Split",
    "NN Classifier with pretrained BERT Base Embedding pretrained on MSCG+PREFIX+GKB Data",
    f"Nearest Neighbours Considered : {num_nearest_neighbours}",
    "",
)
print(*report_header, sep="\n")

# One line per entry of the results mapping.
for metric_name, metric_value in results.items():
    print(metric_name, metric_value)