In [1]:
import numpy as np
import pandas as pd
import os

import torch
import nltk
import pickle

from model.concept_property_model import ConceptPropertyModel
from utils.functions import create_model
from utils.functions import load_pretrained_model
from utils.functions import read_config
from utils.functions import mcrae_dataset_and_dataloader
from utils.functions import compute_scores
from fine_tune import test_best_model

from sklearn.neighbors import NearestNeighbors
from collections import Counter

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# assert os.environ["CONDA_DEFAULT_ENV"] == "gvenv", "Activate 'gvenv' conda environment"

print (f"Device Name : {device}")
# CONDA_DEFAULT_ENV only exists inside an activated conda environment. Use .get()
# so this purely informational line cannot abort the notebook with a KeyError
# when it is run from a venv, a container, or a plain interpreter.
print (f"Conda Environment Name : {os.environ.get('CONDA_DEFAULT_ENV', '<not set>')}")


import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation and numerical warnings; consider narrowing the filter.
warnings.filterwarnings("ignore")

Device Name : cuda
Conda Environment Name : gvenv


# Read the extended McRae train/test splits. Both files are tab separated and
# are given the same explicit column names.
tsv_options = {"sep": "\t", "names": ["concept", "property", "label"]}

mcrae_train_df = pd.read_csv("data/evaluation_data/extended_mcrae/train_mcrae.tsv", **tsv_options)
mcrae_test_df = pd.read_csv("data/evaluation_data/extended_mcrae/test_mcrae.tsv", **tsv_options)

# Train split: shape, then a five-row preview followed by a blank separator line.
print ("McRae Train Df size : ", mcrae_train_df.shape)
print (mcrae_train_df.head(), end="\n\n")

# Test split: shape, then the frame itself (pandas truncates long frames when printing).
print ("McRae Test Df size : ", mcrae_test_df.shape)
print (mcrae_test_df)


In [2]:
def transform(vecs):
    """Prepend a norm-equalising coordinate to every vector in ``vecs``.

    Each vector ``v`` becomes ``[sqrt(M**2 - ||v||**2), v_0, v_1, ...]`` where
    ``M`` is the largest Euclidean norm found in ``vecs``. After the
    transformation every returned vector therefore has norm ``M`` (up to
    floating-point rounding).

    Args:
        vecs: Re-iterable collection (e.g. a list) of numpy arrays accepted by
            ``np.linalg.norm`` and ``np.insert``.

    Returns:
        A list with one augmented array per input vector, in input order.
        An empty input yields an empty list.
    """
    # Compute every norm exactly once and reuse it below.
    norms = [np.linalg.norm(v) for v in vecs]
    if not norms:
        # max() of an empty sequence raises ValueError; there is nothing to do.
        return []

    maxnorm = max(norms)

    new_vecs = []
    for v, norm in zip(vecs, norms):
        # Clamp at zero so rounding can never hand sqrt a tiny negative (-> NaN).
        extra_coordinate = np.sqrt(max(maxnorm**2 - norm**2, 0.0))
        new_vecs.append(np.insert(v, 0, extra_coordinate))

    return new_vecs

In [3]:

# Two alternative config files; only the "hawk" one is read below.
local_config_file_path = "configs/nn_analysis/nn_classifier_fine_tuned_bert_base_all_data.json"
hawk_config_file_path = "configs/nn_analysis/hawk_nn_classifier_fine_tuned_bert_base_all_data.json"

model_config = read_config(hawk_config_file_path)

# Show the full configuration, then its dataset section, each under a label line.
print ("model_config", model_config, sep="\n")
print ('model_config["dataset_params"]', model_config["dataset_params"], sep="\n")

config.get('experiment_name') : nn_classifier

log_file_name : logs/nn_analysis/log_nn_classifier_16-05-2022_13-05-01.txt
model_config
{'dataset_name': 'prop_split_data', 'train_file_path': 'None', 'val_file_path': 'None', 'test_file_path': 'None', 'hf_tokenizer_name': 'bert-base-uncased', 'hf_tokenizer_path': '/home/amitgajbhiye/cardiff_work/100k_data_experiments/bert_base_uncased_pretrained/tokenizer', 'concept_max_len': 64, 'property_max_len': 64, 'add_context': True, 'context_num': 6, 'loader_params': {'batch_size': 20, 'num_workers': 4, 'pin_memory': True}}


In [4]:
def predict_label(train_props_similar_to_test, train_df, test_df, verbose=True):
    """Predict a binary label for every (concept, property) row of ``test_df``.

    For each test row, the training properties listed as similar to the row's
    property are looked up in ``train_props_similar_to_test``. All concepts
    that are positive (``label == 1``) for those training properties are pooled
    and counted. The row is predicted ``1`` when its concept is among the
    concepts with the highest count, otherwise ``0``. If the pool is empty
    (no similar training property has a positive concept) the row is
    predicted ``0``.

    Args:
        train_props_similar_to_test: Mapping from a test property to an
            iterable of training properties considered similar to it.
        train_df: DataFrame with ``concept``, ``property`` and ``label`` columns.
        test_df: DataFrame with the same three columns; iterated row by row.
        verbose: When true (the default) print the per-row trace.

    Returns:
        List of ``0``/``1`` predictions, one per row of ``test_df``, in row order.

    Raises:
        AssertionError: If a test property has no entry in
            ``train_props_similar_to_test``.
    """
    log = print if verbose else (lambda *args, **kwargs: None)

    # Index the positive training concepts by property once, instead of
    # re-filtering the whole training frame for every test row. Row order
    # inside each group is the original row order of ``train_df``.
    positive_rows = train_df.loc[train_df["label"] == 1]
    positives_by_prop = {
        prop: concepts.tolist()
        for prop, concepts in positive_rows.groupby("property", sort=False)["concept"]
    }

    preds = []

    for index, row in test_df.iterrows():

        log()
        log("Index :", index)
        test_con, test_prop, test_label = row["concept"], row["property"], row["label"]

        train_similar_props = train_props_similar_to_test.get(test_prop)

        assert train_similar_props is not None, "No Train Similar Properties for the Test Property"

        log("Test Data :", index, test_con, test_prop, test_label)

        log("Properties Similar to test property in Train File")
        log(train_similar_props)

        # Pool the positive concepts of every similar training property.
        positive_con = []

        for train_prop in train_similar_props:

            positive_concept = positives_by_prop.get(train_prop, [])
            positive_con.extend(positive_concept)

            log(f"Positive Concept for the similar property: {train_prop}")
            log(positive_concept)

        log("All positive Concepts")
        log(positive_con)

        con_dict = dict(Counter(positive_con))
        # default=0 keeps an empty pool from raising ValueError; the row then
        # simply receives a negative prediction.
        max_con_count = max(con_dict.values(), default=0)

        log(con_dict)
        log(max_con_count)

        # Concepts tied for the highest number of supporting properties.
        con_with_max_count = [con for con, count in con_dict.items() if count == max_con_count]

        log("con_with_max_count :", con_with_max_count)

        log("Test Con :", test_con)
        log("con_with_max_count :", con_with_max_count)
        log("test_con in con_with_max_count :", test_con in con_with_max_count)

        test_pred = 1 if test_con in con_with_max_count else 0

        preds.append(test_pred)

        log("test_pred :", test_pred)
        log()

    log(f"All Preds for fold : {len(preds)}")
    return preds


In [5]:
#### Loading the BERT Base Model for generating Property Embedding
# Property-split nearest-neighbour evaluation. For each of the five folds this cell:
#   1. loads the pickled train/test DataFrames of the fold,
#   2. embeds the unique concepts and properties of both splits via `test_best_model`,
#   3. finds, for every test property, its nearest training properties among the
#      `transform`-ed property embeddings, and
#   4. turns those neighbours into 0/1 predictions with `predict_label`.
# Gold labels and predictions of all folds are accumulated for scoring.

# Number of training properties retrieved per test property (passed as n_neighbors below).
num_nearest_neighbours = 1
torch.cuda.empty_cache()

# Not read anywhere in this cell; only `hawk_base_path` is used to build file paths.
local_base_path = "/home/amitgajbhiye/cardiff_work/dot_product_model_nn_analysis/mcrae_train_test_embeddings/prop_split_train_test_files"

hawk_base_path = "data/evaluation_data/nn_analysis/prop_split_train_test_files"

# Accumulators across folds; converted to numpy arrays after the loop.
all_gold_labels, all_preds  = [], []

for x in range(5):
    
    print (f"For Fold {x}")
    train_file_path = os.path.join(hawk_base_path, f"{x}_train_prop_split_con_prop.pkl")
    test_file_path = os.path.join(hawk_base_path, f"{x}_test_prop_split_con_prop.pkl")
    
    print (train_file_path)
    print (test_file_path)
    print ()
    
    # NOTE: pickle.load can execute arbitrary code while unpickling; only load
    # fold files that come from a trusted source.
    with open (train_file_path, "rb") as train_file, open (test_file_path, "rb") as test_file:
        train_df = pickle.load(train_file)
        test_df = pickle.load(test_file)
    
    print (f"Train Df shape : {train_df.shape}, {train_df.columns}")
    print (f"Test Df shape : {test_df.shape}, {test_df.columns}")
    
    # Unique names per split. The `name_*` aliases keep the raw name arrays because the
    # un-prefixed variables are overwritten with record tuples just below.
    # `name_train_concept` / `name_test_concept` are not read again in this cell.
    train_concept = train_df["concept"].unique()
    name_train_concept = train_concept
    
    train_prop = train_df["property"].unique()
    name_train_prop = train_prop
    
    
    # Wrap every name in a (concept, property, label) record with placeholder values
    # for the other two fields, matching the three-column frames built further down.
    train_concept = [(con, "dummy_prop", int(0)) for con in train_concept]
    train_prop = [("dummy_con", prop, int(0)) for prop in train_prop]
    
    test_concept = test_df["concept"].unique()
    name_test_concept = test_concept
    
    test_prop = test_df["property"].unique()
    name_test_prop = test_prop
    
    test_concept = [(con, "dummy_prop", int(0)) for con in test_concept]
    test_prop = [("dummy_con", prop, int(0)) for prop in test_prop]
    
    print (f"#Unique Train Concepts : {len(train_concept)}")
    print (f"#Unique Train Property : {len(train_prop)}")
    
    print ()
    print (f"#Unique Test Concepts : {len(test_concept)}")
    print (f"#Unique Test Property : {len(test_prop)}")
    
    print()
    # The records are tuples, so the set intersection compares whole records; the
    # placeholder fields are identical on both sides, so this counts shared names.
    print (f"Concept Intersection : {len(set(train_concept).intersection(test_concept))}")
    print (f"Property Intersection : {len(set(train_prop).intersection(test_prop))}")
    print ()
    
    # --- Concept embeddings: i == 0 is the train split, i == 1 the test split. ---
    # NOTE(review): the transformed concept embeddings are not used later in this cell;
    # only the property embeddings feed the neighbour search. Confirm they are needed.
    for i, con_list in enumerate([train_concept, test_concept]):
        
        concept_df = pd.DataFrame.from_records(con_list, columns=["concept", "property", "label"])
        # Batch size is set to the number of rows - presumably so the whole frame is
        # embedded in a single batch (TODO confirm against test_best_model).
        # NOTE(review): this mutates the shared `model_config` and is never restored.
        model_config["dataset_params"]["loader_params"]["batch_size"] = concept_df.shape[0]
        
        print()
        print (f"Concept i : {i}")
        print (concept_df.head())
        if i == 0:
            
            print ("Train Concepts")
            torch.cuda.empty_cache()
            
            print (f'Batch Size : {model_config["dataset_params"]["loader_params"]["batch_size"]}') 
            # test_best_model returns four values; only the first is kept here and is
            # treated as the concept embeddings.
            train_concept_embs, _, _, _ = test_best_model(model_config, test_df=concept_df, fold=None)
            print ("train_concept_embs.shape :", train_concept_embs.shape)
            
            # The comprehension's `x` is local to the comprehension (Python 3) and does
            # not overwrite the fold index `x` of the outer loop.
            train_concept_embs = [x.cpu().numpy() for x in train_concept_embs]
            transformed_train_concept_embs = transform(train_concept_embs)
            
            print (f"len(transformed_train_concept_embs) : {len(transformed_train_concept_embs)}")
            
        elif i == 1:
            print ("Test Concepts")
            torch.cuda.empty_cache()
            print (f'Batch Size : {model_config["dataset_params"]["loader_params"]["batch_size"]}') 
            test_concept_embs, _, _, _ = test_best_model(model_config, test_df=concept_df, fold=None)
            print ("test_concept_embs.shape :", test_concept_embs.shape)
            
            test_concept_embs = [x.cpu().numpy() for x in test_concept_embs]
               
            transformed_test_concept_embs = transform(test_concept_embs)
            
            print (f"len(transformed_test_concept_embs) : {len(transformed_test_concept_embs)}")
    
    # --- Property embeddings: same procedure, keeping the second returned value. ---
    for i, prop_list in enumerate([train_prop, test_prop]):
            
        property_df = pd.DataFrame.from_records(prop_list, columns=["concept", "property", "label"])       
        model_config["dataset_params"]["loader_params"]["batch_size"] = property_df.shape[0]
        
        print()
        print (f"Property i : {i}")
        print (property_df.head())
        
        if i == 0:
            
            print ("Train Property")
            torch.cuda.empty_cache()
            print (f'Batch Size : {model_config["dataset_params"]["loader_params"]["batch_size"]}')
            _, train_property_embs, _, _ = test_best_model(model_config, test_df=property_df, fold=None)
            print (f"train_property_embs.shape : {train_property_embs.shape}")
            
            train_property_embs = [x.cpu().numpy() for x in train_property_embs]
            # transform() derives its maximum norm from the list it receives, so the
            # train and test property sets are each augmented with their own maximum.
            # NOTE(review): confirm that using separate maxima for the indexed set and
            # the query set is intended for the Euclidean search below.
            transformed_train_property_embs = transform(train_property_embs)
            
            print (f"len(transformed_train_property_embs) : {len(transformed_train_property_embs)}")
        
        elif i ==1:
            
            print ("Test Property")
            torch.cuda.empty_cache()
            print (f'Batch Size : {model_config["dataset_params"]["loader_params"]["batch_size"]}')
            _, test_property_embs, _, _ = test_best_model(model_config, test_df=property_df, fold=None)
            print (f"test_property_embs.shape : {test_property_embs.shape}")
            
            test_property_embs = [x.cpu().numpy() for x in test_property_embs]
            transformed_test_property_embs = transform(test_property_embs)
            
            print (f"len(transformed_test_property_embs) : {len(transformed_test_property_embs)}")
            
    
    # Brute-force Euclidean k-NN index over the transformed training-property embeddings,
    # queried with the transformed test-property embeddings.
    # `prop_test_distances` is not used afterwards in this cell.
    train_prop_nbrs = NearestNeighbors(n_neighbors=num_nearest_neighbours, algorithm='brute', metric='euclidean').fit(np.array(transformed_train_property_embs))
    prop_test_distances, prop_test_indices = train_prop_nbrs.kneighbors(np.array(transformed_test_property_embs))
    
    # Map each test property name to the names of its nearest training properties.
    # Assumes the embeddings come back in the same order as the rows of the frames
    # passed to test_best_model, so index k corresponds to name_*_prop[k] - TODO confirm.
    train_props_similar_to_test = {}
    
    for idx, prop in zip(prop_test_indices, name_test_prop):
        
        train_props_similar_to_test[prop] = [name_train_prop[prop_id] for prop_id in idx]
        print (f"{prop} : {train_props_similar_to_test[prop]}")
    
    
    # Gold labels are taken in test_df row order; predict_label also walks test_df
    # row by row, so both sequences line up element-wise.
    gold_label_for_fold = test_df["label"].values
    pred_for_fold = predict_label(train_props_similar_to_test, train_df, test_df)
    
    all_gold_labels.extend(gold_label_for_fold)
    all_preds.extend(pred_for_fold)
    
    
all_gold_labels = np.array(all_gold_labels)
all_preds = np.array(all_preds)

print ("Finished")


For Fold 0
/home/amitgajbhiye/cardiff_work/dot_product_model_nn_analysis/mcrae_train_test_embeddings/prop_split_train_test_files/0_train_prop_split_con_prop.pkl
/home/amitgajbhiye/cardiff_work/dot_product_model_nn_analysis/mcrae_train_test_embeddings/prop_split_train_test_files/0_test_prop_split_con_prop.pkl

Train Df shape : (19273, 3), Index(['concept', 'property', 'label'], dtype='object')
Test Df shape : (4798, 3), Index(['concept', 'property', 'label'], dtype='object')
#Unique Train Concepts : 514
#Unique Train Property : 40

#Unique Test Concepts : 514
#Unique Test Property : 10

Concept Intersection : 514
Property Intersection : 0


Concept i : 0
      concept    property  label
0  microscope  dummy_prop      0
1     emerald  dummy_prop      0
2        tray  dummy_prop      0
3      pliers  dummy_prop      0
4   asparagus  dummy_prop      0
Train Concepts
Batch Size : 514

train_concept_embs.shape : torch.Size([514, 768])
len(transformed_train_concept_embs) : 514

Concept i : 1


In [6]:
# Number of predictions accumulated across all folds.
len(all_preds)

20

In [7]:
# Number of gold labels accumulated across all folds.
len(all_gold_labels)

20

In [9]:
# Sanity check before scoring: one prediction per gold label.
assert len(all_gold_labels) == len(all_preds)

In [10]:
# Class distribution of the predictions (first line) and of the gold labels (second line).
print (Counter(all_preds), Counter(all_gold_labels), sep="\n")

Counter({0: 20})
Counter({0: 17, 1: 3})


In [12]:
# Score the pooled predictions of all folds against the pooled gold labels.
# `compute_scores` is a project helper; its result is iterated with `.items()`
# when the scores are printed, so it is expected to be a dict-like mapping.
results = compute_scores(all_gold_labels, all_preds)

In [13]:
# Report header: blank line, experiment description, neighbour count, blank line.
header_lines = [
    "",
    "Property Split",
    "NN Classifier with pretrained BERT Base Embedding pretrained on MSCG+PREFIX+GKB Data",
    f"Nearest Neighbours Considered : {num_nearest_neighbours}",
    "",
]
print ("\n".join(header_lines))

# One line per score: its name followed by its value.
for key, value in results.items():
    print (key, value)


Property Split
NN Classifier with pretrained BERT Base Embedding pretrained on MSCG+PREFIX+GKB Data
Nearest Neighbours Considered : 3

binary_f1 0.0
micro_f1 0.85
macro_f1 0.4595
weighted_f1 0.7811
accuracy 0.85
classification report               precision    recall  f1-score   support

           0       0.85      1.00      0.92        17
           1       0.00      0.00      0.00         3

    accuracy                           0.85        20
   macro avg       0.42      0.50      0.46        20
weighted avg       0.72      0.85      0.78        20

confusion matrix [[17  0]
 [ 3  0]]
