In [1]:
import numpy as np
import pandas as pd
import os
import pickle
from utils.functions import compute_scores

In [2]:
def _random(y_test_shape):
    
    return np.random.uniform(size=y_test_shape).round().astype("int")

In [3]:
def _all_0(y_test_shape):
    
    return np.zeros(y_test_shape).round().astype("int")

In [4]:
def _all_1(y_test_shape):
    
    return np.ones(y_test_shape).round().astype("int")

In [5]:
# Read the train and test TSVs with identical options. `names=` supplies the
# column labels explicitly, so the first line of each file is read as data.
mcrae_train_df, mcrae_test_df = (
    pd.read_csv(tsv_path, sep="\t", names=["concept", "property", "label"])
    for tsv_path in (
        "data/evaluation_data/extended_mcrae/train_mcrae.tsv",
        "data/evaluation_data/extended_mcrae/test_mcrae.tsv",
    )
)

# Train summary: shape and first rows, followed by one blank separator line.
print("McRae Train Df size : ", mcrae_train_df.shape)
print(mcrae_train_df.head(), end="\n\n")

# Test summary: shape and the frame itself.
print("McRae Test Df size : ", mcrae_test_df.shape)
print(mcrae_test_df)


McRae Train Df size :  (19258, 3)
      concept          property  label
0      wrench           squishy      0
1  teddy_bear           fragile      0
2  microscope  used by children      1
3      shovel  used for killing      0
4        wall    found on walls      1

McRae Test Df size :  (4813, 3)
         concept          property  label
0         onions    used for music      0
1          pizza             shiny      0
2     motorcycle   eaten in summer      0
3       sailboat              cold      0
4           lime      worn on feet      0
...          ...               ...    ...
4808   snowboard              tall      0
4809        veil           squishy      0
4810    sailboat      light weight      0
4811  skateboard  used for cooking      0
4812    mandarin             a toy      0

[4813 rows x 3 columns]


In [9]:
# Gold labels come from the McRae test frame; the baseline predictions are
# random 0/1 labels of the same shape.
con_split_gold_labels = mcrae_test_df["label"].values
con_split_random_labels = _random(con_split_gold_labels.shape)

mcrae_con_split_random_baseline = compute_scores(
    con_split_gold_labels,
    con_split_random_labels,
)

# Blank line, heading, then one "name : value" line per returned entry.
print("", "McRae Dataset - Concept Split Random Baseline Results", sep="\n")
for metric_name, metric_value in mcrae_con_split_random_baseline.items():
    print(f'{metric_name} : {metric_value}')


McRae Dataset - Concept Split Random Baseline Results
binary_f1 : 0.2598
micro_f1 : 0.4968
macro_f1 : 0.4393
weighted_f1 : 0.5547
accuracy : 0.4968
classification report :               precision    recall  f1-score   support

           0       0.82      0.50      0.62      3954
           1       0.18      0.49      0.26       859

    accuracy                           0.50      4813
   macro avg       0.50      0.50      0.44      4813
weighted avg       0.70      0.50      0.55      4813

confusion matrix : [[1966 1988]
 [ 434  425]]


#### Property Split Random Baseline

In [10]:

# Directory holding the per-fold McRae property-split pickles.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
local_base_path = "/home/amitgajbhiye/cardiff_work/dot_product_model_nn_analysis/mcrae_train_test_embeddings/mcrae_prop_split_train_test_files"

# One label array per fold; they are joined once after the loop instead of
# being pushed element by element into Python lists.
gold_labels_per_fold, preds_per_fold = [], []

for fold in range(5):

    print(f"For Fold {fold}")
    test_file_path = os.path.join(local_base_path, f"{fold}_test_prop_split_con_prop.pkl")

    # A random baseline only needs the test labels, so the train pickle of the
    # fold is not read at all.
    # NOTE(security): pickle.load can execute arbitrary code; only open trusted files.
    with open(test_file_path, "rb") as test_file:
        test_df = pickle.load(test_file)

    gold_label_for_fold = test_df["label"].values
    random_label_for_fold = _random(gold_label_for_fold.shape)

    gold_labels_per_fold.append(gold_label_for_fold)
    preds_per_fold.append(random_label_for_fold)

# Pool the folds so the scores are computed over every test instance at once.
all_gold_labels = np.concatenate(gold_labels_per_fold)
all_preds = np.concatenate(preds_per_fold)

print(all_gold_labels.shape)
print(all_preds.shape)

mcrae_prop_split_random_baseline = compute_scores(all_gold_labels, all_preds)

print()
print("McRae Dataset - Property Split Random Baseline Results")
for key, value in mcrae_prop_split_random_baseline.items():
    print(f'{key} : {value}')


For Fold 0
For Fold 1
For Fold 2
For Fold 3
For Fold 4
(24071,)
(24071,)

McRae Dataset - Property Split Random Baseline Results
binary_f1 : 0.2654
micro_f1 : 0.5033
macro_f1 : 0.4451
weighted_f1 : 0.5615
accuracy : 0.5033
classification report :               precision    recall  f1-score   support

           0       0.83      0.50      0.62     19829
           1       0.18      0.51      0.27      4242

    accuracy                           0.50     24071
   macro avg       0.50      0.51      0.45     24071
weighted avg       0.71      0.50      0.56     24071

confusion matrix : [[9955 9874]
 [2082 2160]]


In [12]:

# Directory holding the per-fold McRae concept+property-split pickles.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
local_base_path = "/home/amitgajbhiye/cardiff_work/dot_product_model_nn_analysis/mcrae_train_test_embeddings/mcrae_con_prop_split_train_test_files"

# One label array per fold; they are joined once after the loop instead of
# being pushed element by element into Python lists.
gold_labels_per_fold, preds_per_fold = [], []

for fold in range(9):

    print(f"For Fold {fold}")
    test_file_path = os.path.join(local_base_path, f"{fold}_test_con_prop_split_con_prop.pkl")

    # A random baseline only needs the test labels, so the train pickle of the
    # fold is not read at all.
    # NOTE(security): pickle.load can execute arbitrary code; only open trusted files.
    with open(test_file_path, "rb") as test_file:
        test_df = pickle.load(test_file)

    gold_label_for_fold = test_df["label"].values
    random_label_for_fold = _random(gold_label_for_fold.shape)

    gold_labels_per_fold.append(gold_label_for_fold)
    preds_per_fold.append(random_label_for_fold)

# Pool the folds so the scores are computed over every test instance at once.
all_gold_labels = np.concatenate(gold_labels_per_fold)
all_preds = np.concatenate(preds_per_fold)

print(all_gold_labels.shape)
print(all_preds.shape)

mcrae_con_prop_split_random_baseline = compute_scores(all_gold_labels, all_preds)

print()
print("McRae Dataset - Concept Property Split Random Baseline Results")
for key, value in mcrae_con_prop_split_random_baseline.items():
    print(f'{key} : {value}')


For Fold 0
For Fold 1
For Fold 2
For Fold 3
For Fold 4
For Fold 5
For Fold 6
For Fold 7
For Fold 8
(24071,)
(24071,)

McRae Dataset - Concept Property Split Random Baseline Results
binary_f1 : 0.2595
micro_f1 : 0.4975
macro_f1 : 0.4396
weighted_f1 : 0.5562
accuracy : 0.4975
classification report :               precision    recall  f1-score   support

           0       0.82      0.50      0.62     19829
           1       0.18      0.50      0.26      4242

    accuracy                           0.50     24071
   macro avg       0.50      0.50      0.44     24071
weighted avg       0.71      0.50      0.56     24071

confusion matrix : [[9856 9973]
 [2123 2119]]


## CSLB Data

In [13]:
"data/evaluation_data/CSLB/20_neg_cslb_train_pos_neg_data.tsv"
"data/evaluation_data/CSLB/20_neg_cslb_test_pos_neg_data.tsv"

# Read the train and test TSVs with identical options. `names=` supplies the
# column labels explicitly, so the first line of each file is read as data.
cslb_train_df, cslb_test_df = (
    pd.read_csv(tsv_path, sep="\t", names=["concept", "property", "label"])
    for tsv_path in (
        "data/evaluation_data/CSLB/20_neg_cslb_train_pos_neg_data.tsv",
        "data/evaluation_data/CSLB/20_neg_cslb_test_pos_neg_data.tsv",
    )
)

# Train summary: shape and first rows, followed by one blank separator line.
print("CSLB Train Df size : ", cslb_train_df.shape)
print(cslb_train_df.head(), end="\n\n")

# Test summary: shape and the frame itself.
print("CSLB Test Df size : ", cslb_test_df.shape)
print(cslb_test_df)


CSLB Train Df size :  (224553, 3)
   concept              property  label
0  coconut            has petals      0
1    otter            has a roof      0
2      dog         has a trigger      0
3    spoon       does eat leaves      0
4    basin  is used for holidays      0

CSLB Test Df size :  (24108, 3)
       concept              property  label
0         veil         is eaten from      0
1        skirt       made of plastic      0
2       gloves              is baked      0
3       carrot              is built      0
4      sardine             does walk      0
...        ...                   ...    ...
24103   buckle          is dangerous      0
24104   buckle  does come from goats      0
24105   candle             is stable      0
24106  sardine              is shiny      0
24107   pencil        has a headband      0

[24108 rows x 3 columns]


In [14]:
# Gold labels come from the CSLB test frame; the baseline predictions are
# random 0/1 labels of the same shape.
cslb_con_split_gold_labels = cslb_test_df["label"].values
cslb_con_split_random_labels = _random(cslb_con_split_gold_labels.shape)

cslb_con_split_random_baseline = compute_scores(
    cslb_con_split_gold_labels,
    cslb_con_split_random_labels,
)

# Blank line, heading, then one "name : value" line per returned entry.
print("", "CSLB Dataset - Concept Split Random Baseline Results", sep="\n")
for metric_name, metric_value in cslb_con_split_random_baseline.items():
    print(f'{metric_name} : {metric_value}')


CSLB Dataset - Concept Split Random Baseline Results
binary_f1 : 0.0863
micro_f1 : 0.4995
macro_f1 : 0.3708
weighted_f1 : 0.6283
accuracy : 0.4995
classification report :               precision    recall  f1-score   support

           0       0.95      0.50      0.66     22960
           1       0.05      0.50      0.09      1148

    accuracy                           0.50     24108
   macro avg       0.50      0.50      0.37     24108
weighted avg       0.91      0.50      0.63     24108

confusion matrix : [[11472 11488]
 [  578   570]]


In [20]:

# Directory holding the per-fold CSLB property-split pickles.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
local_base_path = "/home/amitgajbhiye/cardiff_work/dot_product_model_nn_analysis/mcrae_train_test_embeddings/cslb_prop_split_train_test_files"

# One label array per fold; they are joined once after the loop instead of
# being pushed element by element into Python lists.
gold_labels_per_fold, preds_per_fold = [], []

for fold in range(5):

    print(f"For Fold {fold}")
    test_file_path = os.path.join(local_base_path, f"{fold}_test_prop_split_con_prop.pkl")

    # A random baseline only needs the test labels, so the train pickle of the
    # fold is not read at all.
    # NOTE(security): pickle.load can execute arbitrary code; only open trusted files.
    with open(test_file_path, "rb") as test_file:
        test_df = pickle.load(test_file)

    gold_label_for_fold = test_df["label"].values
    random_label_for_fold = _random(gold_label_for_fold.shape)

    gold_labels_per_fold.append(gold_label_for_fold)
    preds_per_fold.append(random_label_for_fold)

# Pool the folds so the scores are computed over every test instance at once.
all_gold_labels = np.concatenate(gold_labels_per_fold)
all_preds = np.concatenate(preds_per_fold)

print(all_gold_labels.shape)
print(all_preds.shape)

cslb_prop_split_random_baseline = compute_scores(all_gold_labels, all_preds)

print()
print("CSLB Dataset - Property Split Random Baseline Results")
for key, value in cslb_prop_split_random_baseline.items():
    print(f'{key} : {value}')


For Fold 0
For Fold 1
For Fold 2
For Fold 3
For Fold 4
(248607,)
(248607,)

CSLB Dataset - Property Split Random Baseline Results
binary_f1 : 0.0843
micro_f1 : 0.5007
macro_f1 : 0.3706
weighted_f1 : 0.6295
accuracy : 0.5007
classification report :               precision    recall  f1-score   support

           0       0.95      0.50      0.66    236768
           1       0.05      0.48      0.08     11839

    accuracy                           0.50    248607
   macro avg       0.50      0.49      0.37    248607
weighted avg       0.91      0.50      0.63    248607

confusion matrix : [[118773 117995]
 [  6124   5715]]


In [57]:

# Directory holding the per-fold CSLB concept+property-split pickles.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
local_base_path = "/home/amitgajbhiye/cardiff_work/dot_product_model_nn_analysis/mcrae_train_test_embeddings/cslb_con_prop_split_train_test_files"

# One label array per fold; they are joined once after the loop instead of
# being pushed element by element into Python lists.
gold_labels_per_fold, preds_per_fold = [], []

for fold in range(9):

    print(f"For Fold {fold}")
    test_file_path = os.path.join(local_base_path, f"{fold}_test_con_prop_split_con_prop.pkl")

    # A random baseline only needs the test labels, so the train pickle of the
    # fold is not read at all.
    # NOTE(security): pickle.load can execute arbitrary code; only open trusted files.
    with open(test_file_path, "rb") as test_file:
        test_df = pickle.load(test_file)

    gold_label_for_fold = test_df["label"].values
    random_label_for_fold = _random(gold_label_for_fold.shape)

    gold_labels_per_fold.append(gold_label_for_fold)
    preds_per_fold.append(random_label_for_fold)

# Pool the folds so the scores are computed over every test instance at once.
all_gold_labels = np.concatenate(gold_labels_per_fold)
all_preds = np.concatenate(preds_per_fold)

print(all_gold_labels.shape)
print(all_preds.shape)

cslb_con_prop_split_random_baseline = compute_scores(all_gold_labels, all_preds)

print()
print("CSLB Dataset - Concept Property Split Random Baseline Results")
for key, value in cslb_con_prop_split_random_baseline.items():
    print(f'{key} : {value}')


For Fold 0
For Fold 1
For Fold 2
For Fold 3
For Fold 4
For Fold 5
For Fold 6
For Fold 7
For Fold 8
(248607,)
(248607,)

CSLB Dataset - Concept Property Split Random Baseline Results
binary_f1 : 0.0859
micro_f1 : 0.501
macro_f1 : 0.3714
weighted_f1 : 0.6297
accuracy : 0.501
classification report :               precision    recall  f1-score   support

           0       0.95      0.50      0.66    236768
           1       0.05      0.49      0.09     11839

    accuracy                           0.50    248607
   macro avg       0.50      0.50      0.37    248607
weighted avg       0.91      0.50      0.63    248607

confusion matrix : [[118739 118029]
 [  6014   5825]]


# Fetch the pretrained 'word2vec-google-news-300' resource with gensim, save it
# to disk, and reload it from that saved copy.
# NOTE(review): these imports sit mid-notebook rather than in the first cell;
# within the visible notebook `warnings` is only referenced by the
# commented-out filter below.
import gensim.downloader
import warnings

# warnings.filterwarnings("ignore")

from gensim.models import KeyedVectors

# Print the keys of the 'models' entry reported by gensim.downloader.info().
print(list(gensim.downloader.info()['models'].keys()))

# Load the resource named 'word2vec-google-news-300' through gensim's downloader.
wv_google_news = gensim.downloader.load('word2vec-google-news-300')

# Lookup for the key "hello" (presumably its embedding vector; confirm).
wv_google_news["hello"]

# Save the loaded object.
# NOTE(review): machine-specific absolute path, repeated verbatim in the load below.
wv_google_news.save("/home/amitgajbhiye/cardiff_work/word2vec-word2vec-google-news-300/w2v_google_news_300")

# Reload from the path written above, this time through KeyedVectors.load.
wv = KeyedVectors.load("/home/amitgajbhiye/cardiff_work/word2vec-word2vec-google-news-300/w2v_google_news_300")

# Lookup for the key "dog" on the reloaded object.
wv ["dog"]