In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

In [None]:
# !ls -lh ./../../data/CSLB_Property_Norms_V1.1/

In [None]:
# Two candidate locations of the CSLB norms file. Only `hawk_file_path` (relative to the
# working directory) is passed to cslb_processing in the visible part of this notebook;
# `local_file_path` is an absolute, machine-specific path kept for reference.
local_file_path = "/home/amitgajbhiye/cardiff_work/data/CSLB_Property_Norms_V1.1/norms.dat"
hawk_file_path = "CSLB_Property_Norms_V1.1/norms.dat"

In [None]:
def process_row(concept, feature_alternatives, participant_list, min_participants=3):
    """Select the best-supported feature alternative of one norms row.

    ``feature_alternatives`` is split on ";" and ``participant_list`` on "/";
    the two resulting lists are matched by position. The support of an
    alternative is the number of whitespace-separated tokens left in its
    participant group once every "p" character has been removed.

    Parameters
    ----------
    concept : str
        Concept name; surrounding whitespace is stripped in the result.
    feature_alternatives : str
        ";"-separated feature alternatives.
    participant_list : str
        "/"-separated participant groups, one per alternative.
    min_participants : int, default 3
        Minimum support the best alternative needs for the row to be kept.

    Returns
    -------
    list of tuple
        ``[(concept, feature, count)]`` for the alternative with the highest
        support, or ``[]`` when that support is below ``min_participants`` or
        when any of the three inputs is not a string.

    Raises
    ------
    IndexError
        If the best-supported participant group has no feature alternative at
        the same position.
    """
    # Empty cells typically arrive from pandas as float NaN, which has no
    # .split()/.strip(); such rows cannot be parsed, so they are skipped.
    if not all(isinstance(value, str) for value in (concept, feature_alternatives, participant_list)):
        return []

    feature_list = [x.strip() for x in feature_alternatives.split(";")]
    participant_counts = [
        len(group.replace("p", "").strip().split())
        for group in participant_list.split("/")
    ]

    max_value = max(participant_counts)
    if max_value < min_participants:
        return []

    # list.index returns the first maximum, so ties go to the earliest alternative.
    max_index = participant_counts.index(max_value)
    feature = feature_list[max_index]

    return [(concept.strip(), feature, max_value)]
    

In [None]:
def cslb_processing(file_path):
    """Load the tab-separated norms file and return unique (concept, property) rows.

    Every row of the file is passed to ``process_row``; the tuples it returns are
    collected into a frame with the columns ``concept``, ``property`` and
    ``participant_count``. Each (concept, property) pair that occurs more than
    once is removed entirely (no copy is kept), rows containing missing values
    are dropped, and the index is renumbered from zero. Progress figures are
    printed along the way.

    Parameters
    ----------
    file_path : str
        Path of the norms file; it must provide the columns ``concept``,
        ``feature alternatives`` and ``participant list``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame described above.
    """
    raw_norms = pd.read_csv(file_path, sep="\t", header="infer")

    records = []
    row_fields = zip(
        raw_norms["concept"],
        raw_norms["feature alternatives"],
        raw_norms["participant list"],
    )
    for concept, alternatives, participants in row_fields:
        records.extend(process_row(concept, alternatives, participants))

    triples = pd.DataFrame.from_records(
        records, columns=["concept", "property", "participant_count"]
    )

    print ("Original CSLB Dataframe size : ", raw_norms.shape)
    print ("Processed Data Before Removing Duplicates :", triples.shape)

    # keep=False flags every member of a duplicated group, not only the repeats.
    is_repeated = triples.duplicated(subset=["concept", "property"], keep=False)
    repeated = triples[is_repeated]
    cleaned = triples[~is_repeated].dropna(axis=0, how="any").reset_index(drop=True)

    print ("Duplicated Data :", repeated.shape)

    print ("Data after After Removing Duplicates :", cleaned.shape)

    return cleaned

# Build the cleaned (concept, property, participant_count) frame from the norms file
# at the relative path `hawk_file_path`.
df_cslb = cslb_processing(file_path=hawk_file_path)

In [None]:
# Plain-text dump of the processed frame for a quick visual check.
print (df_cslb)

In [None]:
# Only the (concept, property) pairs are needed from here on. Rebinding instead of
# dropping in place, together with errors="ignore", keeps this cell safe to re-run:
# a second execution is a no-op rather than a KeyError.
df_cslb = df_cslb.drop(columns="participant_count", errors="ignore")

In [None]:
# Emits a single blank line; this cell has no other effect.
print()

In [None]:
def negative_sampling(df, data_type, num_negative=5, random_state=None):
    """Add randomly sampled negative (concept, property) pairs and save the result.

    For every concept, ``num_negative`` negatives are requested per positive row.
    A negative pairs the concept with a property taken from the rows of the
    *other* concepts; it is accepted only if it is not a positive pair of ``df``
    and has not been generated before. When fewer valid negatives exist than
    requested, all of them are used and a message is printed. Positive rows get
    label 1 and negative rows label 0. The combined frame, with duplicated
    (concept, property) pairs removed (first occurrence kept), is written without
    header or index to ``cslb_train_pos_neg_data.tsv`` when ``data_type`` is
    ``"train"`` or to ``cslb_test_pos_neg_data.tsv`` when it is ``"test"``; any
    other value writes nothing. Progress information is printed throughout.

    Parameters
    ----------
    df : pandas.DataFrame
        Positive pairs; must contain the columns ``concept`` and ``property``.
        The frame is not modified.
    data_type : str
        ``"train"`` or ``"test"``; selects the output file.
    num_negative : int, default 5
        Number of negatives requested per positive row of a concept.
    random_state : int or None, default None
        Seed for the sampling. ``None`` keeps the previous behaviour of drawing
        from NumPy's global random state.

    Returns
    -------
    None
    """
    # Label a copy: the caller's frame (often a slice of a larger frame) stays
    # untouched, so calling the function again on it behaves the same.
    pos_df = df.copy()
    pos_df["label"] = int(1)

    print (pos_df.shape)
    print (pos_df.head())
    print ()

    unique_concepts = pos_df["concept"].unique()

    print (f"Number of Unique Concepts in Dataframe :", len(unique_concepts))

    # Sets give constant-time membership tests for the two rejection rules.
    positive_pairs = set(zip(pos_df["concept"], pos_df["property"]))
    seen_negatives = set()

    # One generator shared by all concepts, so a seed fixes the whole run.
    rng = None if random_state is None else np.random.RandomState(random_state)

    all_negative_data = []

    for concept in unique_concepts:

        concept_data = pos_df[pos_df["concept"] == concept]
        num_record = len(concept_data)

        print()
        print (f"Generating Negative Data for Concept : {concept}")
        print (f"Postive data for concept in DF : {concept_data.shape}")

        total_neg_num = num_record * num_negative

        print (f"Total Number of Negative Records to be generated : {total_neg_num}")

        rest_df = pos_df[pos_df["concept"] != concept]
        print (f"rest_df.shape : {rest_df.shape}")

        concept_name = concept.strip() if isinstance(concept, str) else concept

        # A single pass over a random permutation of the other concepts' rows
        # replaces the former open-ended resampling loop: it always terminates
        # and never asks for more rows than exist.
        candidate_props = rest_df["property"]
        if len(candidate_props) > 0:
            candidate_props = candidate_props.sample(frac=1, random_state=rng)

        concept_neg_data = []
        for neg_prop in candidate_props:
            if len(concept_neg_data) >= total_neg_num:
                break
            pair = (concept_name, neg_prop)
            if pair in positive_pairs or pair in seen_negatives:
                continue
            seen_negatives.add(pair)
            concept_neg_data.append([concept_name, neg_prop, int(0)])

        print (f"neg_data length :", len(concept_neg_data))

        if len(concept_neg_data) < total_neg_num:
            print (f"Only {len(concept_neg_data)} valid negative records are available for this concept")

        all_negative_data.extend(concept_neg_data)

        print (f"Number of negative records generated : {len(concept_neg_data)}")
        print (f"Negative Records")
        print ([record[:2] for record in concept_neg_data])
        print ()

    all_neg_data_df = pd.DataFrame.from_records(all_negative_data, columns=["concept", "property", "label"])

    neg_data_duplicate_records = all_neg_data_df[all_neg_data_df.duplicated(["concept", "property"])]

    print ()
    print (f"all_neg_data_df.shape : {all_neg_data_df.shape}")
    print (f"neg_data_duplicate_records.shape : {neg_data_duplicate_records.shape}")
    print ()

    print (f"Checking overlap between positive and negative data")
    pos_neg_overlap_df = pos_df.merge(all_neg_data_df, how = 'inner', on = ["concept", "property"], indicator=False)
    print(f"Positive and Negative Overlapped Dataframe")
    print (pos_neg_overlap_df)
    print()

    pos_neg_df = pd.concat([pos_df, all_neg_data_df], axis=0, ignore_index=True)

    print ("DF after adding negative data")
    print (pos_neg_df.shape)

    duplicate_records = pos_neg_df[pos_neg_df.duplicated(["concept", "property"])]

    print (f"Duplicate Records : {duplicate_records.shape}")
    print (f"Duplicate record label value count: {duplicate_records['label'].value_counts()}")
    print()

    # Positives come first in the concatenation, so keep="first" lets a positive
    # win over any identical negative pair.
    pos_neg_df = pos_neg_df[~pos_neg_df.duplicated(subset=["concept", "property"], keep="first")]

    print (f"Dataframe after removing duplicates : {pos_neg_df.shape}")

    if data_type == "train":
        pos_neg_df.to_csv("cslb_train_pos_neg_data.tsv", sep='\t', index=False, header=False)
    elif data_type == "test":
        pos_neg_df.to_csv("cslb_test_pos_neg_data.tsv", sep='\t', index=False, header=False)


In [None]:
# Report the (rows, columns) shape of df_cslb at this point in the notebook.
print(df_cslb.shape)

In [None]:
# Split by concept rather than by row: 10% of the concepts are held out, so no
# concept contributes rows to both train and test.
unique_concepts = df_cslb["concept"].unique()

print (f"Unique Concepts : {len(unique_concepts)}")

# A dedicated, seeded generator makes the split identical on every fresh run.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)
test_concepts = split_rng.choice(a=unique_concepts, size = int(0.1 * len(unique_concepts)), replace=False)

print (f"Number of test concepts : {len(test_concepts)}")

is_test_row = df_cslb["concept"].isin(test_concepts)
# .copy() yields independent frames, so later column assignments on a split
# neither warn (SettingWithCopyWarning) nor depend on df_cslb.
test_df = df_cslb[is_test_row].copy()
train_df = df_cslb[~is_test_row].copy()

print ()
print ("Total CSLB DF shape :", df_cslb.shape)
print ("Train DF shape :", train_df.shape, train_df.columns)
print ("Test DF shape :", test_df.shape, test_df.columns)

print ("Checking Train Test DF Merge")
df1 = train_df.merge(test_df, how="inner", on = ["concept"], indicator=False )

print (df1)

# The inner merge on "concept" must be empty; fail loudly if the splits leak.
assert df1.empty, "train and test splits share at least one concept"

In [None]:
# Generate negatives for each split (num_negative=5 per positive row) and write the
# labelled pairs to cslb_train_pos_neg_data.tsv and cslb_test_pos_neg_data.tsv.
negative_sampling(train_df, "train", num_negative=5)
negative_sampling(test_df, "test", num_negative=5)