In [None]:
import torch
import pandas as pd
import numpy as np

from collections import Counter

In [None]:
# Location of the tab-separated MS Concept Graph relations file.
# Earlier cluster location, kept for reference:
#file_path = "/scratch/c.scmag3/conceptEmbeddingModel/data/data-concept-instance-relations.txt"
file_path = "./../../data/ms_concept_graph/data-concept-instance-relations.txt"

# The file is read without a header row, so the three columns are named explicitly.
data_df = pd.read_csv(
    file_path,
    sep="\t",
    header=None,
    names=["property", "concept", "num_relation"],
)

In [None]:
# Announce, then render the freshly loaded frame (the last expression is the cell output).
print("Data After Loading MS Concept Graph")
data_df

In [None]:
# Order the rows from the largest to the smallest `num_relation` value.
data_df = data_df.sort_values(by="num_relation", ascending=False)

In [None]:
# Announce, then render the frame as it looks after the sort.
print("Data After Sorting on Num Relation")
data_df

In [None]:
# Put the columns in (concept, property, num_relation) order and show the result.
data_df = data_df.loc[:, ["concept", "property", "num_relation"]]
data_df

In [None]:
# The original cell was an unfinished assignment (`data_df = ` with no right-hand side),
# which is a SyntaxError and stops "Run All" at this point.
# NOTE(review): the intended transformation is unknown. Renumbering the sorted rows is used
# as a stand-in because no later cell reads the old index values — TODO confirm.
data_df = data_df.reset_index(drop=True)

In [None]:
# Number of leading rows (after the sort on `num_relation`) kept for the experiment.
top_k = 15000

# Negatives generated per positive record by `generate_negative_data`.
# The cells that build the train/valid/test sets pass `num_neg_pair=num_neg_pair`, but the
# name was never assigned in any cell, so they fail with NameError on a fresh kernel.
# 1 mirrors that function's own default — TODO confirm the intended value.
num_neg_pair = 1

In [None]:
# Keep only the first `top_k` rows (positional slice), report, and show them.
data_df = data_df.iloc[:top_k]
print(f'Data Df after taking top {top_k} records \n:')
data_df

In [None]:
# Seed NumPy's global generator so the shuffle below — and with it the train/valid/test
# split and every file written from it — is identical on each "Restart & Run All".
np.random.seed(42)

# Distinct concepts of the retained rows in random order. `permutation` returns a new
# shuffled ndarray instead of shuffling in place, so re-running this cell is idempotent.
unique_concept_list = np.random.permutation(data_df["concept"].unique())
print(unique_concept_list.shape[0])
print(type(unique_concept_list))

In [None]:
# Number of distinct concepts that will be divided between the splits.
num_unique_concepts = unique_concept_list.shape[0]
print("Num Unique Positive Concepts:", num_unique_concepts)

In [None]:
# How many retained rows each concept contributes, most frequent first.
print(data_df["concept"].value_counts())

In [None]:
# Fractions of the unique concepts assigned to the train / validation / test splits.
train_ratio = 0.85
val_ratio = 0.10
test_ratio = 0.05

In [None]:
# Concepts per split. Train and validation sizes are floored from their ratios.
num_train_concepts = int(num_unique_concepts * train_ratio)
num_val_concepts = int(num_unique_concepts * val_ratio)

# The test split is sliced as "everything after train + validation", so its real size is
# the remainder. Flooring `num_unique_concepts * test_ratio` on its own under-reported it
# whenever the three floors did not add up to the total (e.g. 3 concepts -> 2, 0, 0 while
# the test slice actually holds 1 concept).
num_test_concepts = num_unique_concepts - num_train_concepts - num_val_concepts

print(num_train_concepts)
print(num_val_concepts)
print(num_test_concepts)


In [None]:
# Cut the shuffled concept array into three consecutive pieces:
# the first `num_train_concepts`, the next `num_val_concepts`, and whatever is left.
train_concept = unique_concept_list[:num_train_concepts]
valid_concept = unique_concept_list[num_train_concepts:][:num_val_concepts]
test_concept = unique_concept_list[num_train_concepts:][num_val_concepts:]

# Sizes of the three pieces.
print('Number of Positive Train Concept :', len(train_concept))
print('Number of Positive Valid Concept :', len(valid_concept))
print('Number of Positive Test Concept :', len(test_concept))

# Full membership of each piece, comma separated.
print('\nTrain Concept :', *train_concept, sep=", ")
print('\nValid Concept :', *valid_concept, sep=", ")
print('\nTest Concept :', *test_concept, sep=", ")


In [None]:
# Count the concepts that were actually placed in the three splits. Summing the floored
# `num_*_concepts` values could come out lower than this, because the test split takes
# every concept left over after train and validation.
print('Total Number of Positive Concepts :', len(train_concept) + len(valid_concept) + len(test_concept))

In [None]:
print('+' * 50)

print('Asserting Train Concept, Valid Concepts and Test Concpets Do Not Overlap')
# `set(a) != set(b)` only states that the two sets are not identical; it still passes when
# they share members. `isdisjoint` is the check that no concept sits in two splits.
assert set(train_concept).isdisjoint(valid_concept), 'train and valid concepts overlap'
assert set(train_concept).isdisjoint(test_concept), 'train and test concepts overlap'
assert set(valid_concept).isdisjoint(test_concept), 'valid and test concepts overlap'

print('Assertions Passed')

In [None]:
# Route every retained row to the split that owns its concept.
train_df = data_df.loc[data_df["concept"].isin(train_concept)]
valid_df = data_df.loc[data_df["concept"].isin(valid_concept)]
test_df = data_df.loc[data_df["concept"].isin(test_concept)]

# Row counts per split.
print('Number of Train Records :', len(train_df))
print('Number of Valid Records :', len(valid_df))
print('NUmber of Test Records :', len(test_df))

print('+' * 50)

In [None]:
# Inner joins between each pair of splits. No `on=` is given, so pandas joins on every
# column the two frames share; an empty result means the pair has no row in common.
print('\ncheck if train, val and test dataframes have any overlap')

# train vs. test
df = pd.merge(train_df, test_df, how='inner', indicator=False)
print('\n', df)

# train vs. valid
df = pd.merge(train_df, valid_df, how='inner', indicator=False)
print('\n', df)

# test vs. valid
df = pd.merge(test_df, valid_df, how='inner', indicator=False)
print('\n', df)

In [None]:
# The relation count is not part of the exported training pairs.
train_df = train_df.drop(columns="num_relation")

In [None]:
# Save the positive training pairs as a tab-separated file with no index and no header row.
train_df.to_csv(
    "mscg_new_pos_train.tsv",
    sep="\t",
    header=False,
    index=False,
)

In [None]:
# The relation count is not part of the exported validation pairs.
valid_df = valid_df.drop(columns="num_relation")

In [None]:
# Save the positive validation pairs as a tab-separated file with no index and no header row.
valid_df.to_csv(
    "mscg_new_pos_valid.tsv",
    sep="\t",
    header=False,
    index=False,
)

In [None]:
# Remove the relation count from the held-out test rows.
test_df = test_df.drop(columns="num_relation")

In [None]:
def generate_pos_neg_test_data(test_df, num_neg_per_concept=5,
                               output_path="mscg_new_pos_neg_test.tsv",
                               reference_df=None):
    """Build a shuffled, labelled test set of positive and sampled negative pairs.

    Every row of ``test_df`` is kept as a positive (label 1). For each distinct concept,
    up to ``num_neg_per_concept`` properties are drawn without replacement from the
    properties of the *other* concepts and emitted as negatives (label 0). Properties the
    concept already has in ``test_df`` are never drawn, so a negative cannot contradict a
    positive of the same frame.

    Differences from the earlier version of this function:
      * ``test_df`` is left untouched (its index used to be reset in place).
      * A missing ``num_relation`` column is tolerated. The notebook drops that column
        before calling this function, which used to raise ``KeyError``.
      * Having fewer than ``num_neg_per_concept`` candidate properties no longer raises
        ``ValueError``; as many as exist are used.
      * Two ``set_index`` calls whose results were discarded have been removed.

    Args:
        test_df: Frame with at least the columns ``concept`` and ``property``.
        num_neg_per_concept: Maximum number of negatives drawn per concept.
        output_path: Where the result is written as a header-less, index-less TSV.
            Pass ``None`` to skip writing.
        reference_df: Optional frame with a ``concept`` column. When given, its rows
            are printed next to the generated rows for every concept. When omitted, a
            module-level ``train_df`` is used if one exists (the notebook's previous
            behaviour); otherwise the dump is skipped.

    Returns:
        The shuffled frame of positives and negatives, including a ``label`` column.
    """
    # Work on a private, renumbered copy so the caller's frame keeps its index and columns.
    positives = test_df.reset_index(drop=True)
    positives = positives.drop(columns="num_relation", errors="ignore")
    positives["label"] = 1

    print (positives.head())
    print (positives.shape)

    unique_test_concept = positives["concept"].unique()
    print (len(unique_test_concept))

    # Properties each concept is positively paired with; these must not be sampled for it.
    own_properties = {}
    for concept, prop in zip(positives["concept"], positives["property"]):
        own_properties.setdefault(concept, set()).add(prop)

    all_properties = positives["property"].unique()

    negative_rows = []
    for concept in unique_test_concept:
        taken = own_properties.get(concept, set())
        # Any property the concept does not own necessarily comes from another concept.
        candidates = np.array([p for p in all_properties if p not in taken], dtype=object)
        num_draw = min(num_neg_per_concept, len(candidates))
        if num_draw == 0:
            continue
        sampled_property = np.random.choice(candidates, size=num_draw, replace=False)
        negative_rows.extend((concept, prop, 0) for prop in sampled_property)

    negative_data_df = pd.DataFrame(negative_rows, columns=['concept', 'property', 'label'])

    print("negative_data_df")
    print (negative_data_df.shape)

    if negative_data_df.empty:
        all_data = positives
    else:
        all_data = pd.concat([positives, negative_data_df], ignore_index=True)

    print("All Data")
    print (all_data.shape)

    # Shuffle so positives and negatives are interleaved in the written file.
    all_data = all_data.sample(frac=1)

    if output_path is not None:
        all_data.to_csv(output_path, sep="\t", header=None, index=None)

    # Side-by-side dump against a reference frame (the notebook's `train_df` by default).
    if reference_df is None:
        reference_df = globals().get("train_df")
    if reference_df is not None:
        for concept in all_data["concept"].unique():
            print ("Train Data -------------")
            print (reference_df[reference_df["concept"] == concept])
            print ("Test Data-----------")
            print (all_data[all_data["concept"] == concept])
            print ("++++++++++++++")
            print ()

    return all_data
    
# Build the labelled positive/negative test set from the held-out test rows.
# The function also writes the result to a TSV file as a side effect.
test_pos_neg_data = generate_pos_neg_test_data(test_df)

In [None]:
# Inner join on (concept, property): pairs present in both the training rows and the
# labelled test set.
train_test_overlap_df = train_df.merge(
    test_pos_neg_data,
    on=["concept", "property"],
    how="inner",
)

In [None]:
# Show the (concept, property) pairs shared by the training rows and the labelled test set.
train_test_overlap_df

In [None]:
# The same inner join on (concept, property), this time with the labelled test set on the left.
test_train_overlap_df = test_pos_neg_data.merge(
    train_df,
    on=["concept", "property"],
    how="inner",
)

In [None]:
# Show the pairs shared by the labelled test set and the training rows (test set on the left).
test_train_overlap_df

In [None]:
# Inner join on (concept, property): pairs present in both the training and validation rows.
train_valid_overlap_df = train_df.merge(
    valid_df,
    on=["concept", "property"],
    how="inner",
)

In [None]:
# Show the (concept, property) pairs shared by the training and validation rows.
train_valid_overlap_df

In [None]:
# Inner join on (concept, property): pairs present in both the test and validation rows.
test_valid_overlap_df = test_df.merge(
    valid_df,
    on=["concept", "property"],
    how="inner",
)

In [None]:
# Show the (concept, property) pairs shared by the test and validation rows.
test_valid_overlap_df

In [None]:
# Inspect the validation rows.
valid_df

In [None]:
# Inspect the test rows.
test_df

In [None]:
# Shell escape: print the kernel's working directory, which is what the relative
# file names passed to `to_csv` in this notebook resolve against.
!pwd

In [None]:
# Distinct concepts present in the training rows, in order of first appearance.
train_df["concept"].unique()

In [None]:
# The same distinct training concepts, sorted.
np.sort(train_df["concept"].unique())

In [None]:
# Distinct concepts present in the test rows, sorted.
np.sort(test_df["concept"].unique())

In [None]:
# Column dtypes of the training frame.
train_df.dtypes

In [None]:
def generate_negative_data(data_df, num_neg_pair=1, verbose=True):
    """Add randomly corrupted negative pairs to a frame of positive pairs.

    For every concept, each of its positive rows is randomly assigned to one of two
    corruption modes:

    * ``neg_concept``  - a property of the concept is paired with the concept of a row
      sampled from the other concepts' rows;
    * ``neg_property`` - the concept is paired with the property of a row sampled from
      the other concepts' rows.

    ``num_neg_pair`` rows are sampled per positive row. Positives carry label 1 and
    negatives label 0.

    Differences from the earlier version of this function:
      * ``data_df`` is left untouched (its index used to be reset in place).
      * Positives receive ``label = 1`` when the frame has no ``label`` column; before,
        their label ended up as NaN after the concatenation with the negatives.
      * When more rows are requested than the other concepts provide, sampling falls
        back to sampling with replacement instead of raising ``ValueError``; a concept
        with no other rows to sample from simply gets no negatives.
      * A generated pair that already exists as a positive is discarded, so a pair can
        never appear with both labels.

    Args:
        data_df: Frame with at least the columns ``concept`` and ``property``.
        num_neg_pair: Number of rows sampled per positive row.
        verbose: Print the per-concept trace (on by default, as before).

    Returns:
        Shuffled, de-duplicated frame of positives and negatives with a ``label`` column.
    """
    # Private, renumbered copy: the caller's frame keeps its index and columns.
    positives = data_df.reset_index(drop=True)
    if "label" not in positives.columns:
        positives["label"] = 1

    # Every known-true pair; used to reject negatives that would contradict a positive.
    known_pairs = set(zip(positives["concept"], positives["property"]))

    negative_rows = []

    for concept in positives["concept"].unique():

        belongs = positives["concept"] == concept
        positive_data = positives[belongs]
        rest_df = positives[~belongs]
        num_positive = len(positive_data)

        # Nothing to corrupt, or nothing to corrupt it with.
        if num_positive == 0 or rest_df.empty:
            continue

        # One coin flip per positive row decides how that row is corrupted.
        corruption = Counter(np.random.choice(['neg_concept', 'neg_property'], size=num_positive))
        num_neg_concept = corruption.get('neg_concept', 0)
        num_neg_property = corruption.get('neg_property', 0)

        if verbose:
            print ('\n')
            print ('--' * 50 )
            print ('Concept :', concept)
            print ('len(positive_data):', num_positive)
            print ('num_neg_concept :',  num_neg_concept)
            print ('num_neg_property :', num_neg_property)
            print ('postitive_data_index :', positive_data.index.to_numpy())
            print ('\npositive_data:')
            print (positive_data)

        neg_concept_data = []
        if num_neg_concept:
            wanted = num_neg_pair * num_neg_concept
            # Positive rows (drawn with replacement) whose property is kept ...
            anchor_idx = np.random.choice(positive_data.index.to_numpy(), size=num_neg_concept)
            anchor_props = positive_data.loc[anchor_idx, 'property'].to_numpy()
            # ... and the foreign concepts they are paired with, `num_neg_pair` per anchor.
            neg_concept = rest_df['concept'].sample(n=wanted, replace=wanted > len(rest_df)).to_numpy()
            neg_concept_data = [
                (con, prop, 0)
                for con, prop in zip(neg_concept, np.repeat(anchor_props, num_neg_pair))
            ]
            if verbose:
                print ('\nneg_concepts')
                print (neg_concept)

        if verbose:
            print ('Negative concept for true property')
            print (neg_concept_data)

        neg_property_data = []
        if num_neg_property:
            wanted = num_neg_pair * num_neg_property
            neg_property = rest_df['property'].sample(n=wanted, replace=wanted > len(rest_df)).to_list()
            neg_property_data = [(concept, neg_prop, 0) for neg_prop in neg_property]
            if verbose:
                print ('\nneg_property :', neg_property)

        if verbose:
            print ('Negative Property data for true Concept :')
            print (neg_property_data)

        # Drop any generated pair that is in fact a known positive.
        negative_data_for_concept = [
            row for row in neg_concept_data + neg_property_data
            if (row[0], row[1]) not in known_pairs
        ]

        if verbose:
            print ('\nNumber of Negative Data For Concept :', len(negative_data_for_concept))
            print ('negative_data_for_concept:')
            print (negative_data_for_concept)

        negative_rows.extend(negative_data_for_concept)

    negative_data_df = pd.DataFrame(negative_rows, columns=['concept', 'property', 'label'])

    if negative_data_df.empty:
        all_data = positives
    else:
        all_data = pd.concat([positives, negative_data_df], ignore_index=True)

    # Shuffle, then remove repeated rows (the same negative can be drawn more than once).
    all_data = all_data.sample(frac=1).drop_duplicates()

    if verbose:
        print ('\nFinished Generating Data For One Set')
        print ('len(all_data) :', len(all_data))

    return all_data


In [None]:
# Banner marking the start of negative-sample generation for the training split.
print('#' * 50, 'Generating Negative Train Data', sep='\n')

In [None]:
# Sub-heading for the training split.
print("\n Training Data")

In [None]:
# Add negatives to the training positives, then save the combined set as a
# tab-separated file with no index and no header row.
pos_neg_train_df = generate_negative_data(train_df, num_neg_pair=num_neg_pair)
pos_neg_train_df.to_csv(
    'data/65k_train_ms_concept_graph.tsv',
    sep='\t',
    header=False,
    index=False,
)

In [None]:
# Training row counts before and after the negatives were added.
print()
print('#' * 5)
print('Train Record Before Negative Data:', len(train_df))
print('Train Record After Negative Data:', len(pos_neg_train_df))

In [None]:
# Banner for the validation split.
print('#' * 50, 'Generating Negative Valid Data', '\n Validation Data', sep='\n')

# Add negatives to the validation positives, then save the combined set as a
# tab-separated file with no index and no header row.
pos_neg_val_df = generate_negative_data(valid_df, num_neg_pair=num_neg_pair)
pos_neg_val_df.to_csv(
    'data/65k_valid_ms_concept_graph.tsv',
    sep='\t',
    header=False,
    index=False,
)

In [None]:
# Validation row counts before and after the negatives were added.
print()
print('#' * 5)
print('Validation Record Before Negative Data:', len(valid_df))
print('Validation Record After Negative Data:', len(pos_neg_val_df))

In [None]:
# Banner for the test split.
print('#' * 50, 'Generating Negative Test Data', sep='\n')

print('\n Test Data')

# Add negatives to the test positives, then save the combined set as a
# tab-separated file with no index and no header row.
pos_neg_test_df = generate_negative_data(test_df, num_neg_pair=num_neg_pair)
pos_neg_test_df.to_csv(
    'data/65k_test_ms_concept_graph.tsv',
    sep='\t',
    header=False,
    index=False,
)

In [None]:
# Test row counts before and after the negatives were added.
print()
print('#' * 5)
print('Test Record Before Negative Data:', len(test_df))
print('Test Record After Negative Data:', len(pos_neg_test_df))