In [1]:
import torch
import pandas as pd
import numpy as np

from collections import Counter

In [2]:
#file_path = "/scratch/c.scmag3/conceptEmbeddingModel/data/data-concept-instance-relations.txt" 
file_path = "./../../data/ms_concept_graph/data-concept-instance-relations.txt"
data_df = pd.read_csv(file_path, header=None, names=["property", "concept", "num_relation"], sep="\t")

In [3]:
print ('Data After Loading MS Concept Graph')
data_df

Data After Loading MS Concept Graph


Unnamed: 0,property,concept,num_relation
0,factor,age,35167
1,free rich company datum,size,33222
2,free rich company datum,revenue,33185
3,state,california,18062
4,supplement,msm glucosamine sulfate,15942
...,...,...,...
33377315,popular legacy datum structure,binary search tree,1
33377316,norwegian food,lefse,1
33377317,freeze skill,new fot,1
33377318,enhanced enforcement initiative,monthly impact inspections of problem mine,1


In [4]:
data_df.sort_values('num_relation', ascending=False, inplace=True)

In [5]:
print ('Data After Sorting on Num Relation')
data_df

Data After Sorting on Num Relation


Unnamed: 0,property,concept,num_relation
0,factor,age,35167
1,free rich company datum,size,33222
2,free rich company datum,revenue,33185
3,state,california,18062
4,supplement,msm glucosamine sulfate,15942
...,...,...,...
18841597,permanent control measure,straw wattle,1
18841596,permanent control measure,check dam,1
18841595,permanent control measure,vegetative buffer,1
18841594,permanent control measure,ditch lining,1


In [6]:
data_df = data_df[['concept', 'property', 'num_relation']]
data_df

Unnamed: 0,concept,property,num_relation
0,age,factor,35167
1,size,free rich company datum,33222
2,revenue,free rich company datum,33185
3,california,state,18062
4,msm glucosamine sulfate,supplement,15942
...,...,...,...
18841597,straw wattle,permanent control measure,1
18841596,check dam,permanent control measure,1
18841595,vegetative buffer,permanent control measure,1
18841594,ditch lining,permanent control measure,1


In [7]:
top_k = 15000

In [8]:
data_df = data_df[0:top_k]
print (f'Data Df after taking top {top_k} records \n:')
data_df

Data Df after taking top 15000 records 
:


Unnamed: 0,concept,property,num_relation
0,age,factor,35167
1,size,free rich company datum,33222
2,revenue,free rich company datum,33185
3,california,state,18062
4,msm glucosamine sulfate,supplement,15942
...,...,...,...
15045,dentist,health professional,191
15044,temperature,meteorological variable,191
15043,toluene,volatile organic compound,191
15042,football,competitive sport,191


In [9]:
unique_concept_list = data_df["concept"].unique()
# np.random.shuffle(unique_concept_list)
print (unique_concept_list.shape[0])
print (type(unique_concept_list))

6006
<class 'numpy.ndarray'>


In [10]:
num_unique_concepts = len(unique_concept_list)
print ('Num Unique Positive Concepts:', num_unique_concepts)

Num Unique Positive Concepts: 6006


In [11]:
print (data_df["concept"].value_counts())

age                             71
water                           52
temperature                     52
diabetes                        45
education                       44
                                ..
generalized anxiety disorder     1
exxonmobil                       1
genetic background               1
family income                    1
tree removal                     1
Name: concept, Length: 6006, dtype: int64


In [12]:
train_ratio, val_ratio, test_ratio = 0.85, 0.10, 0.05

In [13]:
num_train_concepts = int(num_unique_concepts * train_ratio)
num_val_concepts = int(num_unique_concepts * val_ratio)
num_test_concepts = int(num_unique_concepts * test_ratio)

print (num_train_concepts)
print (num_val_concepts)
print (num_test_concepts)


5105
600
300


In [14]:
train_concept = unique_concept_list[0:num_train_concepts]
valid_concept = unique_concept_list[num_train_concepts:num_train_concepts + num_val_concepts]
test_concept = unique_concept_list[num_train_concepts + num_val_concepts:]

print ('Number of Positive Train Concept :', len(train_concept))
print ('Number of Positive Valid Concept :', len(valid_concept))
print ('Number of Positive Test Concept :', len(test_concept))


Number of Positive Train Concept : 5105
Number of Positive Valid Concept : 600
Number of Positive Test Concept : 301


print ('\nTrain Concept :', *train_concept, sep=", ")
print ('\nValid Concept :', *valid_concept, sep=", ")
print ('\nTest Concept :', *test_concept, sep=", ")


In [15]:
print ('Total Number of Positive Concepts :', num_train_concepts + num_val_concepts + num_test_concepts)

Total Number of Positive Concepts : 6005


In [16]:
print ('+' * 50)

print ('Asserting Train Concept, Valid Concepts and Test Concpets Do Not Overlap')
assert set (train_concept) != set (valid_concept)
assert set (train_concept) != set (test_concept)
assert set (valid_concept) != set (test_concept)

print ('Assertions Passed')

++++++++++++++++++++++++++++++++++++++++++++++++++
Asserting Train Concept, Valid Concepts and Test Concpets Do Not Overlap
Assertions Passed


In [17]:
train_df = data_df[data_df['concept'].isin(train_concept)]
valid_df = data_df[data_df['concept'].isin(valid_concept)]
test_df = data_df[data_df['concept'].isin(test_concept)]

print ('Number of Train Records :', len(train_df))
print ('Number of Valid Records :', len(valid_df))
print ('NUmber of Test Records :', len(test_df))

print ('+' * 50)

Number of Train Records : 14032
Number of Valid Records : 661
NUmber of Test Records : 307
++++++++++++++++++++++++++++++++++++++++++++++++++


In [18]:
print ('\ncheck if train, val and test dataframes have any overlap')
df = train_df.merge(test_df, how = 'inner' ,indicator=False)
print('\n', df)

df = train_df.merge(valid_df, how = 'inner' ,indicator=False)
print('\n', df)

df = test_df.merge(valid_df, how = 'inner', indicator=False)
print('\n', df)


check if train, val and test dataframes have any overlap

 Empty DataFrame
Columns: [concept, property, num_relation]
Index: []

 Empty DataFrame
Columns: [concept, property, num_relation]
Index: []

 Empty DataFrame
Columns: [concept, property, num_relation]
Index: []


In [19]:
train_df = train_df.drop("num_relation", axis=1)

In [20]:
train_df.to_csv("mscg_new_pos_train.tsv", sep="\t", index=None, header=None)

In [21]:
valid_df = valid_df.drop("num_relation", axis=1)

In [22]:
valid_df.to_csv("mscg_new_pos_valid.tsv", sep="\t", index=None, header=None)

In [23]:
def generate_pos_neg_test_data(test_df):
    
    # print (test_df.head())
    
    test_df.reset_index(drop=True, inplace=True)
    test_df = test_df.drop("num_relation", axis=1)
    test_df["label"] = 1
    
    test_df.set_index("concept", drop=False)
    
    print (test_df.head())
    print (test_df.shape)
    
    unique_test_concept = test_df["concept"].unique()
    
    print (len(unique_test_concept))
    
    negative_data = []
    for concept in unique_test_concept:
        
        temp_df = test_df[test_df["concept"] != concept]
        
        # print ("temp_df")
        # print (temp_df)
        
        sampled_property = np.random.choice(temp_df["property"].unique(), size=5, replace=False)
        
        # print ("sampled_property")
        # print (sampled_property)
        
        negative_data.append([(concept, prop, 0) for prop in sampled_property])
    
    
    # print ("negative_data")
    # print (len(negative_data))
    # print (negative_data)
    
    negative_data_df = pd.concat([pd.DataFrame(data, columns=['concept', 'property', 'label']) \
                                  for data in negative_data], ignore_index=True)

    print("negative_data_df")
    print (negative_data_df.shape)
    
    all_data = pd.concat([test_df, negative_data_df], ignore_index=True)
    
    print("All Data")
    print (all_data.shape)
    
    all_data= all_data.sample(frac=1)
    
    all_data.to_csv("mscg_new_pos_neg_test.tsv", sep="\t", header=None, index=None)
    
    all_data.set_index("concept", drop=False)
    
    for concept in all_data["concept"].unique():
        print (all_data[all_data["concept"] == concept])
        print ()

    
generate_pos_neg_test_data(test_df)

       concept property  label
0     goldfish     fish      1
1        armor     item      1
2  forgiveness    topic      1
3        sound    topic      1
4   scheduling    issue      1
(307, 3)
301
negative_data_df
(1505, 3)
All Data
(1812, 3)
      concept             property  label
132  palm oil        vegetable oil      1
967  palm oil  antioxidizing agent      0
971  palm oil          alkyl group      0
970  palm oil       funding source      0
969  palm oil              journal      0
968  palm oil              service      0

        concept            property  label
1325  dictation           pollutant      0
1323  dictation             keyword      0
203   dictation  musicianship skill      1
1322  dictation        light source      0
1326  dictation         soy product      0
1324  dictation       blood thinner      0

     concept          property  label
1402  baboon              word      0
1405  baboon  optical property      0
219   baboon            animal      1
1404  

In [None]:
train_df.dtypes

In [None]:
def generate_negative_data(data_df, num_neg_pair=1):
    
    data_df.reset_index(drop=True, inplace=True)
    
    #print ('data_df.index :',data_df.index)
        
    negative_data = []
    
    concept_list = data_df['concept'].unique()

    for concept in concept_list:
        
        negative_data_for_concept = []

        positive_data = data_df[data_df["concept"] == concept]
        #positive_data_index = data_df[data_df["concept"] == concept].index
        positive_data_index = np.array(positive_data.index)
        num_positive_data = len(positive_data_index)

        rest_df = data_df.drop(positive_data_index)
        rest_df.reset_index(drop=True, inplace=True)
        rest_df_idx = rest_df.index
        
        pos_or_negative = np.random.choice(['neg_concept', 'neg_property'], size=len(positive_data_index))
        
        num_neg_concept = Counter(pos_or_negative).get('neg_concept')
        num_neg_property = Counter(pos_or_negative).get('neg_property')
                
        print ('\n')
        print ('--' * 50 )
        print ('Concept :', concept)
        print ('len(positive_data):', len(positive_data))
        print ('num_neg_concept :',  num_neg_concept)
        print ('num_neg_property :', num_neg_property)
        print ('postitive_data_index :', positive_data_index)
        print ('\npositive_data:')
        print (positive_data)
        
        neg_concept_data = []
        neg_property_data = []
        
        
        # indexes of the records for which concept will be corrupted.
        if num_neg_concept is not None:
            
            neg_concept_record_idx = np.random.choice(positive_data_index, size=num_neg_concept)
            neg_concept = rest_df.sample(n = num_neg_pair * num_neg_concept)['concept'].to_numpy()
            neg_concept_split = np.split(neg_concept, len(neg_concept_record_idx))
            
            print ('\nneg_concepts')
            print (neg_concept)
            
            for array, idx in zip(neg_concept_split, neg_concept_record_idx):
                neg_concept_data.extend([(con, positive_data.loc[idx]['property'], 0) for con in array])                
        else:
            print ('num_neg_concept in None')
            neg_concept_record_idx = []
        
        print ('Negative concept for true property')
        print (neg_concept_data)

                
        if num_neg_property is not None:
            
            # indexes of the records for which property will be corrupted.
            #neg_property_record_idx = np.array([idx for idx in positive_data_index if idx not in neg_concept_record_idx])
            
            neg_property = rest_df.sample(n=num_neg_pair * num_neg_property)['property'].to_list()
            print ('\nneg_property :', neg_property)
            
            neg_property_data.extend([(concept, neg_prop, 0) for neg_prop in neg_property])
            
        print ('Negative Property data for true Concept :')
        print (neg_property_data)
            
        negative_data_for_concept.extend(neg_concept_data)
        negative_data_for_concept.extend(neg_property_data)
        
        print ('\nNumber of Negative Data For Concept :', len(negative_data_for_concept))
        print ('negative_data_for_concept:')
        print (negative_data_for_concept)
        
        negative_data.append(negative_data_for_concept)
        

    negative_data_df = pd.concat([pd.DataFrame(data, columns=['concept', 'property', 'label']) \
                                  for data in negative_data], ignore_index=True)
    
    
    all_data = pd.concat([data_df, negative_data_df], ignore_index=True)
    
    all_data = all_data.sample(frac=1)
    all_data.drop_duplicates(inplace=True)
    
    print ('\nFinished Generating Data For One Set')
    print ('len(all_data) :', len(all_data))
    
    return all_data


In [None]:
print ('#' * 50)
print ('Generating Negative Train Data')

In [None]:
print ('\n Training Data')

In [None]:
pos_neg_train_df = generate_negative_data(train_df, num_neg_pair=num_neg_pair)
pos_neg_train_df.to_csv('data/65k_train_ms_concept_graph.tsv', sep='\t', index=None, header=None)

In [None]:
print ()
print ('#' * 5)
print ('Train Record Before Negative Data:', len(train_df))
print ('Train Record After Negative Data:', len(pos_neg_train_df))

In [None]:
print ('#' * 50)
print ('Generating Negative Valid Data')
print ('\n Validation Data')
pos_neg_val_df = generate_negative_data(valid_df, num_neg_pair=num_neg_pair)
pos_neg_val_df.to_csv('data/65k_valid_ms_concept_graph.tsv', sep='\t', index=None, header=None)

In [None]:
print ()
print ('#' * 5)
print ('Validation Record Before Negative Data:', len(valid_df))
print ('Validation Record After Negative Data:', len(pos_neg_val_df))

In [None]:
print ('#' * 50)
print ('Generating Negative Test Data')

print ('\n Test Data')
pos_neg_test_df = generate_negative_data(test_df, num_neg_pair=num_neg_pair)
pos_neg_test_df.to_csv('data/65k_test_ms_concept_graph.tsv', sep='\t', index=None, header=None)

In [None]:
print ()
print ('#' * 5)
print ('Test Record Before Negative Data:', len(test_df))
print ('Test Record After Negative Data:', len(pos_neg_test_df))