In [3]:
import json
import math
import pandas as pd
import numpy as np
from itertools import combinations
from nltk.corpus import wordnet
from IPython.display import display

In [4]:
# Preload every table's schema and category list.
# final_schema.txt is read as one JSON object per line; each object must carry
# the keys "filename", "schema" and "categories". Both dictionaries below are
# keyed by that filename.
all_schemas={}
all_categories={}
with open("final_schema.txt") as ip_file:
    for line in ip_file.readlines():
        json_obj=json.loads(line)
        all_schemas[json_obj["filename"]]=json_obj["schema"]
        all_categories[json_obj["filename"]]=json_obj["categories"]

# Preload the candidate keys (a single JSON document, indexed by filename elsewhere in this notebook).
with open("Candidate_key_dict.txt",'r') as ip_file:
    cand_key=json.load(ip_file)

# Preload the pairwise column and category similarity values of the tables.
# The JSON document must contain "column_similarity" and "category_similarity".
with open("cos_similarity.txt",'r') as ip_file:
    json_object=json.load(ip_file)
col_sim = json_object["column_similarity"]
cat_sim = json_object["category_similarity"]

# Names of the supported transformation functions and their ontology
# (each name mapped to a list of related words).
# NOTE(review): generate_list_ontology is defined in a later cell (In [5]), so on a
# fresh kernel this cell raises NameError unless that cell has been run first.
transform_funct_list=['average','sum','maximum','minimum','range','median','variance','standard deviation','mode','frequency']
tf_onto=generate_list_ontology(transform_funct_list)
# "avg" is added by hand as an extra alias for "average".
tf_onto["average"].append("avg")

In [5]:
# all functions needed to generate ontologies
def get_synonyms(word):
    """Return words related to ``word`` according to WordNet noun synsets.

    The result contains every lemma name of each noun synset of ``word``,
    the first lemma name of each direct hypernym of those synsets, and
    ``word`` itself. It is returned as a list built from a set, so it has
    no duplicates and no guaranteed order.
    """
    noun_synsets = wordnet.synsets(word, pos=wordnet.NOUN)
    # Lemma names first, then hypernym heads, mirroring the two lookup passes.
    related = {lemma.name() for synset in noun_synsets for lemma in synset.lemmas()}
    related.update(
        hypernym.lemma_names()[0]
        for synset in noun_synsets
        for hypernym in synset.hypernyms()
    )
    related.add(word)
    return list(related)

# takes input list and returns ontology as dictionary with every word in list as the key
def generate_list_ontology(list1):
    """Map every word of ``list1`` to the list returned by ``get_synonyms``.

    Returns a dictionary keyed by the words of ``list1``; a word that
    appears more than once simply ends up with a single entry.
    """
    return {term: get_synonyms(term) for term in list1}

# takes the input as a schema and returns ontology for every column in the schema
def generate_schema_ontology(input_schema):
    """Build an ontology for the columns of ``input_schema``.

    ``input_schema`` is iterated directly (for a dict this yields its
    column names); each name is mapped to ``get_synonyms(name)``.
    """
    return {column: get_synonyms(column) for column in input_schema}

In [7]:
# to generate cos similarity between two lists
def cos_sim(list1, list2):
    """Cosine similarity between two collections of terms, treated as sets.

    Each collection is reduced to its distinct terms and interpreted as a
    binary vector, so the similarity is
    ``|A & B| / sqrt(|A| * |B|)``.

    The previous version counted the shared terms on distinct values but
    took the norms from the raw list lengths, so a repeated term (for
    example the same input column matched by two table columns) lowered
    the score even though no term was missing.

    Returns 0.0 when either collection is empty.
    """
    terms1 = set(list1)
    terms2 = set(list2)
    if not terms1 or not terms2:
        return 0.0
    shared = len(terms1 & terms2)
    # A single sqrt of the product keeps identical sets at exactly 1.0.
    return shared / math.sqrt(len(terms1) * len(terms2))

In [44]:
# Both functions return all possible tables with one or more columns that match the input schema (or its ontology).

# Used when the input has only a schema (columns and their datatypes).
def col_only_list(input_schema,input_sch_onto):
    """Find tables that share at least one column with the input schema.

    For every table in ``all_schemas`` each column is compared with the
    input: an exact name match with the same datatype records the column
    name; otherwise every input column whose ontology contains the table
    column, and whose datatype agrees, is recorded.

    Returns a dict mapping filename to the list of matched input columns
    (tables without any match are absent; the list may contain repeats).
    """
    candidates = {}
    for fname, table_schema in all_schemas.items():
        for column, dtype in table_schema.items():
            if column in input_schema and input_schema[column] == dtype:
                candidates.setdefault(fname, []).append(column)
                continue
            # No direct hit: fall back to the synonyms of each input column.
            for wanted, synonyms in input_sch_onto.items():
                if column in synonyms and input_schema[wanted] == dtype:
                    candidates.setdefault(fname, []).append(wanted)
    return candidates

# Used when the input has categories as well as a schema (columns and their datatypes).
# A table is considered only if the cosine similarity between its matched categories and the input categories exceeds 0.75.
def cat_and_col_list(input_categories,input_cat_onto,input_schema,input_sch_onto):
    """Find tables matching the input on categories first, then on columns.

    For each table in ``all_categories`` the categories that match the
    input (directly, or through ``input_cat_onto``) are collected. Only
    when ``cos_sim`` of that list against ``input_categories`` is above
    0.75 are the table's columns compared with the input schema, using
    the same exact-name / ontology rule with matching datatype.

    Returns a dict mapping filename to the list of matched input columns.
    """
    candidates = {}
    for fname, table_categories in all_categories.items():
        matched_categories = []
        for category in table_categories:
            if category in input_categories:
                matched_categories.append(category)
                continue
            for wanted_cat, cat_synonyms in input_cat_onto.items():
                if category in cat_synonyms:
                    matched_categories.append(wanted_cat)
        score = cos_sim(matched_categories, input_categories)
        if score > 0.75:
            for column, dtype in all_schemas[fname].items():
                if column in input_schema and input_schema[column] == dtype:
                    candidates.setdefault(fname, []).append(column)
                    continue
                # No direct hit: fall back to the synonyms of each input column.
                for wanted, synonyms in input_sch_onto.items():
                    if column in synonyms and input_schema[wanted] == dtype:
                        candidates.setdefault(fname, []).append(wanted)
    return candidates

In [10]:
# function to merge two tables
# it generates all matching columns between the two input filenames and calls merge_tables to merge the tables
def merge(fname1,fname2):
    """Pair up the columns of two tables and merge them on candidate keys.

    A column of ``fname2`` is paired with a column of ``fname1`` when the
    names are equal, or when the ``fname2`` name appears in the ontology
    of an ``fname1`` column, and the datatypes agree. Only pairs in which
    the ``fname1`` column is a candidate key of ``fname1`` or the
    ``fname2`` column is a candidate key of ``fname2`` are kept; these are
    handed to ``merge_tables`` as ``{fname1 column: fname2 column}``.

    Returns whatever ``merge_tables`` returns.
    """
    schema1 = all_schemas[fname1]
    schema2 = all_schemas[fname2]
    # Only the first table's columns need an ontology.
    onto1 = generate_list_ontology(list(schema1))
    paired = {}
    for col2, dtype2 in schema2.items():
        if col2 in schema1 and dtype2 == schema1[col2]:
            paired[col2] = col2
            continue
        for col1, synonyms in onto1.items():
            if col2 in synonyms and dtype2 == schema1[col1]:
                paired[col1] = col2
                break
    keys1 = cand_key[fname1]
    keys2 = cand_key[fname2]
    key_pairs = {
        col1: col2
        for col1, col2 in paired.items()
        if col1 in keys1 or col2 in keys2
    }
    return merge_tables(fname1, fname2, key_pairs)

# it is called by merge and does the actual merging
def merge_tables(fname1,fname2,cols):
    """Read two CSV files, align column names and outer-merge them.

    ``cols`` maps a column name of ``fname1`` to the matching column name
    of ``fname2``; the second table's columns are renamed accordingly,
    one pair at a time, before the merge. Both inputs and the result are
    shown with ``display`` and labelled with ``print``.

    Returns the merged DataFrame.
    """
    left = pd.read_csv(fname1)
    right = pd.read_csv(fname2)
    for label, frame in ((fname1, left), (fname2, right)):
        print(label+' :')
        display(frame)
    print(fname1+' and '+fname2+' gives : ')
    # Renames are applied sequentially, in dictionary order.
    for left_name, right_name in cols.items():
        right = right.rename(columns={right_name: left_name})
    merged = left.merge(right, how='outer')
    display(merged)
    return merged

In [27]:
# function generates all possible combinations of list l taking elements n to 2 at a time and returns a dictionary
def generate_all_combinations(l):
    """Enumerate the combinations of ``l`` for every size from ``len(l)`` down to 2.

    Returns a dict keyed by combination size (largest first) whose values
    are lists of tuples. Inputs with fewer than two elements give ``{}``.
    """
    return {size: list(combinations(l, size)) for size in range(len(l), 1, -1)}

# function generates all matching columns between the res_cols schema and columns of table in fname
def generate_matching_columns(res_cols,fname):
    """Work out how to rename the columns of ``fname`` to line up with ``res_cols``.

    ``res_cols`` maps column name to datatype. A column of the table
    stored under ``fname`` in ``all_schemas`` matches when its name is in
    ``res_cols`` with the same datatype, or when its name appears in the
    ontology of a ``res_cols`` column whose datatype agrees (first such
    column wins).

    Returns ``{column of fname: column of res_cols}``.
    """
    table_schema = all_schemas[fname]
    res_onto = generate_schema_ontology(res_cols)
    renames = {}
    for column, dtype in table_schema.items():
        if column in res_cols and dtype == res_cols[column]:
            renames[column] = column
            continue
        for target, synonyms in res_onto.items():
            if column in synonyms and dtype == res_cols[target]:
                renames[column] = target
                break
    return renames

# returns a merged table of all tables given in input list l
def merge_list(l):
    """Outer-merge all CSV files named in ``l`` into one DataFrame.

    The first two files are merged after the second file's columns are
    renamed to line up with the first file's schema from ``all_schemas``.
    Every further file is renamed against the columns and dtype names of
    the running result and merged in turn.

    ``l`` must hold at least two filenames. Returns the merged DataFrame.
    """
    first = pd.read_csv(l[0])
    second = pd.read_csv(l[1])
    # Matching the second table against the first table's stored schema is
    # the same rule generate_matching_columns applies for later tables.
    renames = generate_matching_columns(all_schemas[l[0]], l[1])
    res = first.merge(second.rename(columns=renames), how='outer')
    for fname in l[2:]:
        # Describe the running result as {column: dtype name}.
        res_cols = {col: str(dtype) for col, dtype in zip(res.columns, res.dtypes)}
        renames = generate_matching_columns(res_cols, fname)
        res = res.merge(pd.read_csv(fname).rename(columns=renames), how='outer')
    return res

In [11]:
# gives the NaN score (number of NaNs / number of rows in the table) for each column in the input table
def nan_score_table(table):
    """Report missing values per column of ``table`` as ``"missing/rows"`` strings.

    ``missing`` is the row count minus the column's non-null count.
    Returns a dict keyed by column name.
    """
    total_rows = len(table)
    return {
        column: str(total_rows - table[column].count()) + '/' + str(total_rows)
        for column in table.columns
    }

# gives the NaN score (number of NaNs / number of rows in the table) for each column of the CSV file named by fname
def nan_score_fname(fname):
    """Report missing values per column of the CSV file ``fname``.

    The file is loaded with ``pd.read_csv``; each column maps to a
    ``"missing/rows"`` string, where ``missing`` is the row count minus
    the column's non-null count.
    """
    table = pd.read_csv(fname)
    n_rows = len(table)
    scores = {}
    for column in table.columns:
        missing = n_rows - table[column].count()
        scores[column] = '/'.join((str(missing), str(n_rows)))
    return scores

In [68]:
# this is a function to print the individual tables, their nan score as well as match column with input schema
def display_individual_matches(matching_tables,matching_tables_dict,transform_cols={}):
    """Append a report for each matching table to the results file.

    For every filename in ``matching_tables`` the report lists the
    filename, the transformations from ``transform_cols`` whose column is
    present in that CSV (only when ``transform_cols`` is non-empty), the
    per-column missing-value score, and the matched input columns taken
    from ``matching_tables_dict``.

    ``transform_cols`` maps column name to transformation name and is
    only read, never modified.
    """
    has_transforms = len(transform_cols) != 0
    with open("output_folder_transformation/results.txt",'a') as f:
        for fname in matching_tables:
            print(fname,file=f)
            if has_transforms:
                table_columns = pd.read_csv(fname).columns
                applicable = {
                    col: tran
                    for col, tran in transform_cols.items()
                    if col in table_columns
                }
                print("possible transformations are : ",applicable,file=f)
            print('Missing Values(NANs score): ',nan_score_fname(fname),file=f)
            print("Columns that match with input_schema:\n "+fname+' : ',matching_tables_dict[fname],file=f)
            print(file=f)

In [14]:
def check_possible_matches():
    """Load the input request and return the tables that match it closely.

    Reads ``input_tranformations.txt`` (JSON with a ``"schema"`` entry and
    an optional ``"categories"`` entry), collects candidate tables with
    ``cat_and_col_list`` or ``col_only_list``, and keeps those whose
    matched columns have a ``cos_sim`` above 0.75 against the input
    column names.

    Returns a dict mapping filename to its list of matched columns.
    """
    with open("input_tranformations.txt",'r') as ip_file:
        request = json.load(ip_file)
    input_schema = request["schema"]
    input_sch_onto = generate_schema_ontology(input_schema)
    if "categories" not in request:
        print('only schema')
        possible_tables = col_only_list(input_schema, input_sch_onto)
    else:
        print('category and schema')
        input_categories = request["categories"]
        input_cat_onto = generate_list_ontology(input_categories)
        possible_tables = cat_and_col_list(input_categories, input_cat_onto, input_schema, input_sch_onto)
    return {
        fname: matched
        for fname, matched in possible_tables.items()
        if cos_sim(matched, list(input_schema)) > 0.75
    }

In [60]:
# the main function to be called to generate all direct matches with the schema present in input_tranformations.txt
# this calls all our other functions and generates all possible direct matches(including by merging tables) with the input schema
# direct matches imply transformations are not considered
def get_matches():
    """Write every direct (non-transformation) match for the input schema.

    The matching tables come from ``check_possible_matches``. Depending on
    how many there are:

    * none - "NO MATCHES FOUND" is written to the results file;
    * one - the table is reported on its own;
    * two - the tables are merged with ``merge`` when their similarity
      scores allow it, and both are reported on their own either way;
    * more - every combination (largest first) whose pairs are all similar
      enough is merged with ``merge_list``, saved as ``<n>.csv`` and
      described in the results file, then each table is reported on its
      own.

    All text output is appended to
    ``output_folder_transformation/results.txt``; merged tables are saved
    in the same folder.

    Fixes over the previous version:

    * the two-table case no longer raises ``KeyError`` when the pair is
      stored in ``cat_sim`` under only one of the two key orders;
    * the two-table case now reports the individual tables even when the
      pair is not similar enough to merge (previously nothing was written);
    * the two-table result is saved as ``1.csv`` instead of ``1t.csv``,
      which clashed with the files written by the transformation run;
    * combinations of three or more tables now accept a pair stored under
      either key order, as the two-table case already tried to.
    """
    results_path = "output_folder_transformation/results.txt"
    op_str1 = 'output_folder_transformation/'
    op_str2 = '.csv'

    def similar_enough(t1, t2, inclusive):
        """Check a pair against the 0.50 category and column similarity bar.

        Both "t1 : t2" and "t2 : t1" are tried. ``inclusive`` selects
        ``>=`` instead of ``>``.
        """
        for key in (t1 + ' : ' + t2, t2 + ' : ' + t1):
            if key not in cat_sim or key not in col_sim:
                continue
            if inclusive:
                passed = cat_sim[key] >= 0.50 and col_sim[key] >= 0.50
            else:
                passed = cat_sim[key] > 0.50 and col_sim[key] > 0.50
            if passed:
                return True
        return False

    with open(results_path, 'a') as f:
        print('------------------------------', file=f)
        print(file=f)
        print("Direct Match", file=f)
        print(file=f)
    matching_tables_dict = check_possible_matches()
    matching_tables = list(matching_tables_dict)
    x = len(matching_tables)
    print(matching_tables)
    if x == 0:
        with open(results_path, 'a') as f:
            print("NO MATCHES FOUND", file=f)
            print(file=f)
    elif x == 1:
        display_individual_matches(matching_tables, matching_tables_dict)
    elif x == 2:
        # NOTE(review): this branch has always used a strict ">" while the
        # multi-table branch uses ">="; kept as is - confirm which is intended.
        if similar_enough(matching_tables[0], matching_tables[1], inclusive=False):
            res = merge(matching_tables[0], matching_tables[1])
            res.to_csv(op_str1 + '1' + op_str2, sep=',', index=False)
        display_individual_matches(matching_tables, matching_tables_dict)
    else:
        ctr = 0
        all_combos = generate_all_combinations(matching_tables)
        for size in range(x, 1, -1):
            for combo in all_combos[size]:
                group = list(combo)
                # A combination is merged only if every pair inside it qualifies.
                if not all(similar_enough(t1, t2, inclusive=True)
                           for t1, t2 in combinations(group, 2)):
                    continue
                res = merge_list(group)
                ctr += 1
                res.to_csv(op_str1 + str(ctr) + op_str2, sep=',', index=False)
                with open(results_path, 'a') as f:
                    print(str(ctr) + op_str2 + ' : ', end='', file=f)
                    for j in group:
                        print(j + '\t\t', end='', file=f)
                    print(file=f)
                    print('Missing Values(NANs): ', nan_score_table(res), file=f)
                    print("Columns that match with input_schema: ", file=f)
                    for j in group:
                        print(j + ' : ', matching_tables_dict[j], file=f)
                    print(file=f)
        display_individual_matches(matching_tables, matching_tables_dict)
    with open(results_path, 'a') as f:
        print('------------------------------', file=f)

In [12]:
def check_possible_matches_transform():
    """Strip transformation words from the input columns and find matching tables.

    Reads ``input_tranformations.txt`` (JSON with a ``"schema"`` entry and
    an optional ``"categories"`` entry). A multi-word column name that
    contains a transformation word - one listed in ``transform_funct_list``
    or in any entry of ``tf_onto``, compared in lower case - is reduced to
    its remaining words, and the transformation is remembered for that
    reduced name. Only the first transformation word in a name is used.
    The reduced schema is then matched exactly like a direct request.

    Returns ``(matching_tables, transform)`` where ``matching_tables`` maps
    filename to matched columns and ``transform`` maps the reduced column
    name to the lower-cased transformation word found in the input.

    Fixes over the previous version:

    * a word found in ``transform_funct_list`` was recorded under the
      builtin ``str`` instead of the column name;
    * the ``"categories"`` test looked at the global ``json_object`` (the
      similarity data) rather than at the input file;
    * words were removed from the list while it was being iterated, and a
      word present in several ``tf_onto`` entries was removed twice, which
      raised ``ValueError``;
    * multi-word columns without any transformation word were dropped
      from the schema; they are now kept unchanged.
    """
    with open("input_tranformations.txt",'r') as ip_file:
        input_spec = json.load(ip_file)
    ip_schema = input_spec["schema"]

    def find_transform(word):
        """Return ``word`` in lower case if it names a transformation, else None."""
        lowered = word.lower()
        if lowered in transform_funct_list:
            return lowered
        if any(lowered in synonyms for synonyms in tf_onto.values()):
            return lowered
        return None

    cols = {}
    transform = {}
    for name, d_type in ip_schema.items():
        words = name.split()
        hit_index = None
        hit_name = None
        # Single-word names are never treated as a transformation request.
        if len(words) > 1:
            for idx, word in enumerate(words):
                found = find_transform(word)
                if found is not None:
                    hit_index, hit_name = idx, found
                    break
        if hit_index is None:
            cols[name] = d_type
            continue
        base = " ".join(words[:hit_index] + words[hit_index + 1:])
        cols[base] = d_type
        transform[base] = hit_name

    input_schema = cols
    input_sch_onto = generate_schema_ontology(input_schema)
    if "categories" in input_spec:
        print('category and schema')
        input_categories = input_spec["categories"]
        input_cat_onto = generate_list_ontology(input_categories)
        possible_tables = cat_and_col_list(input_categories, input_cat_onto, input_schema, input_sch_onto)
    else:
        print('only schema')
        possible_tables = col_only_list(input_schema, input_sch_onto)
    matching_tables = {}
    for fname, matched in possible_tables.items():
        if cos_sim(matched, list(input_schema)) > 0.75:
            matching_tables[fname] = matched
    return (matching_tables, transform)

In [66]:
def get_transform_matches():
    """Write every match for the input schema once transformation words are stripped.

    The matching tables and the detected transformations come from
    ``check_possible_matches_transform``. When no transformation is
    present in the input, a note is written and the function returns.
    Otherwise, depending on how many tables match:

    * none - "NO MATCHES FOUND" is written to the results file;
    * one - the table is reported on its own;
    * two - the tables are merged with ``merge`` when their similarity
      scores allow it, and both are reported on their own either way;
    * more - every combination (largest first) whose pairs are all similar
      enough is merged with ``merge_list``, saved as ``<n>t.csv`` and
      described together with the transformations that apply to it, then
      each table is reported on its own.

    All text output is appended to
    ``output_folder_transformation/results.txt``; merged tables are saved
    in the same folder.

    Fixes over the previous version:

    * the two-table case no longer raises ``KeyError`` when the pair is
      stored in ``cat_sim`` under only one of the two key orders;
    * the two-table result is saved as ``1t.csv`` (it used to be
      ``1tt.csv`` because the ``t`` suffix was applied twice);
    * the one- and two-table cases now pass the detected transformations
      on, so the individual reports list them like the multi-table case;
    * the two-table case reports the individual tables even when the pair
      is not similar enough to merge;
    * combinations of three or more tables accept a pair stored under
      either key order.
    """
    results_path = "output_folder_transformation/results.txt"
    op_str1 = 'output_folder_transformation/'
    op_str2 = 't.csv'

    def similar_enough(t1, t2, inclusive):
        """Check a pair against the 0.50 category and column similarity bar.

        Both "t1 : t2" and "t2 : t1" are tried. ``inclusive`` selects
        ``>=`` instead of ``>``.
        """
        for key in (t1 + ' : ' + t2, t2 + ' : ' + t1):
            if key not in cat_sim or key not in col_sim:
                continue
            if inclusive:
                passed = cat_sim[key] >= 0.50 and col_sim[key] >= 0.50
            else:
                passed = cat_sim[key] > 0.50 and col_sim[key] > 0.50
            if passed:
                return True
        return False

    (matching_tables_dict, transform_cols) = check_possible_matches_transform()
    matching_tables = list(matching_tables_dict)
    with open(results_path, 'a') as f:
        print('******************************', file=f)
        print(file=f)
        print("Transformation Match", file=f)
        print(file=f)
        if len(transform_cols) == 0:
            print("No Transformations In Input Schema", file=f)
            print(file=f)
            return
        print('Transformations detected are : ', transform_cols, file=f)
        print(file=f)
    x = len(matching_tables)
    print(matching_tables)
    if x == 0:
        with open(results_path, 'a') as f:
            print("NO MATCHES FOUND", file=f)
            print(file=f)
    elif x == 1:
        display_individual_matches(matching_tables, matching_tables_dict, transform_cols=transform_cols)
    elif x == 2:
        # NOTE(review): this branch has always used a strict ">" while the
        # multi-table branch uses ">="; kept as is - confirm which is intended.
        if similar_enough(matching_tables[0], matching_tables[1], inclusive=False):
            res = merge(matching_tables[0], matching_tables[1])
            res.to_csv(op_str1 + '1' + op_str2, sep=',', index=False)
        display_individual_matches(matching_tables, matching_tables_dict, transform_cols=transform_cols)
    else:
        ctr = 0
        all_combos = generate_all_combinations(matching_tables)
        for size in range(x, 1, -1):
            for combo in all_combos[size]:
                group = list(combo)
                # A combination is merged only if every pair inside it qualifies.
                if not all(similar_enough(t1, t2, inclusive=True)
                           for t1, t2 in combinations(group, 2)):
                    continue
                res = merge_list(group)
                ctr += 1
                res.to_csv(op_str1 + str(ctr) + op_str2, sep=',', index=False)
                # Transformations whose column exists in the merged table.
                applicable = {
                    col: tran
                    for col, tran in transform_cols.items()
                    if col in res.columns
                }
                with open(results_path, 'a') as f:
                    print(str(ctr) + op_str2 + ' : ', end='', file=f)
                    for j in group:
                        print(j + '\t\t', end='', file=f)
                    print(file=f)
                    print("possible transformations are : ", applicable, file=f)
                    print('Missing Values(NANs): ', nan_score_table(res), file=f)
                    print("Columns that match with input_schema: ", file=f)
                    for j in group:
                        print(j + ' : ', matching_tables_dict[j], file=f)
                    print(file=f)
        display_individual_matches(matching_tables, matching_tables_dict, transform_cols=transform_cols)
    with open(results_path, 'a') as f:
        print('******************************', file=f)

In [69]:
# Run the direct-match search; the report is appended to output_folder_transformation/results.txt.
get_matches()

only schema
['202_117.csv', '203_148.csv', '203_268.csv', '204_100.csv']


In [70]:
# Run the transformation-aware search; the report is appended to output_folder_transformation/results.txt.
get_transform_matches()

only schema
['202_117.csv', '203_148.csv', '203_268.csv', '204_100.csv']
