In [3]:
# import necessary libraries and packages
import json
import math
import pandas as pd
import numpy as np
import urllib
from itertools import combinations
from nltk.corpus import wordnet
from IPython.display import display

In [4]:
# Preload the schema, categories and display name of every known table.
# Each line of final_schema.txt is parsed as one JSON object carrying the
# keys "filename", "schema", "categories" and "tablename".
all_schemas={}
all_categories={}
all_tablenames={}
with open("final_schema.txt") as ip_file:
    for line in ip_file:
        json_obj=json.loads(line)
        all_schemas[json_obj["filename"]]=json_obj["schema"]
        all_categories[json_obj["filename"]]=json_obj["categories"]
        all_tablenames[json_obj["filename"]]=json_obj["tablename"]
# Preload the column and category similarity values of the tables.
with open("cos_similarity.txt") as ip_file:
    json_object=json.load(ip_file)
col_sim = json_object["column_similarity"]
cat_sim = json_object["category_similarity"]

In [5]:
# all functions needed to generate ontologies

# generates all synonyms and hypernyms of a term using wordnet
def get_synonyms(word):
    """Return the WordNet noun synonyms and hypernyms of `word`.

    The result contains every lemma name of every noun synset of `word`,
    the first lemma name of each direct hypernym of those synsets, and
    `word` itself. It is returned as a list built from a set, so it has no
    duplicates and no meaningful order.
    """
    noun_synsets=wordnet.synsets(word,pos=wordnet.NOUN)
    meanings=set()
    # synonyms first, hypernyms second
    for synset in noun_synsets:
        meanings.update(lemma.name() for lemma in synset.lemmas())
    for synset in noun_synsets:
        meanings.update(hypernym.lemma_names()[0] for hypernym in synset.hypernyms())
    meanings.add(word)
    return list(meanings)

# takes input list and returns ontology as dictionary with every word in list as the key
def generate_ontology(list1):
    """Map every word of `list1` to the list returned by `get_synonyms`.

    `list1` may be any iterable of words (a dict is iterated by key).
    """
    return {word:get_synonyms(word) for word in list1}

In [6]:
# Transformation functions that may be requested for a column, grouped by the
# column's dtype, and an ontology (synonyms/hypernyms) for each of them.
tf_onto={}
transform_funct_list={
    "int64":['average','sum','maximum','minimum','range','median','variance','standard deviation','mode','frequency','avg'],
    "float64":['average','sum','maximum','minimum','range','median','variance','standard deviation','mode','frequency','avg'],
    "object":["tolower","toupper","substring","funct1","funct2","funct3"],
}
for dtype,funct_list in transform_funct_list.items():
    tf_onto[dtype]=generate_ontology(funct_list)
# Knowledge graph data dictionary, keyed by column name.
kg_data={}
# Comparison signs and statistical functions accepted in content-based conditions.
signs = ["<=",">=","<",">","=","!="]
functs = ['count','mean','standard deviation','min','max','25%','50%','75%']
functs_onto = generate_ontology(functs)
# Extra aliases added on top of the generated ontology.
for stat_name,alias in (('25%',"first quartile"),('75%',"third quartile"),('50%',"second quartile"),('min',"minimum"),('max',"maximum")):
    functs_onto[stat_name].append(alias)

In [7]:
# to generate cos similarity between two lists
def cos_sim(list1, list2):
    """Cosine similarity of two term collections treated as sets.

    Each collection is viewed as a binary term vector, so the similarity is
    |A & B| / (sqrt(|A|) * sqrt(|B|)) over the *distinct* terms. The previous
    version counted shared terms once but measured magnitudes with the raw
    list lengths, so repeated terms wrongly lowered the score.

    Returns 0.0 when either collection is empty.
    """
    terms1 = set(list1)
    terms2 = set(list2)
    if not terms1 or not terms2:
        return 0.0
    shared = len(terms1 & terms2)
    return shared / (math.sqrt(len(terms1)) * math.sqrt(len(terms2)))

In [45]:
# returns a dictionary of all tables having two or more columns matching directly with the input schema
def direct_matches(input_schema):
    """Find tables sharing at least two identically named, identically typed columns.

    Args:
        input_schema: dict mapping column name -> dtype string.

    Returns:
        dict mapping file name -> {column: column} for every table in
        `all_schemas` with more than one such column.
    """
    possible_tables={}
    for file,schema in all_schemas.items():
        cols={}
        for name,d_type in schema.items():
            if name in input_schema and d_type==input_schema[name]:
                cols[name]=name
        if len(cols)>1:
            possible_tables[file]=cols
    return possible_tables

# returns a dictionary of all tables having two or more columns matching directly, through an ontology match
# or through a substring match with the input schema
def extended_matches(input_schema):
    """Match every known table against `input_schema` in four passes.

    For each table in `all_schemas` the passes are, in order:
      1. same column name and dtype;
      2. table column found in the ontology of an input column (same dtype);
      3. input column found in the ontology of a table column (same dtype);
      4. substring match between names longer than 3 characters (same dtype),
         accepted only when exactly one table column qualifies.
    Columns matched in an earlier pass are not considered again.

    Args:
        input_schema: dict mapping column name -> dtype string. Not modified.

    Returns:
        dict mapping file name -> {table column: input column} for every
        table with more than one matched column.
    """
    possible_tables={}
    # Synonyms depend only on the word, so look the input columns up once
    # instead of once per candidate table.
    full_input_onto=generate_ontology(input_schema)
    for file,table_schema in all_schemas.items():
        schema=table_schema
        remaining=input_schema
        # pass 1: identical name and dtype
        cols1={col1:col1 for col1,d_type1 in schema.items() if col1 in remaining and d_type1==remaining[col1]}
        schema={k:v for k,v in schema.items() if k not in cols1}
        remaining={k:v for k,v in remaining.items() if k not in cols1}
        # pass 2: table column is a synonym/hypernym of an input column
        input_sch_onto={k:full_input_onto[k] for k in remaining}
        cols2={col1:col for col1,d_type1 in schema.items() for col in input_sch_onto if col1 in input_sch_onto[col] and d_type1==remaining[col]}
        schema={k:v for k,v in schema.items() if k not in cols2}
        matched=set(cols2.values())
        remaining={k:v for k,v in remaining.items() if k not in matched}
        # pass 3: input column is a synonym/hypernym of a table column
        schema_onto=generate_ontology(schema)
        cols3={col:col1 for col1,d_type1 in remaining.items() for col in schema_onto if col1 in schema_onto[col] and d_type1==schema[col]}
        schema={k:v for k,v in schema.items() if k not in cols3}
        matched=set(cols3.values())
        remaining={k:v for k,v in remaining.items() if k not in matched}
        cols={**cols1,**cols2,**cols3}
        # pass 4: unambiguous substring match
        for col1,d_type1 in remaining.items():
            x=[col2 for col2,d_type2 in schema.items() if ((len(col1)>3 and len(col2)>3) and ((col1 in col2) or (col2 in col1))) and (d_type1==d_type2)]
            if len(x)==1:
                cols[x[0]]=col1
        if(len(cols)>1):
            possible_tables[file]=cols
    return possible_tables

# categories provided as input help narrow down the matches to contextually relevant tables
# categories and columns are matched based on direct match as well as ontology match
def categories_included(input_categories,input_schema):
    """Match tables on categories first, then on columns.

    A table is considered only when the cosine similarity between its matched
    categories (direct or ontology match) and `input_categories` exceeds 0.75.
    Its columns are then matched against `input_schema` with the same four
    passes as `extended_matches` (direct, ontology in both directions,
    unambiguous substring).

    Args:
        input_categories: list of category names.
        input_schema: dict mapping column name -> dtype string. Not modified.

    Returns:
        dict mapping file name -> {table column: input column} for every
        qualifying table with more than one matched column.
    """
    input_cat_onto=generate_ontology(input_categories)
    # Ontology of the full input schema, built once on first use: synonyms
    # depend only on the word, not on the table being compared.
    full_input_onto=None
    possible_tables={}
    for file,table_categories in all_categories.items():
        cats1=[cat1 for cat1 in table_categories if cat1 in input_categories]
        category=[k for k in table_categories if k not in cats1]
        cats2=[cat2 for cat1 in category for cat2 in input_cat_onto if cat1 in input_cat_onto[cat2]]
        cos_val=cos_sim(cats1+cats2,input_categories)
        if not cos_val > 0.75:
            continue
        if full_input_onto is None:
            full_input_onto=generate_ontology(input_schema)
        schema=all_schemas[file]
        remaining=input_schema
        # pass 1: identical name and dtype
        cols1={col1:col1 for col1,d_type1 in schema.items() if col1 in remaining and d_type1==remaining[col1]}
        schema={k:v for k,v in schema.items() if k not in cols1}
        remaining={k:v for k,v in remaining.items() if k not in cols1}
        # pass 2: table column is a synonym/hypernym of an input column
        input_sch_onto={k:full_input_onto[k] for k in remaining}
        cols2={col1:col for col1,d_type1 in schema.items() for col in input_sch_onto if col1 in input_sch_onto[col] and d_type1==remaining[col]}
        schema={k:v for k,v in schema.items() if k not in cols2}
        matched=set(cols2.values())
        remaining={k:v for k,v in remaining.items() if k not in matched}
        # pass 3: input column is a synonym/hypernym of a table column
        schema_onto=generate_ontology(schema)
        cols3={col:col1 for col1,d_type1 in remaining.items() for col in schema_onto if col1 in schema_onto[col] and d_type1==schema[col]}
        schema={k:v for k,v in schema.items() if k not in cols3}
        matched=set(cols3.values())
        remaining={k:v for k,v in remaining.items() if k not in matched}
        cols={**cols1,**cols2,**cols3}
        # pass 4: unambiguous substring match
        for col1,d_type1 in remaining.items():
            x=[col2 for col2,d_type2 in schema.items() if ((len(col1)>3 and len(col2)>3) and ((col1 in col2) or (col2 in col1))) and (d_type1==d_type2)]
            if len(x)==1:
                cols[x[0]]=col1
        if(len(cols)>1):
            possible_tables[file]=cols
    return possible_tables

# return only those tables having 75% cos similarity with input table through direct match
def direct_matches_only():
    """Report the tables whose directly matching columns resemble `input_schema`.

    Keeps the tables from `direct_matches` whose matched columns have a cosine
    similarity above 0.75 with the input columns, prints them, and hands them
    to `display_individual_matches`.

    Returns:
        dict mapping file name -> {table column: input column} for the
        retained tables (same shape as the other match functions return).
    """
    # The unused WordNet ontology that used to be built here was removed.
    possible_tables=direct_matches(input_schema)
    wanted=list(input_schema)
    matching_tables={i:cols for i,cols in possible_tables.items() if cos_sim(list(cols.values()),wanted)>0.75}
    print("Matching Tables ")
    for i in matching_tables:
        print(i," : ",all_tablenames[i])
    print("\n")
    display_individual_matches(list(matching_tables),matching_tables)
    return matching_tables

# return only those tables having 75% cos similarity with input table through direct, ontology and substring match
def check_possible_extended_matches():
    """Report the tables whose extended column matches resemble `input_schema`.

    Keeps the tables from `extended_matches` whose matched input columns have
    a cosine similarity above 0.75 with the full list of input columns,
    prints them, and hands them to `display_individual_matches`.

    Returns:
        dict mapping file name -> {table column: input column}.
    """
    # The unused WordNet ontology that used to be built here was removed.
    possible_tables=extended_matches(input_schema)
    wanted=list(input_schema)
    matching_tables={i:cols for i,cols in possible_tables.items() if cos_sim(list(cols.values()),wanted)>0.75}
    print("Matching Tables")
    for i in matching_tables:
        print(i," : ",all_tablenames[i])
    print("\n")
    display_individual_matches(list(matching_tables),matching_tables)
    return matching_tables

# return tables having 75% column as well as category cos similarity
def check_possible_category_included_matches():
    """Report the tables that match on both categories and columns.

    Keeps the tables from `categories_included` whose matched input columns
    have a cosine similarity above 0.75 with the full list of input columns,
    prints them, and hands them to `display_individual_matches`.

    Returns:
        dict mapping file name -> {table column: input column}.
    """
    # The unused WordNet ontology that used to be built here was removed.
    possible_tables=categories_included(input_categories,input_schema)
    wanted=list(input_schema)
    matching_tables={i:cols for i,cols in possible_tables.items() if cos_sim(list(cols.values()),wanted)>0.75}
    print("Matching Tables")
    for i in matching_tables:
        print(i," : ",all_tablenames[i])
    print("\n")
    display_individual_matches(list(matching_tables),matching_tables)
    return matching_tables

In [9]:
# function generates all possible combinations of list l taking elements n to 2 at a time and returns a dictionary
def generate_all_combinations(l):
    """Return {size: [combinations of `l` of that size]} for size len(l) down to 2.

    Lists with fewer than two elements produce an empty dictionary.
    """
    return {size:list(combinations(l,size)) for size in range(len(l),1,-1)}

In [10]:
# for input table and column, get a list of all categories that the terms(elements) in that column belong to
def extract_info_from_knowledge_graphs(a,col1):
    """Count the Knowledge Graph types/descriptions of the values in a[col1].

    Every value that is not a Python float (NaN cells are floats) is sent as
    a query to the Google Knowledge Graph Search API. For each returned
    entity, each lower-cased "@type" entry and the lower-cased "description"
    (when present) is counted.

    Args:
        a: table (e.g. DataFrame) indexable by column name.
        col1: name of the column whose values are looked up.

    Returns:
        (s, ctr): `s` maps type/description -> number of occurrences, `ctr`
        is the number of rows skipped (float/NaN values and rows whose
        request still failed after three attempts).
    """
    # `import urllib` alone does not guarantee the submodules are loaded.
    import urllib.error
    import urllib.parse
    import urllib.request

    # NOTE(review): machine-specific absolute path; consider making it configurable.
    with open("C:\\Users\\adith\\Desktop\\my_google_knowledge_graph_api_key.txt","r") as f:
        # strip the trailing newline so it is not sent as part of the key
        api_key=f.readline().strip()
    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    params = {'limit': 50,'indent': True,'key': api_key}
    max_attempts=3
    s={}
    ctr=0
    for row in a[col1]:
        if type(row)==type(np.nan):
            ctr+=1
            continue
        params['query']=row
        url = service_url + '?' + urllib.parse.urlencode(params)
        response=None
        for _ in range(max_attempts):
            try:
                with urllib.request.urlopen(url) as reply:
                    response = json.loads(reply.read())
                break
            except urllib.error.HTTPError:
                continue
        if response is None:
            # Every attempt failed: count the row as skipped and move on
            # rather than re-counting the previous row's response.
            ctr+=1
            continue
        for element in response.get('itemListElement',[]):
            result=element["result"]
            for i in result.get("@type",[]):
                b=i.lower()
                s[b]=s.get(b,0)+1
            if "description" in result:
                b=result["description"].lower()
                s[b]=s.get(b,0)+1
    return s,ctr

In [11]:
# using data from google knowledge graphs, try to obtain alternate names for columns that do not match to merge them if they
# represent the same entity
# returned values are in the form (final_name,old_name)
def get_alternate_col_name(a,b,col1,col2):
    """Decide whether columns `col1` (table `a`) and `col2` (table `b`) name the same entity.

    Knowledge-graph counts are taken from `kg_data`, fetching them first when
    a column is missing. One column name is chosen as the final name when it
    appears as a type/description in both columns' counts often enough: at
    least len(table) - 1 - skipped rows times in each table. On success the
    other column's counters are folded into the final one and its `kg_data`
    entry is deleted.

    Returns:
        (final_name, old_name) on success, otherwise (0, 0). The earlier
        version returned None when a name was present in both groups but
        below the threshold, which broke tuple unpacking in the caller.
    """
    global kg_data
    if col1 not in kg_data:
        groups,count=extract_info_from_knowledge_graphs(a,col1)
        kg_data[col1]={"groups":groups,"count":count}
    if col2 not in kg_data:
        groups,count=extract_info_from_knowledge_graphs(b,col2)
        kg_data[col2]={"groups":groups,"count":count}
    s1=kg_data[col1]["groups"]
    ctr1=kg_data[col1]["count"]
    s2=kg_data[col2]["groups"]
    ctr2=kg_data[col2]["count"]
    # try col1 as the surviving name first, then col2
    for keep,drop in ((col1,col2),(col2,col1)):
        if (keep in s1) and (keep in s2):
            if (s1[keep]>=len(a)-1-ctr1) and (s2[keep]>=len(b)-1-ctr2):
                kg_data[keep]["groups"][keep]+=kg_data[drop]["groups"][keep]
                kg_data[keep]["count"]+=kg_data[drop]["count"]
                del kg_data[drop]
                return (keep,drop)
    return 0,0

In [12]:
# using google knowledge graph update the column names and update tables (if possible) to improve merge accuracy and improve completeness
def graph_match(c,t1,t2):
    """Rename one pair of equivalent columns across `t1` and `t2`, if any exists.

    Pairs of names from `c` are tried in order; the first pair (one column in
    each table) for which `get_alternate_col_name` reports a match is unified.
    The name kept is the old name when it is part of `input_schema`, otherwise
    the final name; the table that does not already hold the kept name is
    renamed in place. Both names are then removed from `c` (in place).

    Returns:
        (t1, t2, c) after a rename, or (t1, t2, 0) when no pair matched.
    """
    for first,second in combinations(c,2):
        if (first in t1) and (second in t2):
            a,b=get_alternate_col_name(t1,t2,first,second)
        elif (second in t1) and (first in t2):
            a,b=get_alternate_col_name(t1,t2,second,first)
        else:
            continue
        if a==0 or b==0:
            continue
        # a is the final name, b the old name
        if b in input_schema:
            kept,replaced=b,a
        else:
            kept,replaced=a,b
        target=t2 if kept in t1 else t1
        target.rename(columns={replaced:kept},inplace=True)
        c.remove(a)
        c.remove(b)
        return t1,t2,c
    return t1,t2,0

In [13]:
def gen_kg_data(c,a,res):
    """Make sure `kg_data` holds knowledge-graph counts for every column in `c`.

    A column is looked up in table `a` when `a` contains it, otherwise in
    `res`. Columns already present in `kg_data` are left untouched.
    """
    global kg_data
    for col in c:
        source=a if col in a else res
        if col in kg_data:
            continue
        groups,ctr=extract_info_from_knowledge_graphs(source,col)
        kg_data[col]={"groups":groups,"count":ctr}

In [14]:
# function generates all matching columns between the res_cols schema and columns of table in fname
def generate_matching_columns(res_cols,fname):
    """Match the columns of table `fname` to the columns described by `res_cols`.

    Matching passes, in order (dtype must agree in each):
      1. identical column name;
      2. table column found in the ontology of a `res_cols` column;
      3. substring match between names longer than 3 characters, accepted
         only when exactly one `res_cols` column qualifies.

    Args:
        res_cols: dict mapping column name -> dtype string.
        fname: key into `all_schemas`.

    Returns:
        (matching_columns, unmatched_object_columns): the first maps table
        column -> `res_cols` column; the second lists the 'object' columns of
        either schema whose name is not a key of `matching_columns`.
    """
    table_schema=all_schemas[fname]
    all_columns={**res_cols,**table_schema}
    res_onto=generate_ontology(res_cols)
    # pass 1: identical name and dtype
    cols1={col1:col1 for col1,d_type1 in table_schema.items() if (col1 in res_cols) and d_type1==res_cols[col1]}
    res_onto={k:v for k,v in res_onto.items() if k not in cols1}
    remaining={k:v for k,v in table_schema.items() if k not in cols1}
    # pass 2: ontology match. The value must be the loop variable `col`; the
    # previous code used `cols`, which is unassigned here and raised NameError.
    cols2={col1:col for col1,d_type1 in remaining.items() for col in res_onto if col1 in res_onto[col] and d_type1==res_cols[col]}
    remaining={k:v for k,v in remaining.items() if k not in cols2}
    cols={**cols1,**cols2}
    already_used=list(cols.values())
    unmatched_res={k:v for k,v in res_cols.items() if k not in already_used}
    # pass 3: unambiguous substring match
    for col1,d_type1 in remaining.items():
        x=[col2 for col2,d_type2 in unmatched_res.items() if ((len(col1)>3 and len(col2)>3) and ((col1 in col2) or (col2 in col1))) and (d_type1==d_type2)]
        if len(x)==1:
            cols[col1]=x[0]
    unmatched_objects=[k for k,v in all_columns.items() if k not in cols and v=='object']
    return (cols,unmatched_objects)

In [69]:
def _align_tables_for_merge(left,left_schema,fname):
    """Load table `fname` and rename columns so it can be merged with `left`.

    Knowledge-graph renames are applied (possibly to both tables) until no
    further pair matches, then the schema/ontology/substring matches from
    `generate_matching_columns` are applied to the loaded table.
    """
    matching_columns,c=generate_matching_columns(left_schema,fname)
    right=pd.read_csv(fname)
    gen_kg_data(c,left,right)
    if len(c)>=2:
        left,right,c=graph_match(c,left,right)
        # graph_match returns 0 in place of the list once nothing matches
        while c!=0:
            left,right,c=graph_match(c,left,right)
    right.rename(columns = matching_columns,inplace=True)
    return left,right

# returns a merged table of all tables given in input list l
def merge_list_knowledge_graph(l):
    """Outer-merge the CSV files listed in `l` and rename columns to `input_schema`.

    The first file is merged with each following file in turn. For the first
    merge the left schema comes from `all_schemas`; afterwards it is derived
    from the dtypes of the intermediate result. Finally the result columns
    are renamed to input-schema names by direct, ontology and unambiguous
    substring matching (dtype must agree in each).

    Args:
        l: list of CSV file names (keys of `all_schemas`), at least two.

    Returns:
        The merged DataFrame, or -1 when a merge step raises.
    """
    res=pd.read_csv(l[0])
    res_cols=all_schemas[l[0]]
    for position,fname in enumerate(l[1:]):
        if position>0:
            res_cols={i:j for i,j in zip(res.columns,[str(i) for i in res.dtypes])}
        res,t=_align_tables_for_merge(res,res_cols,fname)
        try:
            res=res.merge(t,how='outer')
        except Exception:
            # e.g. no common columns or incompatible key dtypes
            return -1
    res_cols={i:j for i,j in zip(res.columns,[str(i) for i in res.dtypes])}
    a=input_schema
    cols1={col1:col1 for col1,d_type1 in res_cols.items() if (col1 in a) and d_type1==input_schema[col1]}
    a={k:v for k,v in a.items() if k not in cols1}
    a_onto=generate_ontology(a)
    cols2={col1:col for col1,d_type1 in res_cols.items() for col in a_onto if col1 in a_onto[col] and (d_type1==a[col])}
    # cols2 maps result column -> input column, so the matched *input* columns
    # are its values (the earlier code filtered by its keys).
    matched_inputs=set(cols2.values())
    a={k:v for k,v in a.items() if k not in matched_inputs}
    cols={**cols1,**cols2}
    # Leave out result columns that are already mapped (keys of cols) as well
    # as those whose name equals an input name that has been assigned.
    assigned_names=set(cols.values())
    res_cols={k:v for k,v in res_cols.items() if k not in cols and k not in assigned_names}
    for col1,d_type1 in a.items():
        x=[col2 for col2,d_type2 in res_cols.items() if ((len(col1)>3 and len(col2)>3) and ((col1 in col2) or (col2 in col1))) and (d_type1==d_type2)]
        if len(x)==1:
            cols[x[0]]=col1
    res.rename(columns = cols,inplace=True)
    return res

In [16]:
# metrics calculated :
# 1)nan_score(number of nulls in each column)
# 2)coverage_score(no of matching columns with input schema/total number of columns in input schema)
# 3)completeness_score(a combination of coverage and nan scores to determine how complete the result dataset is)

# nan score = {x : "number of nans in column/number of entries in column"} where x is each column in the table
def nan_score(table=-1,fname=-1):
    """Return the missing-value ratio of every column as a "missing/total" string.

    Args:
        table: DataFrame to inspect; ignored when `fname` is given.
        fname: CSV file to load and inspect (-1 means "not given").

    Returns:
        dict mapping column name -> "<number of nulls>/<number of rows>".
    """
    if fname!=-1:
        table=pd.read_csv(fname)
    total=len(table)
    return {col:str(total-table[col].count())+'/'+str(total) for col in table.columns}

# returns the coverage score and completeness score of a given table
# coverage     = (number of table columns present in input schema) / (number of columns in input schema)
# completeness = sum over those columns of (non null entries / total entries) / (number of columns in input schema)
def coverage_and_completeness(table):
    """Score `table` against the global `input_schema`.

    Returns:
        (coverage, completeness) as described in the comment above the
        function. Only columns whose name is a key of `input_schema` count.
    """
    total_rows=len(table)
    # fraction of non-null entries for each column that is part of the input schema
    filled=[(total_rows-sum(pd.isnull(table[col])))/total_rows for col in table.columns if col in input_schema]
    comp=sum(filled,0.0)/len(input_schema)
    cov=len(filled)/len(input_schema)
    return (cov,comp)

In [17]:
def ranking_display(comp_score,no_of_rows):
    """Print the tables ranked by completeness score, ties broken by row count.

    Args:
        comp_score: dict mapping completeness score -> list of table names.
        no_of_rows: dict mapping table name -> number of rows.

    Scores are listed from highest to lowest. When several names share a
    score they are listed by decreasing row count, and only names present in
    `no_of_rows` are shown.
    """
    print()
    by_rows=sorted(no_of_rows, key=lambda k: no_of_rows[k],reverse=True)
    count=0
    for score in sorted(comp_score,reverse=True):
        tied=comp_score[score]
        if len(tied)==1:
            ordered=[tied[0]]
        else:
            ordered=[name for name in by_rows if name in tied]
        for name in ordered:
            count+=1
            print("Rank ",str(count).ljust(2," ")," : ",name.ljust(20,' '),"completeness score : %0.16f"%(score),"\t\tnumber of rows: ",no_of_rows[name])
    print("\nRanking Complete!!\n")

In [18]:
# takes the output table's columns as input, compares them with the input schema and the requested transformations and
# returns, per column, the requested transformations that are applicable to the column's dtype
def get_possible_transformations(cols):
    """Filter the requested transformations down to those valid for each column.

    Args:
        cols: collection of column names present in the output table.

    Returns:
        dict mapping column -> list of requested transformations that appear
        in `transform_funct_list` for the column's dtype in `input_schema`.
        Only columns that are both requested and present in `cols` appear.
    """
    applicable={}
    for col,requested in transformations.items():
        if col not in cols:
            continue
        applicable[col]=[i for i in requested if i in transform_funct_list[input_schema[col]]]
    return applicable

In [19]:
# function returns a boolean result after checking if condition is satisfied
def condition_check(sign,v,value):
    """Evaluate `v <sign> value`.

    Args:
        sign: one of ">=", "<=", "<", ">", "=", "!=".
        v: left operand (the measured statistic).
        value: right operand (the threshold).

    Returns:
        The result of the comparison.

    Raises:
        KeyError: if `sign` is not one of the supported operators.
    """
    # Comparisons are wrapped in lambdas so that only the requested one is
    # evaluated; evaluating all six eagerly made "=" and "!=" fail with
    # TypeError for operands that cannot be ordered.
    comparisons={
        ">=":lambda:v>=value,
        "<=":lambda:v<=value,
        "<":lambda:v<value,
        ">":lambda:v>value,
        "=":lambda:v==value,
        "!=":lambda:v!=value,
    }
    return comparisons[sign]()

In [20]:
# function returns all statistical conditions that are satisfied as well as unsatisfied
def check_stats(res):
    """Evaluate the content-based conditions in `stats` against table `res`.

    Each condition is a string "<function><sign><value>", e.g. "mean>=2.5".
    The function name is matched against `functs_onto` (keys first, then
    aliases) and looked up in `res[col].describe()`. Only columns typed
    int64/float64 in `input_schema` are evaluated.

    Returns:
        (satisfied, unsatisfied): dicts mapping column -> list of report
        strings. Columns missing from `res` are reported in `unsatisfied`
        under "<col>(Column Not Present)" with their raw conditions.
    """
    cols=res.columns
    satisfied={}
    unsatisfied={}
    # Try two-character operators first so that "!=" is not read as "=".
    ordered_signs=sorted(signs,key=len,reverse=True)
    for col in stats:
        if col not in cols:
            unsatisfied[col+"(Column Not Present)"]=stats[col]
            continue
        if input_schema[col]!="int64" and input_schema[col]!="float64":
            continue
        summary=res[col].describe()
        for condition in stats[col]:
            sign=[i for i in ordered_signs if i in condition][0]
            funct,_,value=condition.partition(sign)
            funct=funct.strip().lower()
            if funct not in functs_onto:
                # resolve an alias such as "minimum" to its canonical name
                for f in functs_onto:
                    if funct in functs_onto[f]:
                        funct=f
                        break
            # describe() labels the standard deviation "std"
            stat_key='std' if funct=='standard deviation' else funct
            v=summary[stat_key]
            # float() accepts both integer and decimal thresholds
            if condition_check(sign,v,float(value)):
                s=condition+"( "+str(v)+sign+value+" )"
                satisfied.setdefault(col,[]).append(s)
            else:
                s=condition+"( "+funct+" = "+str(v)+" )"
                unsatisfied.setdefault(col,[]).append(s)
    return (satisfied,unsatisfied)

In [21]:
# this is a function to print the individual table names, their nan score, columns that match with input schema,
# coverage score, and completeness score along with possible transformations if any
def display_individual_matches(matching_tables,matching_tables_dict):
    """Report every matching table on the console and in results.txt.

    For each file in `matching_tables` the CSV is loaded, its columns are
    renamed with `matching_tables_dict[file]`, and the report lists possible
    transformations (when requested), satisfied/unsatisfied conditions (when
    given), per-column missing values of the original columns, the column
    mapping, and the coverage/completeness scores. The global `no_of_rows`
    and `comp_score` are updated for the later ranking.

    Args:
        matching_tables: list of CSV file names.
        matching_tables_dict: dict mapping file name -> {table column: input column}.
    """
    global comp_score
    global no_of_rows
    # Opened with `with` so that the report is flushed and the handle released
    # before any other code appends to the same file.
    with open("output_folder_transformation/results.txt",'a') as f:
        def report(*args):
            # every line goes to the results file and to the console
            print(*args,file=f)
            print(*args)
        for i in matching_tables:
            report(i+"("+all_tablenames[i]+")")
            res=pd.read_csv(i)
            # computed before renaming so it reports the file's own column names
            missing=nan_score(table=res)
            res.rename(columns=matching_tables_dict[i],inplace=True)
            cov,comp=coverage_and_completeness(res)
            if transformations!=-1:
                report("possible transformations are : ",get_possible_transformations(res.columns))
            if stats!=-1:
                sat,unsat=check_stats(res)
                if len(sat)>0:
                    report("Conditions Satisfied : ",sat)
                if len(unsat)>0:
                    report("Conditions NOT SATISFIED : ",unsat)
            report('Missing Values(NANs score): ',missing)
            report("Columns that match with input_schema:\n "+i+' : ',matching_tables_dict[i])
            report("Coverage Score : ",cov,"\t Completeness Score : ",comp)
            report()
            no_of_rows[i]=len(res)
            comp_score.setdefault(comp,[]).append(i)

In [70]:
# the main function that is to be invoked and will call all the required functions to obtain the required matches and merges
# we have a list with all possible tables that are matches
# we generate all combinations of them in order to merge them
# a valid combination is one where every pair of tables have 50% cosine column and category similarity
# then we merge them and calculate the coverage score, null score and the completeness score
# all these details are displayed for each of the valid merges as well as individual tables
# the output is displayed in the output file "results.txt" along with the outputs in csv form
def get_matches():
    """Find, merge, report and rank the tables matching the current query.

    Uses the globals `input_schema`, `input_categories`, `transformations`
    and `stats`. Matching tables are found by schema only, or by category and
    schema when categories were given. With two matches the pair is merged
    when its category and column similarity both exceed 0.50 (either key
    order); with more matches every combination whose pairs all reach 0.50 is
    merged. Each merged table is written to output_folder_transformation/<n>.csv
    and described in results.txt, the individual tables are reported, and
    everything is ranked by completeness score.
    """
    global comp_score
    global no_of_rows
    comp_score={}
    no_of_rows={}
    results_path="output_folder_transformation/results.txt"
    if input_categories==-1:
        print('only schema')
        matching_tables_dict=check_possible_extended_matches()
    else:
        print('category and schema')
        matching_tables_dict=check_possible_category_included_matches()
    matching_tables=list(matching_tables_dict)
    with open(results_path,'a') as f:
        print('******************************',file=f)
        print(file=f)
        print("All Possible Matches",file=f)
        print(file=f)
        print("Matching Tables : ",matching_tables,file=f)
        print("Matching Tables : ",matching_tables)
        for i in matching_tables:
            print(i,'\t',all_tablenames[i])
        if transformations==-1:
            print("\nNo Transformations In Input Schema",file=f)
            print(file=f)
        else:
            print('\nTransformations detected from input are : ',transformations,file=f)
            print(file=f)
    op_str1='output_folder_transformation/'
    op_str2='.csv'
    x=len(matching_tables)
    print()
    if x==0:
        with open(results_path,'a') as f:
            print("NO MATCHES FOUND",file=f)
            print(file=f)
    elif x==1:
        display_individual_matches(matching_tables,matching_tables_dict)
        print("Only one match found!! Rank 1 : ",matching_tables[0])
    elif x==2:
        def strongly_similar(key):
            # a key missing from either similarity table counts as "not similar"
            return key in cat_sim and key in col_sim and cat_sim[key]>.50 and col_sim[key]>0.50
        a=matching_tables[0]+' : '+matching_tables[1]
        b=matching_tables[1]+' : '+matching_tables[0]
        if strongly_similar(a) or strongly_similar(b):
            res=merge_list_knowledge_graph([matching_tables[0],matching_tables[1]])
            # merge_list_knowledge_graph returns -1 when the merge fails
            if type(res) != type(-1):
                ctr=1  # the only merged output of this branch
                op_string=op_str1+str(ctr)+op_str2
                res.to_csv(op_string,sep=',', index=False)
                cols=res.columns
                cov,comp=coverage_and_completeness(res)
                with open(results_path,'a') as f:
                    print(op_string,file=f)
                    if transformations!=-1:
                        print("possible transformations are : ",get_possible_transformations(cols),file=f)
                    if stats!=-1:
                        sat,unsat=check_stats(res)
                        if len(sat)>0:
                            print("Conditions Satisfied : ",sat,file=f)
                        if len(unsat)>0:
                            print("Conditions Not Satisfied : ",unsat,file=f)
                    print('Missing Values(NANs): ',nan_score(table=res),file=f)
                    print("Columns that match with input_schema: ",file=f)
                    for j in matching_tables:
                        print(j+' : ',matching_tables_dict[j],file=f)
                    print("Coverage Score : ",cov,"\t Completeness Score : ",comp,"\t Number of Rows : ",len(res),file=f)
                    print(file=f)
                merged_name="output_"+str(ctr)+op_str2
                no_of_rows[merged_name]=len(res)
                comp_score.setdefault(comp,[]).append(merged_name)
        display_individual_matches(matching_tables,matching_tables_dict)
    else:
        def similar(t1,t2):
            key=t1+' : '+t2
            return key in cat_sim and cat_sim[key]>=.50 and col_sim[key]>=0.50
        ctr=0
        all_combos=generate_all_combinations(matching_tables)
        for i in range(x,1,-1):
            for combo in all_combos[i]:
                a=list(combo)
                # a combination is valid only if every pair in it is similar enough
                if not all(similar(t1,t2) for t1,t2 in combinations(a,2)):
                    continue
                res=merge_list_knowledge_graph(a)
                if type(res) == type(-1):
                    continue
                ctr+=1
                op_string=op_str1+str(ctr)+op_str2
                res.to_csv(op_string,sep=',', index=False)
                cols=res.columns
                cov,comp=coverage_and_completeness(res)
                with open(results_path,'a') as f:
                    def report(*args,**kwargs):
                        # every line goes to the results file and to the console
                        print(*args,file=f,**kwargs)
                        print(*args,**kwargs)
                    report(str(ctr)+op_str2+' : ',end='')
                    for j in a:
                        report(j+'\t\t',end='')
                    report()
                    if transformations!=-1:
                        report("possible transformations are : ",get_possible_transformations(cols))
                    if stats!=-1:
                        sat,unsat=check_stats(res)
                        if len(sat)>0:
                            report("Conditions Satisfied : ",sat)
                        if len(unsat)>0:
                            report("Conditions Not Satisfied : ",unsat)
                    report('Missing Values(NANs): ',nan_score(table=res))
                    report("Columns that match with input_schema: ")
                    for j in a:
                        report(j+' : ',matching_tables_dict[j])
                    report("Coverage Score : ",cov,"\t Completeness Score : ",comp)
                    report()
                merged_name="output_"+str(ctr)+op_str2
                no_of_rows[merged_name]=len(res)
                comp_score.setdefault(comp,[]).append(merged_name)
        display_individual_matches(matching_tables,matching_tables_dict)
    with open(results_path,'a') as f:
        print('******************************',file=f)
    ranking_display(comp_score,no_of_rows)

In [46]:
# Parse the query: the schema is mandatory; categories, transformations and
# stats are optional and default to the sentinel -1 when absent.
s=json.loads('{"schema": {"round": "int64", "round 1": "object", "circuit": "object", "day": "object", "pole position": "object", "fastest lap": "object", "driver": "object"}}'.lower())
input_schema=s['schema']
input_categories=s.get("categories",-1)
transformations=s.get("transformations",-1)
stats=s.get("stats",-1)
# display query
print("Input Schema : ",input_schema)
if input_categories!=-1:
    print("Input Categories : ",input_categories)
else:
    print("Input Categories : None")
if transformations!=-1:
    print("Input Transformations : ",transformations)
else:
    print("Input Transformations : None")
if stats!=-1:
    print("Input Content based Conditions : ",stats)
else:
    print("Input Condition based conditions : None")
print()
# reset the ranking state, then report and rank the direct matches
comp_score={}
no_of_rows={}
direct_matches_only()
ranking_display(comp_score,no_of_rows)

Input Schema :  {'round': 'int64', 'round 1': 'object', 'circuit': 'object', 'day': 'object', 'pole position': 'object', 'fastest lap': 'object', 'driver': 'object'}
Input Categories : None
Input Transformations : None
Input Condition based conditions : None

Matching Tables 
203_514.csv  :  2008 Superleague Formula season
204_253.csv  :  1990 Superbike World Championship season
204_40.csv  :  2008 Superbike World Championship season
204_569.csv  :  1998 Swedish Touring Car Championship season
204_845.csv  :  2003 Barber Dodge Pro Series season


203_514.csv(2008 Superleague Formula season)
Missing Values(NANs score):  {'round': '0/12', 'round 1': '0/12', 'race': '0/12', 'date': '0/12', 'pole position': '6/12', 'fastest lap': '0/12', 'winning club': '0/12', 'winning team': '0/12', 'report': '0/12'}
Columns that match with input_schema:
 203_514.csv :  {'round': 'round', 'round 1': 'round 1', 'pole position': 'pole position', 'fastest lap': 'fastest lap'}
Coverage Score :  0.57142857142

In [47]:
# Query: requested schema as JSON (column name -> dtype string), lower-cased
# as a whole before it is parsed.
s = json.loads(
    '{"schema": {"round": "int64", "round 1": "object", "circuit": "object", "day": "object", "pole position": "object", "fastest lap": "object", "driver": "object"}}'.lower()
)

# Sections missing from the query are represented by the sentinel -1.
input_schema = s['schema']
input_categories = s.get("categories", -1)
transformations = s.get("transformations", -1)
stats = s.get("stats", -1)

# display query
print("Input Schema : ", input_schema)
# Each row: (message when the section is absent, label when present, value).
# The two stats labels differ in wording; they are reproduced verbatim.
for _absent_msg, _present_label, _section in (
    ("Input Categories : None", "Input Categories : ", input_categories),
    ("Input Transformations : None", "Input Transformations : ", transformations),
    ("Input Condition based conditions : None", "Input Content based Conditions : ", stats),
):
    if _section == -1:
        print(_absent_msg)
    else:
        print(_present_label, _section)
print()

# Empty score / row-count tables for this run; both are passed to
# ranking_display() after the extended-match search has been executed.
comp_score = {}
no_of_rows = {}
check_possible_extended_matches()
ranking_display(comp_score, no_of_rows)

Input Schema :  {'round': 'int64', 'round 1': 'object', 'circuit': 'object', 'day': 'object', 'pole position': 'object', 'fastest lap': 'object', 'driver': 'object'}
Input Categories : None
Input Transformations : None
Input Condition based conditions : None

Matching Tables
203_181.csv  :  1990 IndyCar season
203_408.csv  :  1989 Formula One season
203_514.csv  :  2008 Superleague Formula season
203_742.csv  :  1995 IndyCar season
204_253.csv  :  1990 Superbike World Championship season
204_40.csv  :  2008 Superbike World Championship season
204_455.csv  :  1989 Formula One season
204_569.csv  :  1998 Swedish Touring Car Championship season
204_63.csv  :  2002 Italian Formula Three season
204_845.csv  :  2003 Barber Dodge Pro Series season


203_181.csv(1990 IndyCar season)
Missing Values(NANs score):  {'date': '0/17', 'rnd': '0/17', 'race name': '0/17', 'circuit': '0/17', 'city/location': '0/17', 'pole position': '0/17', 'winning driver': '0/17', 'winning team': '0/17', 'report': '0/

In [48]:
# Query: requested schema plus a list of categories, given as JSON and
# lower-cased as a whole before it is parsed.
s = json.loads(
    '{"schema": {"round": "int64", "round 1": "object", "circuit": "object", "day": "object", "pole position": "object", "fastest lap": "object", "driver": "object"},"categories":["motorsport", "car", "seasons"]}'.lower()
)

# Sections missing from the query are represented by the sentinel -1.
input_schema = s['schema']
input_categories = s.get("categories", -1)
transformations = s.get("transformations", -1)
stats = s.get("stats", -1)

# display query
print("Input Schema : ", input_schema)
# Each row: (message when the section is absent, label when present, value).
# The two stats labels differ in wording; they are reproduced verbatim.
for _absent_msg, _present_label, _section in (
    ("Input Categories : None", "Input Categories : ", input_categories),
    ("Input Transformations : None", "Input Transformations : ", transformations),
    ("Input Condition based conditions : None", "Input Content based Conditions : ", stats),
):
    if _section == -1:
        print(_absent_msg)
    else:
        print(_present_label, _section)
print()

# Empty score / row-count tables for this run; both are passed to
# ranking_display() after the category-aware search has been executed.
comp_score = {}
no_of_rows = {}
check_possible_category_included_matches()
ranking_display(comp_score, no_of_rows)

Input Schema :  {'round': 'int64', 'round 1': 'object', 'circuit': 'object', 'day': 'object', 'pole position': 'object', 'fastest lap': 'object', 'driver': 'object'}
Input Categories :  ['motorsport', 'car', 'seasons']
Input Transformations : None
Input Condition based conditions : None

Matching Tables
203_181.csv  :  1990 IndyCar season
203_514.csv  :  2008 Superleague Formula season
203_742.csv  :  1995 IndyCar season
204_253.csv  :  1990 Superbike World Championship season
204_569.csv  :  1998 Swedish Touring Car Championship season
204_63.csv  :  2002 Italian Formula Three season


203_181.csv(1990 IndyCar season)
Missing Values(NANs score):  {'date': '0/17', 'rnd': '0/17', 'race name': '0/17', 'circuit': '0/17', 'city/location': '0/17', 'pole position': '0/17', 'winning driver': '0/17', 'winning team': '0/17', 'report': '0/17'}
Columns that match with input_schema:
 203_181.csv :  {'circuit': 'circuit', 'pole position': 'pole position', 'date': 'day', 'winning driver': 'driver'}


In [59]:
def merge_list(l, min_fill=0.70):
    """Outer-merge several CSV tables and align the result with ``input_schema``.

    The files are folded together left to right. Before each merge the columns
    of the incoming table are renamed with the mapping returned by
    ``generate_matching_columns`` so that it shares column names with the
    running result. Afterwards the result columns are renamed to the names used
    in the global ``input_schema`` in three stages (exact name, ontology
    synonym, unique substring), always requiring equal dtype strings.

    Parameters
    ----------
    l : list of str
        CSV file names; ``l[0]`` must also be a key of ``all_schemas``.
        A single-element list is accepted and simply aligns that one table.
    min_fill : float, optional
        Columns that are not part of ``input_schema`` are dropped when their
        share of non-null values is below this fraction (default 0.70).

    Returns
    -------
    pandas.DataFrame or int
        The merged, renamed and pruned table, or ``-1`` if any merge raises.
    """
    res = pd.read_csv(l[0])
    # The first merge uses the preloaded schema of l[0]; later merges use the
    # dtypes of the running result.
    known_cols = all_schemas[l[0]]
    for fname in l[1:]:
        matching_columns, _ = generate_matching_columns(known_cols, fname)
        table = pd.read_csv(fname).rename(columns=matching_columns)
        try:
            res = res.merge(table, how='outer')
        except Exception:
            # Merge failures (e.g. no common columns, incompatible key dtypes)
            # are reported through the -1 sentinel. KeyboardInterrupt and
            # SystemExit are deliberately no longer swallowed.
            return -1
        known_cols = {col: str(dtype) for col, dtype in zip(res.columns, res.dtypes)}

    res_cols = {col: str(dtype) for col, dtype in zip(res.columns, res.dtypes)}

    # Stage 1: identical column name and dtype.
    cols1 = {col: col for col, d_type in res_cols.items()
             if col in input_schema and d_type == input_schema[col]}
    a = {k: v for k, v in input_schema.items() if k not in cols1}

    # Stage 2: result column is among the ontology terms of an input column.
    # Columns already matched exactly are skipped so stage 1 is not overridden.
    a_onto = generate_ontology(a)
    cols2 = {col1: col
             for col1, d_type1 in res_cols.items() if col1 not in cols1
             for col in a_onto
             if col1 in a_onto[col] and d_type1 == a[col]}

    # cols2 maps result column -> input column, so matched *input* columns are
    # its values, and matched *result* columns are its keys.
    matched_inputs = set(cols2.values())
    a = {k: v for k, v in a.items() if k not in matched_inputs}
    cols = {**cols1, **cols2}
    taken = set(cols) | set(cols.values())
    res_cols = {k: v for k, v in res_cols.items() if k not in taken}

    # Stage 3: one name contains the other (both longer than 3 characters);
    # accepted only when exactly one candidate remains.
    for col1, d_type1 in a.items():
        x = [col2 for col2, d_type2 in res_cols.items()
             if len(col1) > 3 and len(col2) > 3
             and (col1 in col2 or col2 in col1)
             and d_type1 == d_type2]
        if len(x) == 1:
            cols[x[0]] = col1

    res = res.rename(columns=cols)
    # Drop sparsely populated columns the query did not ask for.
    sparse = [col for col in res.columns
              if col not in input_schema and res[col].count() < min_fill * len(res)]
    return res.drop(columns=sparse)

In [68]:
# schema, transformations and content based conditions
# The query is given as JSON and lower-cased as a whole before it is parsed.
s = json.loads(
    '{"schema": {"nationality": "object", "tonnage (grt)": "int64", "fate": "object","ship":"object"}}'.lower()
)

# Sections missing from the query are represented by the sentinel -1.
input_schema = s["schema"]
input_categories = s.get("categories", -1)
transformations = s.get("transformations", -1)
stats = s.get("stats", -1)

# display query
print("Input Schema : ", input_schema)
# Each row: (message when the section is absent, label when present, value).
# The two stats labels differ in wording; they are reproduced verbatim.
for _absent_msg, _present_label, _section in (
    ("Input Categories : None", "Input Categories : ", input_categories),
    ("Input Transformations : None", "Input Transformations : ", transformations),
    ("Input Condition based conditions : None", "Input Content based Conditions : ", stats),
):
    if _section == -1:
        print(_absent_msg)
    else:
        print(_present_label, _section)
print()

# Run the search for the query stored in the globals set above.
get_matches()

Input Schema :  {'nationality': 'object', 'tonnage (grt)': 'int64', 'fate': 'object', 'ship': 'object'}
Input Categories : None
Input Transformations : None
Input Condition based conditions : None

only schema
Matching Tables
202_117.csv  :  German submarine U-559
203_148.csv  :  German submarine U-9 (1935)
203_268.csv  :  German submarine U-502
204_100.csv  :  Hans-Rudolf Rosing


202_117.csv(German submarine U-559)
Missing Values(NANs score):  {'date': '0/6', 'ship': '0/6', 'nationality': '0/6', 'tonnage': '0/6', 'fate': '0/6'}
Columns that match with input_schema:
 202_117.csv :  {'ship': 'ship', 'nationality': 'nationality', 'fate': 'fate', 'tonnage': 'tonnage (grt)'}
Coverage Score :  1.0 	 Completeness Score :  1.0

203_148.csv(German submarine U-9 (1935))
Missing Values(NANs score):  {'date': '0/9', 'name': '0/9', 'nationality': '0/9', 'tonnage (grt)': '0/9', 'fate': '0/9'}
Columns that match with input_schema:
 203_148.csv :  {'nationality': 'nationality', 'tonnage (grt)': 'ton

In [64]:
# Load one source table from disk; as the last expression of the cell, the
# resulting DataFrame is rendered as the cell output for manual inspection.
pd.read_csv("204_806.csv")

Unnamed: 0,name,date,condition,ownership / access,notes
0,Château d'Angers,13th century,Substantially intact,City of Angers,"Walls nearly 2,000 feet (610 m) in circumferen..."
1,Château de Bauge,15th century,Intact,,Built as hunting lodge.
2,Château de Bourmont,16-19th century,Intact and extended,Private,Extended in Neo-Gothic style
3,Château de Brissac,15-17th century,Rebuilt,Private,"Damaged during French Wars of Religion, rebuil..."
4,Château de Champtoce,13-16th century,Ruins,,
5,Château de Montreuil-Bellay,11-13th century,Substantially intact,Private,"Divided into rental units in 1822, restored af..."
6,Château de Montsoreau,1455,Restored,Departement,"Ruinous by late 19th century, restored, houses..."
7,Château du Plessis-Bourre,1468-1472,Intact,Private (open to the public),"Externally unchanged since the 15th century, w..."
8,Château du Plessis-Mace,13-16th century,Intact,,Converted to house 15th century.
9,Château de Pouance,12-13th century,Ruins,,"Considered second fortress of Anjou, after Ang..."


In [71]:
# schema, transformations and content based conditions
# The query is given as JSON and lower-cased as a whole before it is parsed.
s = json.loads(
    '{"schema": {"nationality": "object", "tonnage (grt)": "int64", "fate": "object","ship":"object"}}'.lower()
)

# Optional sections of the query fall back to the sentinel -1 when absent.
input_schema = s["schema"]
input_categories = s.get("categories", -1)
transformations = s.get("transformations", -1)
stats = s.get("stats", -1)

# display query
print("Input Schema : ", input_schema)
# Each row: (message when the section is absent, label when present, value).
# The two stats labels differ in wording; they are reproduced verbatim.
for _absent_msg, _present_label, _section in (
    ("Input Categories : None", "Input Categories : ", input_categories),
    ("Input Transformations : None", "Input Transformations : ", transformations),
    ("Input Condition based conditions : None", "Input Content based Conditions : ", stats),
):
    if _section == -1:
        print(_absent_msg)
    else:
        print(_present_label, _section)
print()

# Run the search for the query stored in the globals set above.
get_matches()

Input Schema :  {'nationality': 'object', 'tonnage (grt)': 'int64', 'fate': 'object', 'ship': 'object'}
Input Categories : None
Input Transformations : None
Input Condition based conditions : None

only schema
Matching Tables
202_117.csv  :  German submarine U-559
203_148.csv  :  German submarine U-9 (1935)
203_268.csv  :  German submarine U-502
204_100.csv  :  Hans-Rudolf Rosing


202_117.csv(German submarine U-559)
Missing Values(NANs score):  {'date': '0/6', 'ship': '0/6', 'nationality': '0/6', 'tonnage': '0/6', 'fate': '0/6'}
Columns that match with input_schema:
 202_117.csv :  {'ship': 'ship', 'nationality': 'nationality', 'fate': 'fate', 'tonnage': 'tonnage (grt)'}
Coverage Score :  1.0 	 Completeness Score :  1.0

203_148.csv(German submarine U-9 (1935))
Missing Values(NANs score):  {'date': '0/9', 'name': '0/9', 'nationality': '0/9', 'tonnage (grt)': '0/9', 'fate': '0/9'}
Columns that match with input_schema:
 203_148.csv :  {'nationality': 'nationality', 'tonnage (grt)': 'ton