In [1]:
import json
import math
import urllib
import urllib.error
import urllib.parse
import urllib.request
from itertools import combinations

import numpy as np
import pandas as pd
from IPython.display import display
from nltk.corpus import wordnet

In [2]:
# Preload everything that is already computed for the dataset: schemas,
# categories, table names, candidate keys and cosine-similarity values.

# "final_schema.txt" holds one JSON object per line; each object carries the
# keys "filename", "schema", "categories" and "tablename".
all_schemas, all_categories, all_tablenames = {}, {}, {}
with open("final_schema.txt") as ip_file:
    for line in ip_file:
        json_obj = json.loads(line)
        table_file = json_obj["filename"]
        all_schemas[table_file] = json_obj["schema"]
        all_categories[table_file] = json_obj["categories"]
        all_tablenames[table_file] = json_obj["tablename"]

# Candidate keys are stored as one JSON document.
with open("Candidate_key_dict.txt", 'r') as ip_file:
    cand_key = json.load(ip_file)

# Column and category similarity values between tables, also one JSON document.
with open("cos_similarity.txt", 'r') as ip_file:
    json_object = json.load(ip_file)
col_sim = json_object["column_similarity"]
cat_sim = json_object["category_similarity"]

In [3]:
# all functions needed to generate ontologies
def get_synonyms(word):
    """Return the noun "meanings" WordNet associates with ``word``.

    The result contains every lemma name of every noun synset of ``word``,
    the first lemma name of each direct hypernym of those synsets, and
    ``word`` itself. It is built from a set, so entries are unique and the
    order of the returned list is not meaningful.
    """
    noun_synsets = wordnet.synsets(word, pos=wordnet.NOUN)
    meanings = set()
    # Synonyms: all lemma names of the noun synsets.
    meanings.update(lemma.name()
                    for synset in noun_synsets
                    for lemma in synset.lemmas())
    # Broader terms: only the first lemma name of each direct hypernym.
    meanings.update(hypernym.lemma_names()[0]
                    for synset in noun_synsets
                    for hypernym in synset.hypernyms())
    # The word always belongs to its own ontology, even if WordNet has no entry.
    meanings.add(word)
    return list(meanings)

# takes input list and returns ontology as dictionary with every word in list as the key
def generate_ontology(list1):
    """Build an ontology: a dict mapping each word to its list of synonyms.

    ``list1`` may be any iterable of words. Passing a dict (for example a
    schema) iterates its keys, so the column names become the ontology keys.
    Synonyms come from ``get_synonyms``.
    """
    return {word: get_synonyms(word) for word in list1}

In [4]:
# Demo: build an ontology for a schema dict and for a list of category words.
# Iterating the dict x yields its keys, so only the column names are expanded;
# the dtypes stored as values are ignored by generate_ontology.
x={'date': 'object', 'nationality': 'object', 'tonnage': 'int64', 'fate': 'object'}
y=['indian','diabetes','medical','association']
# Each print shows the input followed by the ontology generated from it.
print(x,"\n","\n",generate_ontology(x),"\n")
print(y,"\n","\n",generate_ontology(y))

{'date': 'object', 'nationality': 'object', 'tonnage': 'int64', 'fate': 'object'} 
 
 {'date': ['meeting', 'particular_date', 'date', 'calendar_month', 'calendar_year', 'calendar_day', 'appointment', 'companion', 'point', 'edible_fruit', 'day_of_the_month', 'escort', 'day', 'present', 'engagement'], 'nationality': ['people', 'nationality', 'status'], 'tonnage': ['tunnage', 'tonnage_duty', 'duty', 'tonnage'], 'fate': ['destiny', 'luck', 'causal_agent', 'portion', 'fortune', 'condition', 'circumstances', 'fate', 'lot', 'happening']} 

['indian', 'diabetes', 'medical', 'association'] 
 
 {'indian': ['Red_Indian', 'Indian', 'Amerindian', 'Amerind', 'Asian', 'natural_language', 'American_Indian', 'indian', 'American-Indian_language', 'Amerindian_language'], 'diabetes': ['diabetes', 'polygenic_disorder'], 'medical': ['medical', 'medical_checkup', 'medical_examination', 'health_check', 'medical_exam', 'checkup', 'examination'], 'association': ['connexion', 'relationship', 'memory', 'tie-up', 

In [5]:
# Transformation functions that may be requested for a column, keyed by the
# column's dtype, plus an ontology (synonym lists) for every function name.
transform_funct_list = {
    "int64": ['average', 'sum', 'maximum', 'minimum', 'range', 'median', 'variance',
              'standard deviation', 'mode', 'frequency', 'avg'],
    "float64": ['average', 'sum', 'maximum', 'minimum', 'range', 'median', 'variance',
                'standard deviation', 'mode', 'frequency', 'avg'],
    "object": ["funct1", "funct2", "funct3"],
}
tf_onto = {}
for dtype, funct_list in transform_funct_list.items():
    tf_onto[dtype] = generate_ontology(funct_list)

# Cache of knowledge-graph lookups, filled lazily and keyed by column name.
kg_data = {}

# Comparison signs and statistical function names accepted in "stats"
# conditions, with an ontology so that synonyms of a function are recognised.
signs = ["<=", ">=", "<", ">", "=", "!="]
functs = ['count', 'mean', 'standard deviation', 'min', 'max', '25%', '50%', '75%']
functs_onto = generate_ontology(functs)
# Extra aliases that are added by hand on top of the generated synonyms.
for funct_key, funct_alias in (('25%', "first quartile"),
                               ('75%', "third quartile"),
                               ('50%', "second quartile"),
                               ('min', "minimum"),
                               ('max', "maximum")):
    functs_onto[funct_key].append(funct_alias)

In [6]:
# Read the input request: 1) schema, 2) categories, 3) transformations and
# 4) statistical conditions. Only the schema is mandatory.
with open("input_tranformations.txt", 'r') as ip_file:
    ip_schema = json.load(ip_file)
input_schema = ip_schema["schema"]

# Every optional section falls back to the sentinel -1 when it is absent;
# later cells compare against -1 to find out whether the section was given.

# Categories, when present, give more accurate output matches.
input_categories = ip_schema["categories"] if "categories" in ip_schema else -1
# Transformations, when present, must be checked for applicability to the
# columns the user asked for.
transformations = ip_schema["transformations"] if "transformations" in ip_schema else -1
# Statistical conditions, when present, are checked for being satisfied or not.
stats = ip_schema["stats"] if "stats" in ip_schema else -1

In [7]:
# to generate cos similarity between two lists
def cos_sim(list1, list2):
    """Cosine similarity between two collections of terms, treated as sets.

    Each collection is viewed as a binary vector over the union of terms, so
    the score is ``|A & B| / (sqrt(|A|) * sqrt(|B|))``. Repeated terms are
    counted once on both sides; using the raw list lengths for the magnitudes
    (as an earlier version did) made duplicates lower the score even though
    they do not change the set of shared terms.

    Parameters
    ----------
    list1, list2 : iterable of hashable
        The terms to compare.

    Returns
    -------
    float
        A value in [0, 1] (up to floating-point rounding); 0.0 when either
        collection is empty.
    """
    terms1, terms2 = set(list1), set(list2)
    if not terms1 or not terms2:
        return 0.0
    shared = len(terms1 & terms2)
    return shared / (math.sqrt(len(terms1)) * math.sqrt(len(terms2)))

In [8]:
# Demo: the two lists have six terms each and share three of them
# ("boat", "location", "nationality").
a=["fate","tonnage","cost","boat","location","nationality"]
b=["boat","location","nationality","name","port","captian"]
print("a : ",a)
print("b : ",b)
print("cosine similarity score between a and b : ",cos_sim(a,b))

a :  ['fate', 'tonnage', 'cost', 'boat', 'location', 'nationality']
b :  ['boat', 'location', 'nationality', 'name', 'port', 'captian']
cosine similarity score between a and b :  0.5000000000000001


In [9]:
# if input has only schema (columns and their datatypes)
# it returns all tables with more than one column that matches the input schema (or its ontology)
def _match_columns(schema, input_schema, input_sch_onto):
    """Map the columns of one table onto the columns of the input schema.

    Returns a dict ``{table_column: input_column}`` built in three passes,
    each requiring equal dtypes:

    1. exact name match;
    2. the table column appears in the ontology (synonym list) of an input
       column;
    3. for input columns still unmatched, a substring match in either
       direction between names longer than three characters, accepted only
       when exactly one table column qualifies.

    Table columns matched in an earlier pass are not offered to a later one.
    """
    # Pass 1: identical name and dtype.
    cols = {col: col for col, d_type in schema.items()
            if col in input_schema and d_type == input_schema[col]}
    remaining = {col: d_type for col, d_type in schema.items() if col not in cols}

    # Pass 2: the table column is a known synonym of an input column.
    onto_cols = {col: in_col
                 for col, d_type in remaining.items()
                 for in_col in input_sch_onto
                 if col in input_sch_onto[in_col] and d_type == input_schema[in_col]}
    cols.update(onto_cols)
    remaining = {col: d_type for col, d_type in remaining.items() if col not in onto_cols}

    # Pass 3: unique substring match for the input columns nothing mapped to yet.
    matched_inputs = set(cols.values())
    for in_col, in_type in input_schema.items():
        if in_col in matched_inputs:
            continue
        candidates = [col for col, d_type in remaining.items()
                      if len(in_col) > 3 and len(col) > 3
                      and (in_col in col or col in in_col)
                      and in_type == d_type]
        if len(candidates) == 1:
            cols[candidates[0]] = in_col
    return cols


def col_only_list(input_schema, input_sch_onto):
    """Find candidate tables when the input has a schema only.

    Parameters
    ----------
    input_schema : dict
        ``{column_name: dtype}`` requested by the user.
    input_sch_onto : dict
        ``{column_name: [synonyms]}`` for the same columns.

    Returns
    -------
    dict
        ``{filename: {table_column: input_column}}`` for every table in
        ``all_schemas`` with more than one matched column.
    """
    possible_tables = {}
    for file, schema in all_schemas.items():
        cols = _match_columns(schema, input_schema, input_sch_onto)
        if len(cols) > 1:
            possible_tables[file] = cols
    return possible_tables


# If the input has categories as well as a schema, a table is considered only
# when the cosine similarity between its matched categories and the input
# categories is above 0.75; the column matching is then the same as above.
def cat_and_col_list(input_categories, input_cat_onto, input_schema, input_sch_onto):
    """Find candidate tables when the input has categories and a schema.

    A table category counts as matched when it equals an input category or
    appears in the ontology of one. Tables whose matched categories reach a
    cosine similarity above 0.75 with ``input_categories`` go through the
    same column matching as :func:`col_only_list`.

    Returns
    -------
    dict
        ``{filename: {table_column: input_column}}`` for every qualifying
        table with more than one matched column. The mapping direction is the
        same as in :func:`col_only_list`, so the result can be passed to
        ``DataFrame.rename(columns=...)``.
    """
    possible_tables = {}
    for file, category in all_categories.items():
        direct = [cat for cat in category if cat in input_categories]
        others = [cat for cat in category if cat not in direct]
        via_onto = [in_cat for cat in others for in_cat in input_cat_onto
                    if cat in input_cat_onto[in_cat]]
        if cos_sim(direct + via_onto, input_categories) > 0.75:
            cols = _match_columns(all_schemas[file], input_schema, input_sch_onto)
            if len(cols) > 1:
                possible_tables[file] = cols
    return possible_tables

In [10]:
# Demo of both candidate-table searches with a hand-written request.
# NOTE(review): this cell rebinds the globals input_schema and input_categories
# that an earlier cell loaded from "input_tranformations.txt". Functions defined
# later read these globals, so they will see the demo values once this cell has
# run — confirm that this is intended.
input_schema={'date': 'object', 'nationality': 'object', 'tonnage (grt)': 'int64', 'fate': 'object'}
input_sch_onto=generate_ontology(input_schema)
print("Input has schema only. All possible tables with one or more columns matching : ")
# Each entry maps a file name to its {table_column: input_column} matches.
for i,j in col_only_list(input_schema,input_sch_onto).items():
    print(i," : ",j)
input_categories=["submarines","germany","shipwrecks","navy","uboat"]
input_cat_onto=generate_ontology(input_categories)
print("\nInput has schema and categories. All possible tables with 75% category match and one or more column match : ")
for i,j in cat_and_col_list(input_categories,input_cat_onto,input_schema,input_sch_onto).items():
    print(i," : ",j)

Input has schema only. All possible tables with one or more columns matching : 
202_117.csv  :  {'date': 'date', 'nationality': 'nationality', 'fate': 'fate', 'tonnage': 'tonnage (grt)'}
203_102.csv  :  {'date': 'date', 'nation': 'nationality'}
203_148.csv  :  {'date': 'date', 'nationality': 'nationality', 'tonnage (grt)': 'tonnage (grt)', 'fate': 'fate'}
203_268.csv  :  {'date': 'date', 'nationality': 'nationality', 'tonnage (grt)': 'tonnage (grt)', 'fate': 'fate'}
203_433.csv  :  {'nationality': 'nationality', 'date': 'date'}
203_481.csv  :  {'nationality': 'nationality', 'date of birth': 'date'}
203_706.csv  :  {'status': 'nationality', 'dates': 'date'}
203_760.csv  :  {'status': 'nationality', 'appointment': 'date'}
204_100.csv  :  {'date': 'date', 'nationality': 'nationality', 'fate': 'fate', 'tonnage': 'tonnage (grt)'}
204_151.csv  :  {'date': 'date', 'day': 'date'}
204_272.csv  :  {'date': 'date', 'nationality': 'nationality'}
204_434.csv  :  {'date': 'date', 'status': 'national

In [11]:
# function generates all possible combinations of list l taking elements n to 2 at a time and returns a dictionary
def generate_all_combinations(l):
    """Return every combination of ``l`` of size ``len(l)`` down to 2.

    The result is a dict keyed by combination size, largest size first; each
    value is the list of tuples produced by ``itertools.combinations`` for
    that size. Inputs with fewer than two elements give an empty dict.
    """
    return {size: list(combinations(l, size)) for size in range(len(l), 1, -1)}

In [12]:
# Demo: all groups of four, three and two files out of four candidate tables.
generate_all_combinations(['202_117.csv', '203_148.csv', '203_268.csv', '204_100.csv'])

{4: [('202_117.csv', '203_148.csv', '203_268.csv', '204_100.csv')],
 3: [('202_117.csv', '203_148.csv', '203_268.csv'),
  ('202_117.csv', '203_148.csv', '204_100.csv'),
  ('202_117.csv', '203_268.csv', '204_100.csv'),
  ('203_148.csv', '203_268.csv', '204_100.csv')],
 2: [('202_117.csv', '203_148.csv'),
  ('202_117.csv', '203_268.csv'),
  ('202_117.csv', '204_100.csv'),
  ('203_148.csv', '203_268.csv'),
  ('203_148.csv', '204_100.csv'),
  ('203_268.csv', '204_100.csv')]}

In [13]:
# for input table and column, get a list of all categories that the terms(elements) in that column belong to
def extract_info_from_knowledge_graphs(
        a, col1,
        api_key_path="C:\\Users\\adith\\Desktop\\my_google_knowledge_graph_api_key.txt",
        max_attempts=3):
    """Count the Knowledge Graph labels of every value in column ``col1``.

    One search request is sent per row. For each entity returned, every entry
    of its ``@type`` list and its ``description`` (when present) are
    lower-cased and tallied.

    Parameters
    ----------
    a : pandas.DataFrame (or any mapping of column name to iterable)
        Table holding the column.
    col1 : str
        Name of the column whose values are looked up.
    api_key_path : str, optional
        File whose first line holds the API key. Defaults to the original
        author's local path; pass your own path when running elsewhere.
    max_attempts : int, optional
        How many times a request is tried before the row is given up on.

    Returns
    -------
    (dict, int)
        ``{label: occurrences}`` and the number of rows that produced no
        data: float values (which is how pandas stores missing entries in a
        text column) and rows whose request failed ``max_attempts`` times
        with an HTTP error.
    """
    with open(api_key_path, "r") as f:
        # strip() drops the trailing newline, which would otherwise be sent
        # to the service as part of the key.
        api_key = f.readline().strip()
    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    params = {'limit': 50, 'indent': True, 'key': api_key}
    s = {}
    ctr = 0
    for row in a[col1]:
        if isinstance(row, float):
            ctr += 1
            continue
        params['query'] = row
        url = service_url + '?' + urllib.parse.urlencode(params)

        response = None
        for _ in range(max_attempts):
            try:
                with urllib.request.urlopen(url) as reply:
                    response = json.loads(reply.read())
                break
            except urllib.error.HTTPError:
                continue
        if response is None:
            # Every attempt failed: count the row as skipped and move on, so
            # the previous row's response is not tallied a second time.
            ctr += 1
            continue

        for element in response['itemListElement']:
            result = element["result"]
            labels = [kind.lower() for kind in result["@type"]]
            if "description" in result:
                labels.append(result["description"].lower())
            for label in labels:
                s[label] = s.get(label, 0) + 1
    return s, ctr

In [14]:
# Demo: tally the knowledge-graph labels found for the "name" column of one table.
a=pd.read_csv("203_148.csv")
# s maps each label to its number of occurrences; ctr counts the skipped rows.
s,ctr=extract_info_from_knowledge_graphs(a,"name")
print(s)

{'thing': 292, 'organization': 18, 'sportsteam': 13, 'cycling team': 1, 'soccer club': 5, 'book': 3, 'book by antoon sanders': 1, 'basketball club': 1, 'corporation': 8, 'record label': 1, 'soccer team': 1, 'ship': 13, 'place': 24, 'professional sports team': 1, 'person': 118, "dulce of aragon's son": 1, 'event': 13, 'philosopher': 1, 'argentine goalkeeper': 1, 'soccer forward': 1, 'soccer player': 3, 'road cycling team': 3, 'argentine soccer player': 3, 'plants': 1, 'city': 9, 'town in argentina': 1, 'argentinean soccer player': 1, 'sportsorganization': 1, 'league': 1, 'soccer goalkeeper': 1, 'aerospace company': 1, 'country': 2, 'administrativearea': 4, 'country in central africa': 1, 'company': 2, 'bank': 1, 'musiccomposition': 1, 'opera by claudio monteverdi': 1, 'city in chile': 1, 'disaster': 1, 'stadiumorarena': 2, 'arena in hämeenlinna, finland': 1, 'tvseries': 3, 'brazilian telenovela': 2, 'military conflict': 1, 'movie': 4, '1917 film': 1, 'revolutionary': 1, 'song by franco 

In [15]:
# using data from google knowledge graphs, try to obtain alternate names for columns that do not match to merge them if they
# represent the same entity
# returned values are in the form (final_name,old_name)
def get_alternate_col_name(a, b, col1, col2):
    """Decide whether two differently named columns describe the same entity.

    The knowledge-graph labels of both columns are fetched (or taken from the
    ``kg_data`` cache). If one of the two column names is itself a label of
    both columns, and occurs at least ``len(table) - 1 - skipped_rows`` times
    for each of them, that name is chosen for both columns. ``col1`` is tried
    first, then ``col2``.

    On success the cache entry of the dropped name is merged into the kept
    one and removed from ``kg_data``.

    Parameters
    ----------
    a, b : pandas.DataFrame
        Tables that contain ``col1`` and ``col2`` respectively.
    col1, col2 : str
        The column names to compare.

    Returns
    -------
    tuple
        ``(final_name, old_name)`` when the columns can be unified, otherwise
        ``(0, 0)``. A two-element tuple is always returned, so callers can
        unpack the result unconditionally.
    """
    global kg_data
    for table, col in ((a, col1), (b, col2)):
        if col not in kg_data:
            groups, count = extract_info_from_knowledge_graphs(table, col)
            kg_data[col] = {"groups": groups, "count": count}

    s1 = kg_data[col1]["groups"]
    s2 = kg_data[col2]["groups"]
    # A label has to cover (almost) every row that could be looked up.
    needed1 = len(a) - 1 - kg_data[col1]["count"]
    needed2 = len(b) - 1 - kg_data[col2]["count"]

    for keep, drop in ((col1, col2), (col2, col1)):
        if keep in s1 and keep in s2 and s1[keep] >= needed1 and s2[keep] >= needed2:
            kg_data[keep]["groups"][keep] += kg_data[drop]["groups"][keep]
            kg_data[keep]["count"] += kg_data[drop]["count"]
            del kg_data[drop]
            return (keep, drop)
    return 0, 0

In [16]:
# Demo: check whether column "name" of one table and column "ship" of another
# can share a name; the result is printed as (final_name, old_name).
a=pd.read_csv("203_148.csv")
b=pd.read_csv("202_117.csv")
x=get_alternate_col_name(a,b,"name","ship")
print(x)

('ship', 'name')


In [17]:
# using google knowledge graph update the column names and update tables (if possible) to improve merge accuracy and improve completeness
def graph_match(c, t1, t2):
    """Unify one pair of columns of ``t1`` and ``t2`` using knowledge-graph data.

    Pairs of names from ``c`` are tried in order. For the first pair that
    ``get_alternate_col_name`` can unify, one column is renamed in place so
    that both tables use the same name, both names are removed from ``c``
    and the function returns. If the old name belongs to ``input_schema`` it
    is the one that is kept; otherwise the name chosen by
    ``get_alternate_col_name`` is used.

    Returns
    -------
    tuple
        ``(t1, t2, c)`` after a successful rename (``c`` is the same list,
        now shorter), or ``(t1, t2, 0)`` when no pair could be unified.
    """
    for first, second in combinations(c, 2):
        # get_alternate_col_name expects the t1 column before the t2 column.
        if first in t1 and second in t2:
            a, b = get_alternate_col_name(t1, t2, first, second)
        elif second in t1 and first in t2:
            a, b = get_alternate_col_name(t1, t2, second, first)
        else:
            continue
        if a == 0 or b == 0:
            continue

        # Prefer the name the user asked for, if the old name is one.
        if b in input_schema:
            old_name, new_name = a, b
        else:
            old_name, new_name = b, a
        # Rename in whichever table does not already carry the kept name.
        target = t2 if new_name in t1 else t1
        target.rename(columns={old_name: new_name}, inplace=True)

        c.remove(a)
        c.remove(b)
        return t1, t2, c
    return t1, t2, 0

In [18]:
def gen_kg_data(c, a, res):
    """Make sure ``kg_data`` holds knowledge-graph labels for every column in ``c``.

    Columns already cached are left alone. For the others the labels are
    fetched from table ``a`` when it contains the column, otherwise from
    ``res``, and stored as ``{"groups": labels, "count": skipped_rows}``.
    """
    global kg_data
    for col in c:
        if col in kg_data:
            continue
        source = a if col in a else res
        groups, ctr = extract_info_from_knowledge_graphs(source, col)
        kg_data[col] = {"groups": groups, "count": ctr}

In [19]:
# function generates all matching columns between the res_cols schema and columns of table in fname
def generate_matching_columns(res_cols, fname):
    """Match the columns of table ``fname`` against the schema ``res_cols``.

    Matching runs in three passes, each requiring equal dtypes: exact name,
    membership in the ontology of a ``res_cols`` column, and finally a unique
    substring match between names longer than three characters.

    Parameters
    ----------
    res_cols : dict
        ``{column_name: dtype}`` of the table built so far.
    fname : str
        Key into ``all_schemas`` for the table to be matched.

    Returns
    -------
    (dict, list)
        ``{fname_column: res_cols_column}`` for the matched columns, and the
        names of all ``'object'`` columns from either schema that are not a
        key of that mapping (the candidates for knowledge-graph matching).
    """
    a = all_schemas[fname]
    c = {**res_cols, **a}
    res_onto = generate_ontology(res_cols)

    # Pass 1: identical name and dtype.
    cols1 = {col1: col1 for col1, d_type1 in a.items()
             if (col1 in res_cols) and d_type1 == res_cols[col1]}
    res_onto = {k: v for k, v in res_onto.items() if k not in cols1}
    a = {k: v for k, v in a.items() if k not in cols1}

    # Pass 2: the table column is a synonym of a res_cols column. The mapped
    # value is that column's name (``col``).
    cols2 = {col1: col for col1, d_type1 in a.items() for col in res_onto
             if col1 in res_onto[col] and d_type1 == res_cols[col]}
    a = {k: v for k, v in a.items() if k not in cols2}

    cols = {**cols1, **cols2}
    res_cols = {k: v for k, v in res_cols.items() if k not in list(cols.values())}

    # Pass 3: unique substring match among what is left on both sides.
    for col1, d_type1 in a.items():
        x = [col2 for col2, d_type2 in res_cols.items()
             if ((len(col1) > 3 and len(col2) > 3) and ((col1 in col2) or (col2 in col1)))
             and (d_type1 == d_type2)]
        if len(x) == 1:
            cols[col1] = x[0]

    matching_columns = cols
    c = {k: v for k, v in c.items() if k not in matching_columns and v == 'object'}
    return (matching_columns, list(c))

In [None]:
# returns a merged table of all tables given in input list l    
def _align_for_merge(left, left_schema, fname):
    """Load table ``fname`` and rename columns so matching ones share a name.

    Columns are first unified through knowledge-graph data (which may rename
    columns of either table in place), then the columns of the loaded table
    are renamed according to ``generate_matching_columns``. Returns the pair
    ``(left, right)`` ready to be merged.
    """
    matching_columns, c = generate_matching_columns(left_schema, fname)
    right = pd.read_csv(fname)
    gen_kg_data(c, left, right)
    if len(c) >= 2:
        left, right, c = graph_match(c, left, right)
        # graph_match returns 0 in place of the list once nothing more matches.
        while c != 0:
            left, right, c = graph_match(c, left, right)
    right.rename(columns=matching_columns, inplace=True)
    return left, right


def merge_list(l):
    """Outer-merge all tables named in ``l`` and rename columns to the input schema.

    The first table is read as is; every following table is aligned with the
    result built so far and outer-merged on their common columns. Finally the
    columns of the merged table are renamed to the names of ``input_schema``
    where they match by exact name, by ontology, or by a unique substring
    match (equal dtypes required in every case).

    Parameters
    ----------
    l : sequence of str
        CSV file names, which are also keys of ``all_schemas``.

    Returns
    -------
    pandas.DataFrame or int
        The merged table, or ``-1`` if any of the merges raises.
    """
    res = pd.read_csv(l[0])
    res_cols = all_schemas[l[0]]
    for position, fname in enumerate(l[1:]):
        if position > 0:
            # From the second merge on, the schema is that of the merged table.
            res_cols = {name: str(d_type) for name, d_type in zip(res.columns, res.dtypes)}
        res, other = _align_for_merge(res, res_cols, fname)
        try:
            res = res.merge(other, how='outer')
        except Exception:
            # A failed merge is reported to the caller as -1. Exception (not a
            # bare except) keeps KeyboardInterrupt and SystemExit propagating.
            return -1

    res_cols = {name: str(d_type) for name, d_type in zip(res.columns, res.dtypes)}

    # Pass 1: merged columns that already carry an input name with the same dtype.
    cols1 = {col: col for col, d_type in res_cols.items()
             if (col in input_schema) and d_type == input_schema[col]}
    a = {k: v for k, v in input_schema.items() if k not in cols1}

    # Pass 2: remaining merged columns that are synonyms of a remaining input column.
    a_onto = generate_ontology(a)
    cols2 = {col: in_col
             for col, d_type in res_cols.items() if col not in cols1
             for in_col in a_onto
             if col in a_onto[in_col] and d_type == a[in_col]}
    cols = {**cols1, **cols2}

    # Pass 3: unique substring match. Only input columns nothing maps to yet
    # and merged columns that are not mapped yet take part, so no two columns
    # end up renamed to the same input name by this pass.
    matched_inputs = set(cols.values())
    unmatched_res = {k: v for k, v in res_cols.items() if k not in cols}
    for col1, d_type1 in a.items():
        if col1 in matched_inputs:
            continue
        x = [col2 for col2, d_type2 in unmatched_res.items()
             if ((len(col1) > 3 and len(col2) > 3) and ((col1 in col2) or (col2 in col1)))
             and (d_type1 == d_type2)]
        if len(x) == 1:
            cols[x[0]] = col1

    res.rename(columns=cols, inplace=True)
    return res

In [None]:
# metrics calculated :
# 1)nan_score(number of nulls in each column)
# 2)coverage_score(no of matching columns with input schema/total number of columns in input schema)
# 3)completeness_score(a combination of coverage and nan scores to determine how complete the result dataset is)

# nan score = {x : (no of NaNs in column / no of entries in column)} where x is each column in the table
# i.e. for each column of the input table, the number of NaNs over the number of rows in the table
def nan_score(table=-1, fname=-1):
    """Report the missing values of every column as a ``"missing/total"`` string.

    Parameters
    ----------
    table : pandas.DataFrame, optional
        The table to inspect. Ignored when ``fname`` is given.
    fname : str, optional
        CSV file to read the table from; ``-1`` means "use ``table``".

    Returns
    -------
    dict
        ``{column: "<number of NaNs>/<number of rows>"}``.
    """
    if fname != -1:
        table = pd.read_csv(fname)
    total = len(table)
    # count() gives the non-null entries, so the difference is the NaN count.
    return {col: str(total - table[col].count()) + '/' + str(total)
            for col in table.columns}

#returns the coverage score and completeness score of a given table
#coverage score is calculated as : 
# coverage = (no of columns matching with input schema/total number of columns in input schema)
#completeness score is calculated as : 
# completeness = (sum(x*(non null entries)/(total entries in the column))/total number of columns in input schema) 
#  where x=1 if column present in input schema and x=0 if column is not present in the input schema
def coverage_and_completeness(table):
    """Return ``(coverage, completeness)`` of ``table`` against ``input_schema``.

    * coverage     = columns of ``table`` named in ``input_schema``
                     / number of columns in ``input_schema``
    * completeness = sum over those columns of (non-null entries / rows)
                     / number of columns in ``input_schema``

    A table without rows contributes 0 to completeness, and an empty
    ``input_schema`` gives ``(0.0, 0.0)``; neither case divides by zero.
    """
    wanted = len(input_schema)
    if wanted == 0:
        return (0.0, 0.0)
    rows = len(table)
    ctr = 0
    comp = 0.0
    for col in table.columns:
        if col in input_schema:
            ctr += 1
            if rows:
                comp += (rows - sum(pd.isnull(table[col]))) / rows
    return (ctr / wanted, comp / wanted)

In [None]:
def ranking_display(comp_score, no_of_rows):
    """Print the tables ranked by completeness score, best first.

    Parameters
    ----------
    comp_score : dict
        ``{completeness_score: [table names with that score]}``.
    no_of_rows : dict
        ``{table name: number of rows}``; tables sharing a score are listed
        with the larger table first.
    """
    print()
    scores_desc = sorted(comp_score, reverse=True)
    by_rows = sorted(no_of_rows, key=lambda name: no_of_rows[name], reverse=True)
    rank = 0
    for score in scores_desc:
        tied = comp_score[score]
        if len(tied) == 1:
            ordered = [tied[0]]
        else:
            # Break ties by row count, using the order of the row-count ranking.
            ordered = [name for name in by_rows if name in tied]
        for name in ordered:
            rank += 1
            print("Rank ",str(rank).ljust(2," ")," : ",name.ljust(20,' '),"completeness score : %0.16f"%(score),"\t\tnumber of rows: ",no_of_rows[name])
    print("\nRanking Complete!!\n")

In [None]:
# takes output tables schema(columns and data_types) as the input, compare it with input schema and transformations required and
# returns a list of all the transformations applicable
def get_possible_transformations(cols):
    """List, per requested column, the requested transformations that apply.

    A transformation applies to a column when it is listed in
    ``transform_funct_list`` for the dtype that ``input_schema`` declares for
    that column. Only columns present in ``cols`` are reported.

    A column whose dtype has no entry in ``transform_funct_list``, or which
    is missing from ``input_schema``, gets an empty list instead of raising
    ``KeyError``.

    Parameters
    ----------
    cols : container of str
        Column names of the output table.

    Returns
    -------
    dict
        ``{column: [applicable transformation names]}``, in the order the
        transformations were requested.
    """
    possible = {}
    for col, requested in transformations.items():
        if col not in cols:
            continue
        supported = transform_funct_list.get(input_schema.get(col), ())
        possible[col] = [name for name in requested if name in supported]
    return possible

In [None]:
# function returns a boolean result after checking if the condition is satisfied
def condition_check(sign, v, value):
    """Evaluate ``v <sign> value`` and return the result.

    ``sign`` is one of ``">="``, ``"<="``, ``"<"``, ``">"``, ``"="`` and
    ``"!="``; any other sign raises ``KeyError``. Only the requested
    comparison is evaluated, so ``"="`` and ``"!="`` also work for values
    that cannot be ordered against each other.
    """
    checks = {
        ">=": lambda: v >= value,
        "<=": lambda: v <= value,
        "<": lambda: v < value,
        ">": lambda: v > value,
        "=": lambda: v == value,
        "!=": lambda: v != value,
    }
    return checks[sign]()

In [None]:
# function returns all statistical conditions that are satisfied as well as those that are unsatisfied
def check_stats(res):
    """Check the requested statistical conditions against table ``res``.

    ``stats`` maps a column name to a list of conditions written as
    ``<function><sign><number>``, for example ``"mean>=10"``. The function
    name may be any key or synonym in ``functs_onto``; the sign is one of
    ``signs``. Conditions are only evaluated for columns that ``input_schema``
    declares as ``int64`` or ``float64``; other columns are skipped silently.

    Returns
    -------
    (dict, dict)
        ``(satisfied, unsatisfied)``, each mapping a column to a list of
        annotated condition strings. Columns missing from ``res`` appear in
        ``unsatisfied`` under ``"<column>(Column Not Present)"``. Conditions
        that cannot be parsed or evaluated are reported as unsatisfied.
    """
    # describe() labels the standard deviation "std", while the function
    # vocabulary of this notebook calls it "standard deviation".
    describe_labels = {"standard deviation": "std"}
    satisfied = {}
    unsatisfied = {}
    for col in stats:
        if col not in res.columns:
            unsatisfied[col+"(Column Not Present)"] = stats[col]
            continue
        if input_schema[col] not in ("int64", "float64"):
            continue
        summary = res[col].describe()
        for condition in stats[col]:
            present = [i for i in signs if i in condition]
            if not present:
                unsatisfied.setdefault(col, []).append(condition+"( could not be evaluated )")
                continue
            # The longest sign wins, so "!=" is not mistaken for "=" and
            # "<=" is not mistaken for "<".
            sign = max(present, key=len)
            funct, _, value = condition.partition(sign)
            funct = funct.strip().lower()
            if funct in functs_onto:
                pass
            else:
                # Resolve a synonym to the function it belongs to.
                for f in functs_onto:
                    if funct in functs_onto[f]:
                        funct = f
            label = describe_labels.get(funct, funct)
            try:
                threshold = float(value)
            except ValueError:
                threshold = None
            if threshold is None or label not in summary.index:
                unsatisfied.setdefault(col, []).append(condition+"( could not be evaluated )")
                continue
            v = summary[label]
            if condition_check(sign, v, threshold):
                s = condition+"( "+str(v)+sign+value+" )"
                satisfied.setdefault(col, []).append(s)
            else:
                s = condition+"( "+funct+" = "+str(v)+" )"
                unsatisfied.setdefault(col, []).append(s)
    return (satisfied, unsatisfied)

In [None]:
# this is a function to print the individual tables names, their nan score, columns that match with input schema, 
# coverage score, and completeness score along with possible transformations if any
def display_individual_matches(matching_tables, matching_tables_dict):
    """Report every individually matching table on screen and in results.txt.

    For each table the report lists its file and table name, the applicable
    transformations (when transformations were requested), the satisfied and
    unsatisfied statistical conditions (when conditions were requested), the
    NaN score of the original file, the matched columns, and the coverage and
    completeness scores.

    Side effects: appends to "output_folder_transformation/results.txt",
    records the row count of each table in the global ``no_of_rows`` and adds
    the table to the global ``comp_score`` under its completeness score.

    Parameters
    ----------
    matching_tables : iterable of str
        CSV file names of the matching tables.
    matching_tables_dict : dict
        ``{file name: {table_column: input_column}}`` used to rename columns.
    """
    global comp_score
    global no_of_rows
    # The with-block closes the results file even if a table fails to load.
    with open("output_folder_transformation/results.txt", 'a') as f:

        def report(*parts):
            # Every line goes to the results file and to the notebook output.
            print(*parts, file=f)
            print(*parts)

        for i in matching_tables:
            report(i+"("+all_tablenames[i]+")")
            res = pd.read_csv(i)
            res.rename(columns=matching_tables_dict[i], inplace=True)
            cov, comp = coverage_and_completeness(res)
            if transformations != -1:
                report("possible transformations are : ", get_possible_transformations(res.columns))
            if stats != -1:
                sat, unsat = check_stats(res)
                if len(sat) > 0:
                    report("Conditions Satisfied : ", sat)
                if len(unsat) > 0:
                    report("Conditions NOT SATISFIED : ", unsat)
            # Computed once: nan_score re-reads the CSV file on every call.
            report('Missing Values(NANs score): ', nan_score(fname=i))
            report("Columns that match with input_schema:\n "+i+' : ', matching_tables_dict[i])
            report("Coverage Score : ", cov, "\t Completeness Score : ", comp)
            report()
            no_of_rows[i] = len(res)
            comp_score.setdefault(comp, []).append(i)

In [None]:
# this function calls col_only_list() or cat_and_col_list() depending on whether the input query has categories
# it receives the tables with more than one column matching the input schema (and, if categories are given, more than 75% category cosine similarity with the input)
# it then keeps only the tables whose matched columns have more than 75% cosine similarity with the input schema
def check_possible_matches():
    """Return the candidate tables whose matched columns resemble the input schema.

    Candidates come from ``col_only_list`` when no categories were supplied
    (``input_categories == -1``) and from ``cat_and_col_list`` otherwise. A
    candidate is kept when the cosine similarity between the input columns it
    matched and the columns of ``input_schema`` is above 0.75.

    Returns
    -------
    dict
        ``{file name: {table_column: input_column}}`` for the kept tables.
    """
    input_sch_onto = generate_ontology(input_schema)
    if input_categories == -1:
        print('only schema')
        possible_tables = col_only_list(input_schema, input_sch_onto)
    else:
        print('category and schema')
        input_cat_onto = generate_ontology(input_categories)
        possible_tables = cat_and_col_list(input_categories, input_cat_onto,
                                           input_schema, input_sch_onto)
    matching_tables = {
        fname: cols
        for fname, cols in possible_tables.items()
        if cos_sim(list(cols.values()), list(input_schema)) > 0.75
    }
    print()
    return matching_tables

In [None]:
# the main function that is to be invoked and will call all the required functions to obtain the required matches and merges
# we have a list with all possible tables that are matches
# we generate all combinations of them in order to merge them
# a valid combination is one where every pair of tables have 50% cosine column and category similarity
# then we merge them and calculate the coverage score, null score and the completeness score
# all these details are displayed for each of the valid merges as well as individual tables
# the output is displayed in the output file "results.txt" along with the outputs in csv form
def _pair_is_similar(t1,t2,strict=False):
    """Tell whether two tables are similar enough (50%) to take part in a merge.

    ``cat_sim`` and ``col_sim`` are keyed by ``"<file1> : <file2>"``.  Both key
    orderings are tried, and a key is only read when it is present in both
    dictionaries, so a pair stored in a single direction never raises KeyError.

    strict=True compares with ``>`` and strict=False with ``>=``.
    NOTE(review): the two-table path historically used ``>`` while the
    multi-table path used ``>=``; both are preserved here - confirm which one
    is intended and unify.
    """
    for key in (t1+' : '+t2,t2+' : '+t1):
        if key in cat_sim and key in col_sim:
            if strict:
                similar=cat_sim[key]>.50 and col_sim[key]>0.50
            else:
                similar=cat_sim[key]>=.50 and col_sim[key]>=0.50
            if similar:
                return True
    return False

def get_matches():
    """Find all matching tables for the current query, merge the valid combinations and rank them.

    Reads the notebook globals ``transformations`` and ``stats`` (``-1`` means
    "not supplied"), ``all_tablenames``, ``cat_sim`` and ``col_sim``, plus the
    globals used by check_possible_matches().  Resets and fills the globals
    ``comp_score`` (completeness score -> list of output names) and
    ``no_of_rows`` (output name -> row count), then hands both to
    ranking_display().

    Side effects: appends a report to
    ``output_folder_transformation/results.txt`` and writes every successful
    merge to ``output_folder_transformation/<n>.csv``.

    Behaviour by number of matching tables:
      * 0 - "NO MATCHES FOUND" is written to the report.
      * 1 - only the individual table is displayed.
      * 2 - the pair is merged when it passes the 50% similarity check.
      * more - every combination, largest first, whose pairs all pass the
        similarity check is merged.
    """
    global comp_score
    global no_of_rows
    comp_score={}
    no_of_rows={}
    results_file="output_folder_transformation/results.txt"
    matching_tables_dict=check_possible_matches()
    matching_tables=list(matching_tables_dict)
    with open(results_file,'a') as f:
        print('******************************',file=f)
        print(file=f)
        print("All Possible Matches",file=f)
        print(file=f)
        print("Matching Tables : ",matching_tables,file=f)
        print("Matching Tables : ",matching_tables)
        for i in matching_tables:
            print(i,'\t',all_tablenames[i])
        if transformations==-1:
            print("\nNo Transformations In Input Schema",file=f)
        else:
            print('\nTransformations detected from input are : ',transformations,file=f)
        print(file=f)
    op_str1='output_folder_transformation/'
    op_str2='.csv'
    n_matches=len(matching_tables)
    print()
    if n_matches==0:
        with open(results_file,'a') as f:
            print("NO MATCHES FOUND",file=f)
            print(file=f)
    elif n_matches==1:
        display_individual_matches(matching_tables,matching_tables_dict)
        print("Only one match found!! Rank 1 : ",matching_tables[0])
    elif n_matches==2:
        if _pair_is_similar(matching_tables[0],matching_tables[1],strict=True):
            res=merge_list([matching_tables[0],matching_tables[1]])
            # the multi-table path treats an int result from merge_list as
            # "merge failed"; apply the same guard here before using res
            if type(res) != type(-1):
                # the single possible merge is always output number 1
                # (ctr used to be unbound on this path)
                ctr=1
                op_string=op_str1+str(ctr)+op_str2
                res.to_csv(op_string,sep=',', index=False)
                cols=res.columns
                cov,comp=coverage_and_completeness(res)
                with open(results_file,'a') as f:
                    print(op_string,file=f)
                    if transformations!=-1:
                        possible=get_possible_transformations(cols)
                        print("possible transformations are : ",possible,file=f)
                    if stats!=-1:
                        sat,unsat=check_stats(res)
                        if len(sat)>0:
                            print("Conditions Satisfied : ",sat,file=f)
                        if len(unsat)>0:
                            print("Conditions Not Satisfied : ",unsat,file=f)
                    print('Missing Values(NANs): ',nan_score(table=res),file=f)
                    print("Columns that match with input_schema: ",file=f)
                    for j in matching_tables:
                        print(j+' : ',matching_tables_dict[j],file=f)
                    print("Coverage Score : ",cov,"\t Completeness Score : ",comp,"\t Number of Rows : ",len(res),file=f)
                    print(file=f)
                merged_name="output_"+str(ctr)+op_str2
                no_of_rows[merged_name]=len(res)
                comp_score.setdefault(comp,[]).append(merged_name)
        display_individual_matches(matching_tables,matching_tables_dict)
    else:
        ctr=0
        count_comb=len(matching_tables)
        all_combos=generate_all_combinations(matching_tables)
        # walk the combinations from the largest size down to pairs
        for i in range(count_comb,1,-1):
            for combo in all_combos[i]:
                a=list(combo)
                # every pair inside the combination has to be similar enough
                if not all(_pair_is_similar(t1,t2) for t1,t2 in combinations(a,2)):
                    continue
                res=merge_list(a)
                if type(res) == type(-1):
                    # merge_list returned its int sentinel: nothing to report
                    continue
                ctr+=1
                op_string=op_str1+str(ctr)+op_str2
                res.to_csv(op_string,sep=',', index=False)
                cols=res.columns
                cov,comp=coverage_and_completeness(res)
                with open(results_file,'a') as f:
                    def report(*args,**kwargs):
                        # every line goes to the report file and to the notebook output
                        print(*args,file=f,**kwargs)
                        print(*args,**kwargs)
                    report(str(ctr)+op_str2+' : ',end='')
                    for j in a:
                        report(j+'\t\t',end='')
                    report()
                    if transformations!=-1:
                        possible=get_possible_transformations(cols)
                        report("possible transformations are : ",possible)
                    if stats!=-1:
                        sat,unsat=check_stats(res)
                        if len(sat)>0:
                            report("Conditions Satisfied : ",sat)
                        if len(unsat)>0:
                            report("Conditions Not Satisfied : ",unsat)
                    report('Missing Values(NANs): ',nan_score(table=res))
                    report("Columns that match with input_schema: ")
                    for j in a:
                        report(j+' : ',matching_tables_dict[j])
                    report("Coverage Score : ",cov,"\t Completeness Score : ",comp)
                    report()
                merged_name="output_"+str(ctr)+op_str2
                no_of_rows[merged_name]=len(res)
                comp_score.setdefault(comp,[]).append(merged_name)
        display_individual_matches(matching_tables,matching_tables_dict)
    with open(results_file,'a') as f:
        print('******************************',file=f)
    ranking_display(comp_score,no_of_rows)

In [None]:
# only schema and individual tables
# The query is a JSON document; "schema" is required, while "categories",
# "transformations" and "stats" are optional.
s='{"schema": {"round": "int64", "Round 1": "object", "Circuit": "object", "Date": "object", "Pole Position": "object", "Fastest Lap": "object", "Winning Driver": "object"}}'
# the whole JSON text (keys and values alike) is lower-cased before parsing
s=s.lower()
s=json.loads(s)
# optional parts default to the sentinel -1 ("not supplied") that the matching functions test for
input_schema=s["schema"]
input_categories=s.get("categories",-1)
transformations=s.get("transformations",-1)
stats=s.get("stats",-1)
# display query
print("Input Schema : ",input_schema)
if input_categories==-1:
    print("Input Categories : None")
else:
    print("Input Categories : ",input_categories)
if transformations==-1:
    print("Input Transformations : None")
else:
    print("Input Transformations : ",transformations)
if stats==-1:
    print("Input Condition based conditions : None")
else:
    print("Input Content based Conditions : ",stats)
print()
# reset the score accumulators that are handed to ranking_display() below
comp_score={}
no_of_rows={}
matching_tables_dict=check_possible_matches()
matching_tables=list(matching_tables_dict)
print(matching_tables)
for i in matching_tables:
    print(i," : ",all_tablenames[i])
print("\n")
# score each matching table individually (no merging), then rank
display_individual_matches(matching_tables,matching_tables_dict)
ranking_display(comp_score,no_of_rows)

Input Schema :  {'round': 'int64', 'round 1': 'object', 'circuit': 'object', 'date': 'object', 'pole position': 'object', 'fastest lap': 'object', 'winning driver': 'object'}
Input Categories : None
Input Transformations : None
Input Condition based conditions : None

only schema

['203_181.csv', '203_408.csv', '203_514.csv', '203_742.csv', '204_253.csv', '204_40.csv', '204_455.csv', '204_569.csv', '204_63.csv', '204_845.csv']
203_181.csv  :  1990 IndyCar season
203_408.csv  :  1989 Formula One season
203_514.csv  :  2008 Superleague Formula season
203_742.csv  :  1995 IndyCar season
204_253.csv  :  1990 Superbike World Championship season
204_40.csv  :  2008 Superbike World Championship season
204_455.csv  :  1989 Formula One season
204_569.csv  :  1998 Swedish Touring Car Championship season
204_63.csv  :  2002 Italian Formula Three season
204_845.csv  :  2003 Barber Dodge Pro Series season


203_181.csv(1990 IndyCar season)
Missing Values(NANs score):  {'date': '0/17', 'rnd': '0/17'

In [None]:
# Show the full contents of each table matched by the schema-only query,
# preceded by its file name and title.
l=[
    '203_181.csv', '203_408.csv', '203_514.csv', '203_742.csv', '204_253.csv',
    '204_40.csv', '204_455.csv', '204_569.csv', '204_63.csv', '204_845.csv',
]
for i in l:
    x=pd.read_csv(i)
    print(i," : ",all_tablenames[i])
    display(x)

203_181.csv  :  1990 IndyCar season


Unnamed: 0,date,rnd,race name,circuit,city/location,pole position,winning driver,winning team,report
0,1,April 8,Autoworks 200,Phoenix International Raceway,"Phoenix, Arizona",Rick Mears,Rick Mears,Team Penske,Report
1,2,April 22,Toyota Long Beach Grand Prix,Streets of Long Beach,"Long Beach, California","Al Unser, Jr.","Al Unser, Jr.",Galles-Kraco Racing,Report
2,3,May 27,74th Indianapolis 500,Indianapolis Motor Speedway,"Speedway, Indiana",Emerson Fittipaldi,Arie Luyendyk,Doug Shierson Racing,Report
3,4,June 3,Miller Genuine Draft 200,Milwaukee Mile,"West Allis, Wisconsin",Rick Mears,"Al Unser, Jr.",Galles-Kraco Racing,Report
4,5,June 17,Valvoline Grand Prix of Detroit,Streets of Detroit,"Detroit, Michigan",Michael Andretti,Michael Andretti,Newman/Haas Racing,Report
5,6,June 24,Budweiser/G.I.Joe's 200,Portland International Raceway,"Portland, Oregon",Danny Sullivan,Michael Andretti,Newman/Haas Racing,Report
6,7,July 8,Budweiser Grand Prix of Cleveland,Cleveland Burke Lakefront Airport,"Cleveland, Ohio",Rick Mears,Danny Sullivan,Team Penske,Report
7,8,July 15,Marlboro Grand Prix at the Meadowlands,Meadowlands Sports Complex,"East Rutherford, New Jersey",Michael Andretti,Michael Andretti,Newman/Haas Racing,Report
8,9,July 22,Molson Indy Toronto,Exhibition Place,"Toronto, Ontario",Danny Sullivan,"Al Unser, Jr.",Galles-Kraco Racing,Report
9,10,August 5,Marlboro 500,Michigan International Speedway,"Brooklyn, Michigan",Emerson Fittipaldi,"Al Unser, Jr.",Galles-Kraco Racing,Report


203_408.csv  :  1989 Formula One season


Unnamed: 0,rd,grand prix,date,location,pole position,fastest lap,winning driver,constructor,report
0,1,Brazilian Grand Prix,26 March,Jacarepaguá,Ayrton Senna,Riccardo Patrese,Nigel Mansell,Ferrari,Report
1,2,San Marino Grand Prix,23 April,Imola,Ayrton Senna,Alain Prost,Ayrton Senna,McLaren-Honda,Report
2,3,Monaco Grand Prix,7 May,Monaco,Ayrton Senna,Alain Prost,Ayrton Senna,McLaren-Honda,Report
3,4,Mexican Grand Prix,28 May,Hermanos Rodriguez,Ayrton Senna,Nigel Mansell,Ayrton Senna,McLaren-Honda,Report
4,5,United States Grand Prix,4 June,Phoenix,Ayrton Senna,Ayrton Senna,Alain Prost,McLaren-Honda,Report
5,6,Canadian Grand Prix,18 June,Circuit Gilles Villeneuve,Alain Prost,Jonathan Palmer,Thierry Boutsen,Williams-Renault,Report
6,7,French Grand Prix,9 July,Paul Ricard,Alain Prost,Mauricio Gugelmin,Alain Prost,McLaren-Honda,Report
7,8,British Grand Prix,16 July,Silverstone,Ayrton Senna,Nigel Mansell,Alain Prost,McLaren-Honda,Report
8,9,German Grand Prix,30 July,Hockenheimring,Ayrton Senna,Ayrton Senna,Ayrton Senna,McLaren-Honda,Report
9,10,Hungarian Grand Prix,13 August,Hungaroring,Riccardo Patrese,Nigel Mansell,Nigel Mansell,Ferrari,Report


203_514.csv  :  2008 Superleague Formula season


Unnamed: 0,round,round 1,race,date,pole position,fastest lap,winning club,winning team,report
0,1,R1,Donington Park,August 31,Beijing Guoan,Beijing Guoan,Beijing Guoan,Zakspeed,Report
1,1,R2,Donington Park,August 31,,PSV Eindhoven,Sevilla FC,GTA Motor Competicion,Report
2,2,R1,Nurburgring,September 21,A.C. Milan,PSV Eindhoven,A.C. Milan,Scuderia Playteam,Report
3,2,R2,Nurburgring,September 21,,SC Corinthians,PSV Eindhoven,Azerti Motorsport,Report
4,3,R1,Zolder,October 5,Borussia Dortmund,Liverpool F.C.,Liverpool F.C.,Hitech Junior Team,Report
5,3,R2,Zolder,October 5,,Atletico Madrid,Beijing Guoan,Zakspeed,Report
6,4,R1,Estoril,October 19,A.S. Roma,Atletico Madrid,Liverpool F.C.,Hitech Junior Team,Report
7,4,R2,Estoril,October 19,,Borussia Dortmund,Al Ain,Azerti Motorsport,Report
8,5,R1,Vallelunga,November 2,Liverpool F.C.,Beijing Guoan,Beijing Guoan,Zakspeed,Report
9,5,R2,Vallelunga,November 2,,Atletico Madrid,F.C. Porto,Hitech Junior Team,Report


203_742.csv  :  1995 IndyCar season


Unnamed: 0,rnd,date,race name,circuit,city/location,pole position,fastest lap,winning driver,winning team,report
0,1,March 5,Marlboro Grand Prix of Miami Presented by Toyota,Bicentennial Park,"Miami, Florida",Michael Andretti,Scott Pruett,Jacques Villeneuve,Team Green,Report
1,2,March 19,Australian IndyCar Grand Prix,Surfers Paradise Street Circuit,"Surfers Paradise, Australia",Michael Andretti,Michael Andretti,Paul Tracy,Newman/Haas Racing,Report
2,3,April 2,Slick 50 200,Phoenix International Raceway,"Phoenix, Arizona",Bryan Herta,Emerson Fittipaldi,Robby Gordon,Walker Racing,Report
3,4,April 9,Toyota Grand Prix of Long Beach,Streets of Long Beach,"Long Beach, California",Michael Andretti,Michael Andretti,"Al Unser, Jr.",Marlboro Team Penske,Report
4,5,April 23,Bosch Spark Plug Grand Prix,Nazareth Speedway,"Nazareth, Pennsylvania",Robby Gordon,Emerson Fittipaldi,Emerson Fittipaldi,Marlboro Team Penske,Report
5,6,May 28,79th Indianapolis 500*,Indianapolis Motor Speedway,"Speedway, Indiana",Scott Brayton,Scott Goodyear,Jacques Villeneuve,Team Green,Report
6,7,June 4,Miller Genuine Draft 200,Milwaukee Mile,"West Allis, Wisconsin",Teo Fabi,Teo Fabi,Paul Tracy,Newman/Haas Racing,Report
7,8,June 11,ITT Automotive Grand Prix of Detroit,The Raceway on Belle Isle Park,"Detroit, Michigan",Robby Gordon,Michael Andretti,Robby Gordon,Walker Racing,Report
8,9,June 25,Budweiser/G. I. Joe's 200,Portland International Raceway,"Portland, Oregon",Jacques Villeneuve,"Al Unser, Jr.","Al Unser, Jr.",Marlboro Team Penske,Report
9,10,July 9,Texaco/Havoline 200,Road America,"Elkhart Lake, Wisconsin",Jacques Villeneuve,Jacques Villeneuve,Jacques Villeneuve,Team Green,Report


204_253.csv  :  1990 Superbike World Championship season


Unnamed: 0,round,round 1,circuit,date,pole position,fastest lap,winning rider
0,1,R1,Jerez,18 March,Raymond Roche,Stephane Mertens,Raymond Roche
1,1,R2,Jerez,18 March,Raymond Roche,Raymond Roche,Raymond Roche
2,2,R1,Donington,16 April,Giancarlo Falappa,Rob Phillis,Fred Merkel
3,2,R2,Donington,16 April,Giancarlo Falappa,Raymond Roche,Giancarlo Falappa
4,3,R1,Hungaroring,30 April,Malcolm Campbell,Raymond Roche,Fred Merkel
5,3,R2,Hungaroring,30 April,Malcolm Campbell,Fred Merkel,Raymond Roche
6,4,R1,Hockenheim,6 May,Raymond Roche,Fred Merkel,Fred Merkel
7,4,R2,Hockenheim,6 May,Raymond Roche,Raymond Roche,Stephane Mertens
8,5,R1,Mosport,3 June,Giancarlo Falappa,Raymond Roche,Raymond Roche
9,5,R2,Mosport,3 June,Giancarlo Falappa,Jamie James,Raymond Roche


204_40.csv  :  2008 Superbike World Championship season


Unnamed: 0,round,round 1,country,circuit,date,pole position,fastest lap,winning rider,winning team,report
0,1,R1,Qatar,Losail,23 February,Troy Corser,Noriyuki Haga,Troy Bayliss,Xerox Ducati,Report
1,1,R2,Qatar,Losail,23 February,Troy Corser,Fonsi Nieto,Fonsi Nieto,Alstare Suzuki,Report
2,2,R1,Australia,Phillip Island,2 March,Troy Bayliss,Troy Bayliss,Troy Bayliss,Xerox Ducati,Report
3,2,R2,Australia,Phillip Island,2 March,Troy Bayliss,Max Biaggi,Troy Bayliss,Xerox Ducati,Report
4,3,R1,Spain,Valencia,6 April,Max Neukirchner,Noriyuki Haga,Lorenzo Lanzi,Team R.G,Report
5,3,R2,Spain,Valencia,6 April,Max Neukirchner,Carlos Checa,Noriyuki Haga,Yamaha Motor Italia,Report
6,4,R1,Netherlands,Assen,27 April,Troy Bayliss,Max Neukirchner,Troy Bayliss,Xerox Ducati,Report
7,4,R2,Netherlands,Assen,27 April,Troy Bayliss,Troy Bayliss,Troy Bayliss,Xerox Ducati,Report
8,5,R1,Italy,Monza,11 May,Troy Bayliss,Noriyuki Haga,Max Neukirchner,Alstare Suzuki,Report
9,5,R2,Italy,Monza,11 May,Troy Bayliss,Noriyuki Haga,Noriyuki Haga,Yamaha Motor Italia,Report


204_455.csv  :  1989 Formula One season


Unnamed: 0,rd,grand prix,date,location,pole position,fastest lap,winning driver,constructor,report
0,1,Brazilian Grand Prix,26 March,Jacarepaguá,Ayrton Senna,Riccardo Patrese,Nigel Mansell,Ferrari,Report
1,2,San Marino Grand Prix,23 April,Imola,Ayrton Senna,Alain Prost,Ayrton Senna,McLaren-Honda,Report
2,3,Monaco Grand Prix,7 May,Monaco,Ayrton Senna,Alain Prost,Ayrton Senna,McLaren-Honda,Report
3,4,Mexican Grand Prix,28 May,Hermanos Rodriguez,Ayrton Senna,Nigel Mansell,Ayrton Senna,McLaren-Honda,Report
4,5,United States Grand Prix,4 June,Phoenix,Ayrton Senna,Ayrton Senna,Alain Prost,McLaren-Honda,Report
5,6,Canadian Grand Prix,18 June,Circuit Gilles Villeneuve,Alain Prost,Jonathan Palmer,Thierry Boutsen,Williams-Renault,Report
6,7,French Grand Prix,9 July,Paul Ricard,Alain Prost,Mauricio Gugelmin,Alain Prost,McLaren-Honda,Report
7,8,British Grand Prix,16 July,Silverstone,Ayrton Senna,Nigel Mansell,Alain Prost,McLaren-Honda,Report
8,9,German Grand Prix,30 July,Hockenheimring,Ayrton Senna,Ayrton Senna,Ayrton Senna,McLaren-Honda,Report
9,10,Hungarian Grand Prix,13 August,Hungaroring,Riccardo Patrese,Nigel Mansell,Nigel Mansell,Ferrari,Report


204_569.csv  :  1998 Swedish Touring Car Championship season


Unnamed: 0,round,round 1,circuit,date,pole position,fastest lap,winning driver,winning team,winning privateer
0,1,R1,Mantorp Park,10 May,Mattias Ekström,Fredrik Ekblom,Mats Linden,Kristoffersson Motorsport,Richard Göransson
1,1,R2,Mantorp Park,10 May,,Peggen Andersson,Fredrik Ekblom,BMW Dealer Team,Pontus Mörth
2,2,R3,Karlskoga-Gelleråsen,31 May,Jan Nilsson,Jan Nilsson,Jan Nilsson,Flash Engineering,Pontus Mörth
3,2,R4,Karlskoga-Gelleråsen,31 May,,Fredrik Ekblom,Peggen Andersson,BMW Dealer Team,Pontus Mörth
4,3,R5,Anderstorp,28 June,Mattias Ekström,Fredrik Ekblom,Jan Nilsson,Flash Engineering,Pontus Mörth
5,3,R6,Anderstorp,28 June,,Peggen Andersson,Peggen Andersson,BMW Dealer Team,Georg Bakajev
6,4,R7,Falkenberg,9 July,Jens Edman,Jens Edman,Jan Nilsson,Flash Engineering,Pontus Mörth
7,4,R8,Falkenberg,9 July,,Mattias Ekström,Jens Edman,Flash Engineering,Pontus Mörth
8,5,R9,Ring Knutstorp,6 September,Fredrik Ekblom,Fredrik Ekblom,Fredrik Ekblom,BMW Dealer Team,Pontus Mörth
9,5,R10,Ring Knutstorp,6 September,,Mats Linden,Jan Nilsson,Flash Engineering,Georg Bakajev


204_63.csv  :  2002 Italian Formula Three season


Unnamed: 0,round,circuit,date,pole position,winning driver,winning team,trophy winner
0,1,"ACI Vallelunga Circuit, Campagnano di Roma",7 April,Miloš Pavlović,Miloš Pavlović,Target Racing,Nino Famà
1,2,Misano World Circuit,28 April,Miloš Pavlović,Miloš Pavlović,Target Racing,Giovanni Faraonio
2,3,"Autodromo di Pergusa, Enna",19 May,Miloš Pavlović,Philip Cloostermans,Azeta Racing,Carmine Tancredi
3,4,Autodromo Nazionale Monza,30 June,Philip Cloostermans,Philip Cloostermans,Azeta Racing,Carmine Tancredi
4,5,"Autodromo Riccardo Paletti, Varano",21 July,Miloš Pavlović,Miloš Pavlović,Target Racing,Carmine Tancredi
5,6,"Autodromo Enzo e Dino Ferrari, Imola",1 September,Vitantonio Liuzzi,Vitantonio Liuzzi,Bertram Schäfer Racing,Silvio Alberti
6,7,"Autodromo del Levante, Binetto",8 September,Christiano Citron,Miloš Pavlović,Target Racing,Carmine Tancredi
7,8,"Mugello Circuit, Scarperia",6 October,Miloš Pavlović,Miloš Pavlović,Target Racing,Dino Lusuardi
8,9,"Autodromo dell'Umbria, Magione",20 October,Andreas Zuber,Christiano Citron,Target Racing,Alberto Morelli


204_845.csv  :  2003 Barber Dodge Pro Series season


Unnamed: 0,round,circuit,location,date,pole position,fastest lap,winning driver,headline event
0,1,Albert Whitted Airport,"St. Petersburg, Florida",February 23,Leonardo Maia,Dan Di Leo,Leonardo Maia,Grand Prix of St. Petersburg
1,2,Fundidora park,Monterrey,March 23,Leonardo Maia,David Martinez,David Martinez,Monterrey Grand Prix
2,3,Milwaukee Mile,"West Allis, Wisconsin",June 1,"Victor Gonzalez, Jr.",Scott Poirier,Leonardo Maia,Milwaukee Mile Centennial 250
3,4,Mazda Raceway Laguna Seca,"Monterey, California",June 15,Memo Rojas,Leonardo Maia,Dan Di Leo,Grand Prix of Monterey
4,5,Portland International Raceway,"Portland, Oregon",June 22,Leonardo Maia,Leonardo Maia,Leonardo Maia,G.I. Joe's 200
5,6,Burke Lakefront Airport,"Cleveland, Ohio",July 5,Leonardo Maia,Leonardo Maia,Leonardo Maia,U.S. Bank Cleveland Grand Prix
6,7,Exhibition Place,Toronto,July 13,Leonardo Maia,Leonardo Maia,Memo Rojas,Molson Indy Toronto
7,8,Concord Pacific Place,Vancouver,July 27,Leonardo Maia,David Martinez,Leonardo Maia,Molson Indy Vancouver
8,9,Mid-Ohio Sports Car Course,"Lexington, Ohio",August 3,Leonardo Maia,Leonardo Maia,Leonardo Maia,Champ Car Grand Prix of Mid-Ohio
9,10,Circuit Gilles Villeneuve,Montreal,August 24,Memo Rojas,Leonardo Maia,Memo Rojas,Molson Indy Montreal


In [None]:
# only schema with merge
# Same schema-only query as before, but run through get_matches().
s='{"schema": {"round": "int64", "Round 1": "object", "Circuit": "object", "Date": "object", "Pole Position": "object", "Fastest Lap": "object", "Winning Driver": "object"}}'
# lower-case the whole JSON text, then parse it
s=json.loads(s.lower())
# optional query parts fall back to the sentinel -1 ("not supplied")
input_schema=s["schema"]
input_categories=s.get("categories",-1)
transformations=s.get("transformations",-1)
stats=s.get("stats",-1)
# display query
print("Input Schema : ",input_schema)
if input_categories!=-1:
    print("Input Categories : ",input_categories)
else:
    print("Input Categories : None")
if transformations!=-1:
    print("Input Transformations : ",transformations)
else:
    print("Input Transformations : None")
if stats!=-1:
    print("Input Content based Conditions : ",stats)
else:
    print("Input Condition based conditions : None")
print()
get_matches()

Input Schema :  {'round': 'int64', 'round 1': 'object', 'circuit': 'object', 'date': 'object', 'pole position': 'object', 'fastest lap': 'object', 'winning driver': 'object'}
Input Categories : None
Input Transformations : None
Input Condition based conditions : None

only schema

Matching Tables :  ['203_181.csv', '203_408.csv', '203_514.csv', '203_742.csv', '204_253.csv', '204_40.csv', '204_455.csv', '204_569.csv', '204_63.csv', '204_845.csv']
203_181.csv 	 1990 IndyCar season
203_408.csv 	 1989 Formula One season
203_514.csv 	 2008 Superleague Formula season
203_742.csv 	 1995 IndyCar season
204_253.csv 	 1990 Superbike World Championship season
204_40.csv 	 2008 Superbike World Championship season
204_455.csv 	 1989 Formula One season
204_569.csv 	 1998 Swedish Touring Car Championship season
204_63.csv 	 2002 Italian Formula Three season
204_845.csv 	 2003 Barber Dodge Pro Series season



In [None]:
# Show the thirteen merged outputs (1.csv ... 13.csv) followed by the ten
# individual source tables.
l=['output_folder_transformation/'+str(n)+'.csv' for n in range(1,14)]+[
    '203_181.csv', '203_408.csv', '203_514.csv', '203_742.csv', '204_253.csv',
    '204_40.csv', '204_455.csv', '204_569.csv', '204_63.csv', '204_845.csv',
]
for i in l:
    x=pd.read_csv(i)
    print(i)
    display(x)

In [None]:
# schema and categories
# The query carries a "categories" list in addition to the schema.
s='{"schema": {"Round": "int64", "Round 1": "object", "Circuit": "object", "Date": "object", "Pole Position": "object", "Fastest Lap": "object", "Winning Driver": "object"},"categories": ["motorsport", "car", "seasons"]}'
# lower-case the whole JSON text, then parse it
s=json.loads(s.lower())
# optional query parts fall back to the sentinel -1 ("not supplied")
input_schema=s["schema"]
input_categories=s.get("categories",-1)
transformations=s.get("transformations",-1)
stats=s.get("stats",-1)
# display query
print("Input Schema : ",input_schema)
if input_categories!=-1:
    print("Input Categories : ",input_categories)
else:
    print("Input Categories : None")
if transformations!=-1:
    print("Input Transformations : ",transformations)
else:
    print("Input Transformations : None")
if stats!=-1:
    print("Input Content based Conditions : ",stats)
else:
    print("Input Condition based conditions : None")
print()
get_matches()

In [None]:
# Show the six merged outputs (1.csv ... 6.csv) followed by six individual
# source tables.
l=['output_folder_transformation/'+str(n)+'.csv' for n in range(1,7)]+[
    '203_181.csv', '203_514.csv', '203_742.csv',
    '204_253.csv', '204_569.csv', '204_63.csv',
]
for i in l:
    x=pd.read_csv(i)
    print(i)
    display(x)

In [None]:
# schema, transformations and content based conditions
# The query carries per-column "transformations" and "stats" entries in
# addition to the schema.
s='{"schema": {"date": "object", "nationality": "object", "tonnage (grt)": "int64", "fate": "object"},"transformations":{"tonnage (grt)":["sum","funct1","avg","minimum"],"nationality":["funct1","sum"]},"stats":{"tonnage (grt)":["minimum>0","maximum<10000"]}}'
# lower-case the whole JSON text, then parse it
s=json.loads(s.lower())
# optional query parts fall back to the sentinel -1 ("not supplied")
input_schema=s["schema"]
input_categories=s.get("categories",-1)
transformations=s.get("transformations",-1)
stats=s.get("stats",-1)
# display query
print("Input Schema : ",input_schema)
if input_categories!=-1:
    print("Input Categories : ",input_categories)
else:
    print("Input Categories : None")
if transformations!=-1:
    print("Input Transformations : ",transformations)
else:
    print("Input Transformations : None")
if stats!=-1:
    print("Input Content based Conditions : ",stats)
else:
    print("Input Condition based conditions : None")
print()
get_matches()

In [None]:
# Show four individual source tables followed by the four merged outputs
# (1.csv ... 4.csv).
l=[
    "202_117.csv", '203_148.csv', '203_268.csv', '204_100.csv',
]+["output_folder_transformation/"+str(n)+".csv" for n in range(1,5)]
for i in l:
    x=pd.read_csv(i)
    print(i)
    display(x)

In [None]:
# Special function to generate scores for a single table/schema that is provided by the user
def get_scores(input_schema,check_schema,check_tablename=-1):
    """Print coverage, and when a table is available NULL and completeness scores, for one schema.

    Columns of ``check_schema`` are matched to ``input_schema`` in three passes;
    a column claimed by one pass is not offered to the following passes:
      1. identical column name and dtype;
      2. the column name appears in the ontology (generate_ontology) of an
         input column with the same dtype;
      3. substring match in either direction between names longer than three
         characters with the same dtype, accepted only when exactly one
         candidate column qualifies.

    Args:
        input_schema: dict of required column name -> dtype string.
        check_schema: dict of column name -> dtype string for the schema to score.
        check_tablename: path of the CSV holding the table, or -1 (default)
            when only the schema is known.

    Returns:
        None - every score is printed.
    """
    print("Requirements : ",input_schema)
    print("Schema to be scored : ",check_schema)
    input_sch_onto=generate_ontology(input_schema)
    # pass 1: same name and same dtype (maps each column to itself)
    cols1={col1:col1 for col1,d_type1 in check_schema.items() if col1 in input_schema and d_type1==input_schema[col1]}
    check_schema={k:v for k,v in check_schema.items() if k not in cols1}
    # pass 2: column name is listed in the ontology of an input column of the same dtype
    cols2={col1:col for col1,d_type1 in check_schema.items() for col in input_sch_onto if col1 in input_sch_onto[col] and d_type1==input_schema[col]}
    # drop the ontology matches too (this used to filter on cols1 a second time),
    # otherwise pass 3 could re-claim such a column and overwrite its mapping
    check_schema={k:v for k,v in check_schema.items() if k not in cols2}
    cols={**cols1,**cols2}
    matched_inputs=set(cols.values())
    ip_schema={k:v for k,v in input_schema.items() if k not in matched_inputs}
    # pass 3: unambiguous substring match for the input columns that are still unmatched
    for col1,d_type1 in ip_schema.items():
        candidates=[col2 for col2,d_type2 in check_schema.items() if ((len(col1)>3 and len(col2)>3) and ((col1 in col2) or (col2 in col1))) and (d_type1==d_type2)]
        if len(candidates)==1:
            cols[candidates[0]]=col1
    print("Matching Columns : ",cols)
    # an empty requirement has nothing to cover; report 0.0 instead of dividing by zero
    coverage=len(cols)/len(input_schema) if len(input_schema)>0 else 0.0
    print("Coverage : ",coverage)
    if check_tablename!=-1:
        table=pd.read_csv(check_tablename)
        n_rows=len(table)
        print("NULL scores : ",nan_score(table=table))
        comp=0.0
        # an empty table or empty requirement scores 0.0 instead of dividing by zero
        if n_rows>0 and len(input_schema)>0:
            for col in cols:
                # fraction of non-null cells in this matched column
                comp+=((n_rows-sum(pd.isnull(table[col])))/n_rows)
            comp=comp/len(input_schema)
        print("Completeness score : ",comp)
    else:
        print("Table Not Available. Completeness and Null Scores not applicable.")

In [None]:
# when schema and table is provided to function all scores are generated
# ip_sch is the requirement, sch the schema being scored and tname the CSV that holds the table
ip_sch={
    'day': 'object',
    'name of ship': 'object',
    'nation': 'object',
    'tonnage': 'int64',
    'fate': 'object',
}
sch={
    'date': 'object',
    'name': 'object',
    'nationality': 'object',
    'tonnage (grt)': 'int64',
    'fate': 'object',
}
tname='203_268.csv'
get_scores(ip_sch,sch,check_tablename=tname)

In [None]:
# when only schema is provided only coverage generated, no completeness score
# ip_sch is the requirement and sch the schema being scored; no table is passed
ip_sch={
    'day': 'object',
    'name of ship': 'object',
    'nation': 'object',
    'tonnage': 'int64',
    'fate': 'object',
}
sch={
    'date': 'object',
    'name': 'object',
    'nationality': 'object',
    'tonnage (grt)': 'int64',
    'fate': 'object',
}
get_scores(ip_sch,sch)