In [1]:
# import necessary libraries and packages
import json
import math
import pandas as pd
import numpy as np
import urllib
from itertools import combinations
from nltk.corpus import wordnet
from IPython.display import display

In [2]:
#preloading all schemas and categories
all_schemas={}
all_categories={}
all_tablenames={}
with open("final_schema.txt") as ip_file:
    for line in ip_file.readlines():
        json_obj=json.loads(line)
        all_schemas[json_obj["filename"]]=json_obj["schema"]
        all_categories[json_obj["filename"]]=json_obj["categories"]
        all_tablenames[json_obj["filename"]]=json_obj["tablename"]
#preloading column and category similarity values of tables
with open("cos_similarity.txt",'r') as ip_file:
    json_object=json.load(ip_file)
    col_sim = json_object["column_similarity"]
    cat_sim = json_object["category_similarity"]

In [3]:
# all functions needed to generate ontologies

# generates all synonyms and hypernyms of a term using wordnet
def get_synonyms(word):
    meanings=set()
    for synset in wordnet.synsets(word,pos=wordnet.NOUN):
        for lemma in synset.lemmas():
            meanings.add(lemma.name())
    for synset in wordnet.synsets(word,pos=wordnet.NOUN):
        for hypernym in synset.hypernyms():
            meanings.add(hypernym.lemma_names()[0])
    meanings.add(word)
    return list(meanings)

# takes input list and returns ontology as dictionary with every word in list as the key
def generate_ontology(list1):
    ontology={}
    for word in list1:
        ontology[word]=get_synonyms(word)
    return ontology

In [4]:
# making a list of transformation functions
tf_onto={}
transform_funct_list={"int64":['average','sum','maximum','minimum','range','median','variance','standard deviation','mode','frequency','avg'],"float64":['average','sum','maximum','minimum','range','median','variance','standard deviation','mode','frequency','avg'],"object":["tolower","toupper","substring","funct1","funct2","funct3"]}
for dtype,funct_list in transform_funct_list.items():
    tf_onto[dtype]=generate_ontology(funct_list)
# initialising knowledge graph data dictionary
kg_data={}
# making a list of all statistical functions and the various signs and combinations 
signs = ["<=",">=","<",">","=","!="]; functs =  ['count','mean','standard deviation','min','max','25%','50%','75%']; functs_onto = generate_ontology(functs)
functs_onto['25%'].append("first quartile");functs_onto['75%'].append("third quartile");functs_onto['50%'].append("second quartile");functs_onto['min'].append("minimum");functs_onto['max'].append("maximum")

In [5]:
# to generate cos similarity between two lists
def cos_sim(list1, list2):
    terms = set(list1).union(list2)
    intersect = set(list1) & set(list2)
    others = (set(list1)-intersect).union(set(list2)-intersect)
    product=0
    for word in terms:
        if word in intersect:
            product+=1
    l1mag = math.sqrt(len(list1))
    l2mag = math.sqrt(len(list2))
    if len(list1)==0 or len(list2)==0:
        return 0.0
    else:
        return product / (l1mag * l2mag)

In [6]:
# returns a dictionary of all tables having one or more columns mstching directly with the input schema
def direct_matches(input_schema):
    possible_tables={}
    for file in all_schemas:
        schema=all_schemas[file]
        cols={col1:col1 for col1,d_type1 in schema.items() if col1 in input_schema and d_type1==input_schema[col1]}
        if(len(cols)>1):
            possible_tables[file]=cols
    return possible_tables

# return only those tables having 75% cos similarity with input table through direct match
def direct_matches_only():
    input_sch_onto=generate_ontology(input_schema)
    possible_tables=direct_matches(input_schema)
    matching_tables={}
    for i in possible_tables:
        cos_val=cos_sim(list(possible_tables[i].values()),list(input_schema))
        if cos_val>0.75:
            matching_tables[i]=possible_tables[i]
    print("Matching Tables ")
    for i in list(matching_tables):
        print(i," : ",all_tablenames[i])
    print("\n")
    display_individual_matches(list(matching_tables),matching_tables)

In [54]:
# returns a dictionary of all tables having one or more columns mstching directly, or having an ontology match 
# or a substring match with the input schema
def extended_matches(input_schema):
    possible_tables={}
    ip_schema=input_schema
    for file in all_schemas:
        schema=all_schemas[file]
        cols1={col1:col1 for col1,d_type1 in schema.items() if col1 in input_schema and d_type1==input_schema[col1]}
        schema={k:v for k,v in schema.items() if k not in cols1}
        input_schema={k:v for k,v in input_schema.items() if k not in list(cols1.values())}
        input_sch_onto=generate_ontology(input_schema)
        cols2={col1:col for col1,d_type1 in schema.items() for col in input_sch_onto if col1 in input_sch_onto[col] and d_type1==input_schema[col]}
        schema={k:v for k,v in schema.items() if k not in cols2}
        input_schema={k:v for k,v in input_schema.items() if k not in list(cols2.values())}
        schema_onto=generate_ontology(schema)
        cols3={col:col1 for col1,d_type1 in input_schema.items() for col in schema_onto if col1 in schema_onto[col] and d_type1==schema[col]}
        schema={k:v for k,v in schema.items() if k not in cols3}
        input_schema={k:v for k,v in input_schema.items() if k not in list(cols3.values())}
        cols={**cols1,**cols2,**cols3}
        for col1,d_type1 in input_schema.items():
            x=[col2 for col2,d_type2 in schema.items() if ((len(col1)>3 and len(col2)>3) and ((col1 in col2) or (col2 in col1))) and (d_type1==d_type2)]
            if len(x)==1:
                cols[x[0]]=col1
        if(len(cols)>1):
            possible_tables[file]=cols
        input_schema=ip_schema
    return possible_tables

# return only those tables having 75% cos similarity with input table through direct, ontology and substring match
def check_possible_extended_matches():
    input_sch_onto=generate_ontology(input_schema)
    possible_tables=extended_matches(input_schema)
    matching_tables={}
    for i in possible_tables:
        schema=all_schemas[i]
        schema=[k for k in schema if k not in possible_tables[i]]
        cos_val=cos_sim(list(possible_tables[i].values())+schema,list(input_schema))
        if cos_val>0.75:
            matching_tables[i]=possible_tables[i]
    print("Matching Tables")
    for i in list(matching_tables):
        print(i," : ",all_tablenames[i])
    print("\n")
    display_individual_matches(list(matching_tables),matching_tables)
    return matching_tables

In [55]:
# categories provided as input help narrow down the matches to contextually relevant tables
# categories and columns are matched based on direct match as well as ontology match
def categories_included(input_categories,input_schema):
    input_cat_onto=generate_ontology(input_categories)
    ip_schema=input_schema
    possible_tables={}
    for file in all_categories:
        category=all_categories[file]
        cats1=[cat1 for cat1 in category if cat1 in input_categories]
        category=[k for k in category if k not in cats1]
        cats2=[cat2 for cat1 in category for cat2 in input_cat_onto if cat1 in input_cat_onto[cat2]]
        category=[k for k in category if k not in cats2]
        cat_list=cats1+cats2
        cos_val=cos_sim(cat_list,input_categories)
        if cos_val > 0.75 :
            schema=all_schemas[file]
            cols1={col1:col1 for col1,d_type1 in schema.items() if col1 in input_schema and d_type1==input_schema[col1]}
            schema={k:v for k,v in schema.items() if k not in list(cols1.values())}
            input_schema={k:v for k,v in input_schema.items() if k not in list(cols1.values())}
            input_sch_onto=generate_ontology(input_schema)
            cols2={col1:col for col1,d_type1 in schema.items() for col in input_sch_onto if col1 in input_sch_onto[col] and d_type1==input_schema[col]}
            schema={k:v for k,v in schema.items() if k not in cols2}
            input_schema={k:v for k,v in input_schema.items() if k not in list(cols2.values())}
            schema_onto=generate_ontology(schema)
            cols3={col:col1 for col1,d_type1 in input_schema.items() for col in schema_onto if col1 in schema_onto[col] and d_type1==schema[col]}
            schema={k:v for k,v in schema.items() if k not in cols3}
            input_schema={k:v for k,v in input_schema.items() if k not in list(cols3.values())}
            cols={**cols1,**cols2,**cols3}
            for col1,d_type1 in input_schema.items():
                x=[col2 for col2,d_type2 in schema.items() if ((len(col1)>3 and len(col2)>3) and ((col1 in col2) or (col2 in col1))) and (d_type1==d_type2)]
                if len(x)==1:
                    cols[x[0]]=col1
            if(len(cols)>1):
                possible_tables[file]=cols
            input_schema=ip_schema
    return possible_tables

# return tables having 75% column as well as category cos similarity
def check_possible_category_included_matches():
    input_sch_onto=generate_ontology(input_schema)
    possible_tables=categories_included(input_categories,input_schema)
    matching_tables={}
    for i in possible_tables:
        schema=all_schemas[i]
        schema=[k for k in schema if k not in possible_tables[i]]
        cos_val=cos_sim(list(possible_tables[i].values())+schema,list(input_schema))
        if cos_val>0.75:
            matching_tables[i]=possible_tables[i]
    print("Matching Tables")
    for i in list(matching_tables):
        print(i," : ",all_tablenames[i])
    print("\n")
    display_individual_matches(list(matching_tables),matching_tables)
    return matching_tables

In [9]:
# function generates all possible combinations of list l taking elements n to 2 at a time and returns a dictionary
def generate_all_combinations(l):
    x={}
    a=len(l)
    for i in range(a,1,-1):
        x[i]=list(combinations(l,i))
    return x

In [60]:
# for input table and column, get a list of all categories that the terms(elements) in that column belong to
def extract_info_from_knowledge_graphs(a,col1):
    with open("C:\\Users\\adith\\Desktop\\my_google_knowledge_graph_api_key.txt","r") as f:
        api_key=f.readline()
    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    params = {'limit': 50,'indent': True,'key': api_key}
    s={}
    ctr=0
    for row in a[col1]:
        if type(row)==type(np.nan):
            ctr+=1
            continue
        params['query']=row
        url = service_url + '?' + urllib.parse.urlencode(params)
        try:
            response = json.loads(urllib.request.urlopen(url).read())
        except urllib.error.HTTPError as httperr:
            try:
                response = json.loads(urllib.request.urlopen(url).read())
            except urllib.error.HTTPError as httperr:
                try:
                    response = json.loads(urllib.request.urlopen(url).read())
                    for element in response['itemListElement']:
                        for i in element["result"]["@type"]:
                            b=i.lower()
                            if b in s:
                                s[b]+=1
                            else:
                                s[b]=1
                        if "description" in element["result"]:
                            b=element["result"]["description"].lower()
                            if b in s:
                                s[b]+=1
                            else:
                                s[b]=1
                except urllib.error.HTTPError as httperr:
                    ctr+=1
    return s,ctr

In [11]:
# using data from google knowledge graphs, try to obtain alternate names for columns that do not match to merge them if they
# represent the same entity
# returned values are in the form (final_name,old_name)
def get_alternate_col_name(a,b,col1,col2):
    global kg_data
    if col1 not in kg_data:
        groups,count=extract_info_from_knowledge_graphs(a,col1)
        kg_data[col1]={"groups":groups,"count":count}
    if col2 not in kg_data:
        groups,count=extract_info_from_knowledge_graphs(b,col2)
        kg_data[col2]={"groups":groups,"count":count}
    s1=kg_data[col1]["groups"]
    ctr1=kg_data[col1]["count"]
    s2=kg_data[col2]["groups"]
    ctr2=kg_data[col2]["count"]
    if (col1 in s1) and (col1 in s2):
        kg_data[col1]["groups"][col1]+=kg_data[col2]["groups"][col1]
        kg_data[col1]["count"]+=kg_data[col2]["count"]
        del kg_data[col2]
        return col1,col2
    elif (col2 in s1) and (col2 in s2):
        kg_data[col2]["groups"][col2]+=kg_data[col1]["groups"][col2]
        kg_data[col2]["count"]+=kg_data[col1]["count"]
        del kg_data[col1]
        return col2,col1
    else:
        return 0,0

In [12]:
# using google knowledge graph update the column names and update tables (if possible) to improve merge accuracy and improve completeness
def graph_match(c,t1,t2):
    l=list(combinations(c,2))
    possible_column_renames={}
    flag=0
    for i in l:
        if ((i[0] in t1) and (i[1] in t2)):
            a,b=get_alternate_col_name(t1,t2,i[0],i[1])
        elif ((i[1] in t1) and (i[0] in t2)):
            a,b=get_alternate_col_name(t1,t2,i[1],i[0])
        else:
            a,b=0,0
        if a!=0 and b!=0:
            if b in input_schema:
                if b in t1:
                    t2.rename(columns={a:b},inplace=True)
                else:
                    t1.rename(columns={a:b},inplace=True)
            else:
                if a in t1:
                    t2.rename(columns={b:a},inplace=True)
                else:
                    t1.rename(columns={b:a},inplace=True)
            c.remove(a)
            c.remove(b)
            return t1,t2,c
    return t1,t2,0

In [13]:
def gen_kg_data(c,a,res):
    global kg_data
    for col in c:
        if col in a:
            b=a
        else:
            b=res
        if col not in kg_data:
            groups,ctr=extract_info_from_knowledge_graphs(b,col)
            kg_data[col]={"groups":groups,"count":ctr}

In [38]:
# function generates all matching columns between the res_cols schema and columns of table in fname
def generate_matching_columns(res_cols,fname):
    a=fname
    c={**res_cols,**a}
    res_onto=generate_ontology(res_cols)
    cols1={col1:col1 for col1,d_type1 in a.items() if (col1 in res_cols) and d_type1==res_cols[col1]}
    res_onto={k:v for k,v in res_onto.items() if k not in cols1}
    a={k:v for k,v in a.items() if k not in cols1}
    cols2={col1:cols for col1,d_type1 in a.items() for col in res_onto if col1 in res_onto[col] and d_type1==res_cols[col]}
    a={k:v for k,v in a.items() if k not in cols2}
    cols={**cols1,**cols2}
    res_cols={k:v for k,v in res_cols.items() if k not in list(cols.values())}
    for col1,d_type1 in a.items():
        x=[col2 for col2,d_type2 in res_cols.items() if ((len(col1)>3 and len(col2)>3) and ((col1 in col2) or (col2 in col1))) and (d_type1==d_type2)]
        if len(x)==1:
            cols[col1]=x[0]
    matching_columns=cols
    c={k:v for k,v in c.items() if k not in matching_columns and v=='object'}
    return (matching_columns,list(c))

In [15]:
def merge_list(l,matching_tables_dict):
    t1=pd.read_csv(l[0])
    t1.rename(columns=matching_tables_dict[l[0]],inplace=True)
    a={i:j for i,j in zip(t1.columns,[str(i) for i in t1.dtypes])}
    t2=pd.read_csv(l[1])
    t2.rename(columns=matching_tables_dict[l[1]],inplace=True)
    b={i:j for i,j in zip(t2.columns,[str(i) for i in t2.dtypes])}
    matching_columns,c=generate_matching_columns(a,b)
    t2.rename(columns = matching_columns,inplace=True)
    try:
        res=t1.merge(t2,how='outer')
    except:
        return -1
    for fname in l[2:]:
        res_cols={i:j for i,j in zip(res.columns,[str(i) for i in res.dtypes])}
        t=pd.read_csv(fname)
        t.rename(columns=matching_tables_dict[fname],inplace=True)
        fname={i:j for i,j in zip(t.columns,[str(i) for i in t.dtypes])}
        matching_columns,c=generate_matching_columns(res_cols,fname)
        t.rename(columns = matching_columns,inplace=True)
        try:
            res=res.merge(t,how='outer')
        except:
            return -1
    res_cols={i:j for i,j in zip(res.columns,[str(i) for i in res.dtypes])}
    a=input_schema
    cols1={col1:col1 for col1,d_type1 in res_cols.items() if (col1 in a) and d_type1==input_schema[col1]}
    a={k:v for k,v in a.items() if k not in cols1}
    a_onto=generate_ontology(a)
    cols2={col1:col for col1,d_type1 in res_cols.items() for col in a_onto if col1 in a_onto[col] and (d_type1==a[col])}
    a={k:v for k,v in a.items() if k not in cols2}
    cols={**cols1,**cols2}
    res_cols={k:v for k,v in res_cols.items() if k not in list(cols.values())}
    for col1,d_type1 in a.items():
        x=[col2 for col2,d_type2 in res_cols.items() if ((len(col1)>3 and len(col2)>3) and ((col1 in col2) or (col2 in col1))) and (d_type1==d_type2)]
        if len(x)==1:
            cols[x[0]]=col1
    res.rename(columns = cols,inplace=True)
    for col in res.columns:
        if col not in input_schema and (res[col].count()<(0.70*len(res))):
            res.drop(columns=[col],inplace=True)
    return res

In [62]:
# returns a merged table of all tables given in input list l    
def merge_list_knowledge_graph(l,matching_tables_dict):
    global kg_data
    kg_data={}
    t1=pd.read_csv(l[0])
    t1.rename(columns=matching_tables_dict[l[0]],inplace=True)
    a={i:j for i,j in zip(t1.columns,[str(i) for i in t1.dtypes])}
    t2=pd.read_csv(l[1])
    t2.rename(columns=matching_tables_dict[l[1]],inplace=True)
    b={i:j for i,j in zip(t2.columns,[str(i) for i in t2.dtypes])}
    matching_columns,c=generate_matching_columns(a,b)
    gen_kg_data(c,t1,t2)
    if len(c)>=2:
        t1,t2,c=graph_match(c,t1,t2)
        while c!=0:
            t1,t2,c=graph_match(c,t1,t2)
    t2.rename(columns = matching_columns,inplace=True)
    try:
        res=t1.merge(t2,how='outer')
    except:
        return -1
    for fname in l[2:]:
        res_cols={i:j for i,j in zip(res.columns,[str(i) for i in res.dtypes])}
        t=pd.read_csv(fname)
        t.rename(columns=matching_tables_dict[fname],inplace=True)
        fname={i:j for i,j in zip(t.columns,[str(i) for i in t.dtypes])}
        matching_columns,c=generate_matching_columns(res_cols,fname)
        t.rename(columns = matching_columns,inplace=True)
        gen_kg_data(c,res,t)
        if len(c)>=2:
            res,t,c=graph_match(c,res,t)
            while c!=0:
                res,t,c=graph_match(c,res,t)
        try:
            res=res.merge(t,how='outer')
        except:
            return -1
    res_cols={i:j for i,j in zip(res.columns,[str(i) for i in res.dtypes])}
    a=input_schema
    cols1={col1:col1 for col1,d_type1 in res_cols.items() if (col1 in a) and d_type1==input_schema[col1]}
    a={k:v for k,v in a.items() if k not in cols1}
    a_onto=generate_ontology(a)
    cols2={col1:col for col1,d_type1 in res_cols.items() for col in a_onto if col1 in a_onto[col] and (d_type1==a[col])}
    a={k:v for k,v in a.items() if k not in cols2}
    cols={**cols1,**cols2}
    res_cols={k:v for k,v in res_cols.items() if k not in list(cols.values())}
    for col1,d_type1 in a.items():
        x=[col2 for col2,d_type2 in res_cols.items() if ((len(col1)>3 and len(col2)>3) and ((col1 in col2) or (col2 in col1))) and (d_type1==d_type2)]
        if len(x)==1:
            cols[x[0]]=col1
    res.rename(columns = cols,inplace=True)
    for col in res.columns:
        if col not in input_schema and (res[col].count()<(0.70*len(res))):
            res.drop(columns=[col],inplace=True)
    return res

In [17]:
# metrics calculated :
# 1)nan_score(number of nulls in each column)
# 2)coverage_score(no of matching columns with input schema/total number of columns in input schema)
# 3)completeness_score(a combination of coverage and nan scores to determine how complete the result dataset is)

# nan score = {x : (no on nans in column/no of entries in column)} where x is each column in the table
# gives the nan score(no on nans/no of entries in table) for each column in the input table
def nan_score(table=-1,fname=-1):
    if fname!=-1:
        table=pd.read_csv(fname)
    nan_count={}
    a=len(table)
    for i in table.columns:
        x=a-table[i].count()
        s=str(x)+'/'+str(a)
        nan_count[i]=s
    return nan_count

#returns the coverage score and completeness score of a given table
#coverage score is calculated as : 
# coverage = (no of columns matching with input schema/total number of columns in input schema)
#completeness score is calculated as : 
# completeness = (sum(x*(non null entries)/(total entries in the column))/total number of columns in input schema) 
#  where x=1 if column present in input schema and x=0 if column is not present in the input schema
def coverage_and_completeness(table):
    ctr=0
    comp=0.0
    cols=table.columns
    l=len(table)
    for col in cols:
        if col in input_schema:
            ctr+=1
            comp+=((l-sum(pd.isnull(table[col])))/l)
    comp=comp/len(input_schema)
    cov=ctr/len(input_schema)
    return (cov,comp)

In [18]:
def ranking_display(comp_score,no_of_rows):
    print()
    comp_rank=sorted(comp_score,reverse=True)
    count=0
    l=sorted(no_of_rows, key=lambda k: no_of_rows[k],reverse=True)
    for i in comp_rank:
        if len(comp_score[i])==1:
            count+=1
            print("Rank ",str(count).ljust(2," ")," : ",comp_score[i][0].ljust(20,' '),"completeness score : %0.16f"%(i),"\t\tnumber of rows: ",no_of_rows[comp_score[i][0]])
        else:
            for j in l:
                if j in comp_score[i]:
                    count+=1
                    print("Rank ",str(count).ljust(2," ")," : ",j.ljust(20,' '),"completeness score : %0.16f"%(i),"\t\tnumber of rows: ",no_of_rows[j])
    print("\nRanking Complete!!\n")

In [19]:
# takes output tables schema(columns and data_types) as the input, compare it with input schema and transformations required and
# returns a list of all the transformations applicable
def get_possible_transformations(cols):
    return {col:[i for i in tran if i in transform_funct_list[input_schema[col]]] for col,tran in transformations.items() if col in cols}

In [20]:
# function returns a boolean result after checking if confition is satisfied
def condition_check(sign,v,value):
    x={">=":v>=value,"<=":v<=value,"<":v<value,">":v>value,"=":v==value,"!=":v!=value}
    return x[sign]

In [21]:
# function returns all stastical conditions that are satisfied as well as unsatisfied
def check_stats(res):
    cols=res.columns; satisfied={}; unsatisfied={}
    for col in stats:
        if col in cols:
            if input_schema[col]=="int64" or input_schema[col]=="float64":
                for condition in stats[col]:
                    sign=[i for i in signs if i in condition][0]
                    funct,value=condition.split(sign)[0],condition.split(sign)[1]
                    funct=funct.lower()
                    if funct in functs_onto:
                        pass
                    else:
                        for f in functs_onto:
                            if funct in functs_onto[f]:
                                funct=f
                    v=res[col].describe()[funct]
                    if condition_check(sign,v,int(value)):
                        s=condition+"( "+str(v)+sign+value+" )"
                        if col in satisfied:
                            satisfied[col].append(s)
                        else:
                            satisfied[col]=[]
                            satisfied[col].append(s)
                    else:
                        s=condition+"( "+funct+" = "+str(v)+" )"
                        if col in unsatisfied:
                            unsatisfied[col].append(s)
                        else:
                            unsatisfied[col]=[]
                            unsatisfied[col].append(s)
        else:
            unsatisfied[col+"(Column Not Present)"]=stats[col]
    return (satisfied,unsatisfied)

In [40]:
# this is a function to print the individual tables names, their nan score, columns that match with input schema, 
# coverage score, and completeness score along with possible transformations if any
def display_individual_matches(matching_tables,matching_tables_dict):
    global comp_score
    global no_of_rows
    f=open("output_folder_transformation/results.txt",'a')
    if transformations==-1:
        flag=0
    else:
        flag=1
    for i in matching_tables:
        print(i+"("+all_tablenames[i]+")",file=f)
        print(i+"("+all_tablenames[i]+")")
        res=pd.read_csv(i)
        res.rename(columns=matching_tables_dict[i],inplace=True)
        cov,comp=coverage_and_completeness(res)
        if flag==1:
            cols=res.columns
            l=get_possible_transformations(cols)
            print("possible transformations are : ",l,file=f)
            print("possible transformations are : ",l)
            cov,comp=coverage_and_completeness(res)
        if stats!=-1:
            sat,unsat=check_stats(res)
            if len(sat)>0:
                print("Conditions Satisfied : ",sat,file=f)
                print("Conditions Satisfied : ",sat)
            if len(unsat)>0:
                print("Conditions NOT SATISFIED : ",unsat,file=f)
                print("Conditions NOT SATISFIED : ",unsat)
        print('Missing Values(NANs score): ',nan_score(fname=i),file=f)
        print('Missing Values(NANs score): ',nan_score(fname=i))
        print("Columns that match with input_schema:\n "+i+' : ',matching_tables_dict[i],file=f)
        print("Columns that match with input_schema:\n "+i+' : ',matching_tables_dict[i])
        print("Coverage Score : ",cov,"\t Completeness Score : ",comp,file=f)
        print("Coverage Score : ",cov,"\t Completeness Score : ",comp)
        print(file=f)
        print()
        no_of_rows[i]=len(res)
        if comp in comp_score:
            comp_score[comp].append(i)
        else:
            comp_score[comp]=[]
            comp_score[comp].append(i)
        display(res)

In [98]:
# the main function that is to be invoked and will call all the required functions to obtain the required matches and merges
# we have a list with all possible tables that are matches
# we generate all combinations of them in order to merge them
# a valid combination is one where every pair of tables have 50% cosine column and category similarity
# then we merge them and calculate the coverage score, null score and the completeness score
# all these details are displayed for each of the valid merges as well as individual tables
# the output is displayed in the output file "results.txt" along with the outputs in csv form
def get_matches(kg=-1):
    global comp_score
    global no_of_rows
    comp_score={}
    no_of_rows={}
    if input_categories==-1:
        print('only schema')
        matching_tables_dict=check_possible_extended_matches()
    else:
        print('category and schema')
        matching_tables_dict=check_possible_category_included_matches()
    matching_tables=list(matching_tables_dict)
    with open("output_folder_transformation/results.txt",'a') as f:
        print('******************************',file=f)
        print(file=f)
        print("All Possible Matches",file=f)
        print(file=f)
        print("Matching Tables : ",matching_tables,file=f)
#         print("Matching Tables : ",matching_tables)
#         for i in matching_tables:
#             print(i,'\t',all_tablenames[i])
        if transformations==-1:
            print("\nNo Transformations In Input Schema",file=f)
            print(file=f)
        else:
            print('\nTransformations detected from input are : ',transformations,file=f)
            print(file=f)
    op_str1='output_folder_transformation/'
    op_str2='.csv'
    x=len(matching_tables)
    print()
    if x==0:
        with open("output_folder_transformation/results.txt",'a') as f:
            print("NO MATCHES FOUND",file=f)
            print(file=f)
    elif x==1:
#         display_individual_matches(matching_tables,matching_tables_dict)
        print("Only one match found!! Rank 1 : ",matching_tables[0])
    elif x==2:
        ctr=0
        a=matching_tables[0]+' : '+matching_tables[1]
        b=matching_tables[1]+' : '+matching_tables[0]
        if (a in cat_sim):
            if (cat_sim[a]>.50 and col_sim[a]>0.50):
                res=merge_list_knowledge_graph([matching_tables[0],matching_tables[1]],matching_tables_dict)
        elif(b in cat_sim):
            if (cat_sim[b]>.50 and col_sim[b]>0.50):
                res=merge_list_knowledge_graph([matching_tables[0],matching_tables[1]],matching_tables_dict)
        else:
            continue
        op_string=op_str1+'1'+op_str2
        res.to_csv(op_string,sep=',', index=False)
        cols=res.columns
        cov,comp=coverage_and_completeness(res)
        with open("output_folder_transformation/results.txt",'a') as f:
            print(op_string,file=f)
            if transformations!=-1:
                l=get_possible_transformations(cols)
                print("possible transformations are : ",l,file=f)
            if stats!=-1:
                sat,unsat=check_stats(res)
                if len(sat)>0:
                    print("Conditions Satisfied : ",sat,file=f)
                if len(unsat)>0:
                    print("Conditions Not Satisfied : ",unsat,file=f)
            print('Missing Values(NANs): ',nan_score(table=res),file=f)
            print("Columns that match with input_schema: ",file=f)
            for j in matching_tables:
                print(j+' : ',matching_tables_dict[j],file=f)
            print("Coverage Score : ",cov,"\t Completeness Score : ",comp,"\t Number of Rows : ",len(res),file=f)
            print(file=f)
            display(res)
            no_of_rows["output_"+str(ctr)+op_str2]=len(res)
            if comp in comp_score:
                comp_score[comp].append("output_"+str(ctr)+op_str2)
            else:
                comp_score[comp]=[]
                comp_score[comp].append("output_"+str(ctr)+op_str2)
#         display_individual_matches(matching_tables,matching_tables_dict)
    else:
        ctr=0
        count_comb=len(matching_tables)
        all_combos=generate_all_combinations(matching_tables)
        for i in range(count_comb,1,-1):
            for l in all_combos[i]:
                a=list(l)
                comb=list(combinations(a,2))
                flag=0
                for pair in comb:
                    if flag==0:
                        t1,t2=pair
                        if ((t1+' : '+t2) in cat_sim):
                            if (cat_sim[t1+' : '+t2]>=.50 and col_sim[t1+' : '+t2]>=0.50):
                                pass
                            else:
                                flag=1
                        else:
                            flag=1
                    else:
                        break
                if flag==0:
                    if kg==1:
                        res=merge_list_knowledge_graph(a,matching_tables_dict)
                    else:
                        res=merge_list(a,matching_tables_dict)
                    if type(res) != type(-1):
                        ctr+=1
                        op_string=op_str1+str(ctr)+op_str2
                        res.to_csv(op_string,sep=',', index=False)
                        cols=res.columns
                        cov,comp=coverage_and_completeness(res)
                        with open("output_folder_transformation/results.txt",'a') as f:
                            print(str(ctr)+op_str2+' : ',end='',file=f)
                            print(str(ctr)+op_str2+' : ',end='')
                            for j in a:
                                print(j+'('+all_tablenames[j]+')'+'\t\t',end='',file=f)
                                print(j+'('+all_tablenames[j]+')'+'\t\t',end='')
                            print(file=f)
                            print()
                            if transformations!=-1:
                                l=get_possible_transformations(cols)
                                print("possible transformations are : ",l,file=f)
                                print("possible transformations are : ",l)
                            if stats!=-1:
                                sat,unsat=check_stats(res)
                                if len(sat)>0:
                                    print("Conditions Satisfied : ",sat,file=f)
                                    print("Conditions Satisfied : ",sat)
                                if len(unsat)>0:
                                    print("Conditions Not Satisfied : ",unsat,file=f)
                                    print("Conditions Not Satisfied : ",unsat)
                            print('Missing Values(NANs): ',nan_score(table=res),file=f)
                            print('Missing Values(NANs): ',nan_score(table=res))
                            print("Columns that match with input_schema: ",file=f)
                            print("Columns that match with input_schema: ")
                            for j in a:
                                print(j+' : ',matching_tables_dict[j],file=f)
                                print(j+' : ',matching_tables_dict[j])
                            print("Coverage Score : ",cov,"\t Completeness Score : ",comp,file=f)
                            print("Coverage Score : ",cov,"\t Completeness Score : ",comp)
                            display(res)
                            print()
                            print(file=f)
                            no_of_rows["output_"+str(ctr)+op_str2]=len(res)
                            if comp in comp_score:
                                comp_score[comp].append("output_"+str(ctr)+op_str2)
                            else:
                                comp_score[comp]=[]
                                comp_score[comp].append("output_"+str(ctr)+op_str2)
#         display_individual_matches(matching_tables,matching_tables_dict)
    with open("output_folder_transformation/results.txt",'a') as f:
        print('******************************',file=f)
    ranking_display(comp_score,no_of_rows)

SyntaxError: 'continue' not properly in loop (cell_name, line 60)

In [24]:
s='{"schema": {"round": "int64", "round 1": "object", "circuit": "object", "day": "object", "pole position": "object", "fastest lap": "object", "driver": "object"}}'
s=s.lower()
s=json.loads(s)
input_schema=s['schema'];input_categories=-1;transformations=-1;stats=-1
if "categories" in s:
    input_categories=s["categories"]
if "transformations" in s:
    transformations=s["transformations"]
if "stats" in s:
    stats=s["stats"]
# display query
print("Input Schema : ",input_schema)
if input_categories==-1:
    print("Input Categories : None")
else:
    print("Input Categories : ",input_categories)
if transformations==-1:
    print("Input Transformations : None")
else:
    print("Input Transformations : ",transformations)
if stats==-1:
    print("Input Condition based conditions : None")
else:
    print("Input Content based Conditions : ",stats)
print()
comp_score={}
no_of_rows={}
no_of_rows={}
direct_matches_only()
ranking_display(comp_score,no_of_rows)

Input Schema :  {'round': 'int64', 'round 1': 'object', 'circuit': 'object', 'day': 'object', 'pole position': 'object', 'fastest lap': 'object', 'driver': 'object'}
Input Categories : None
Input Transformations : None
Input Condition based conditions : None

Matching Tables 
203_514.csv  :  2008 Superleague Formula season
204_253.csv  :  1990 Superbike World Championship season
204_40.csv  :  2008 Superbike World Championship season
204_569.csv  :  1998 Swedish Touring Car Championship season
204_845.csv  :  2003 Barber Dodge Pro Series season


203_514.csv(2008 Superleague Formula season)
Missing Values(NANs score):  {'round': '0/12', 'round 1': '0/12', 'race': '0/12', 'date': '0/12', 'pole position': '6/12', 'fastest lap': '0/12', 'winning club': '0/12', 'winning team': '0/12', 'report': '0/12'}
Columns that match with input_schema:
 203_514.csv :  {'round': 'round', 'round 1': 'round 1', 'pole position': 'pole position', 'fastest lap': 'fastest lap'}
Coverage Score :  0.57142857142

In [53]:
s='{"schema": {"round": "int64", "round 1": "object", "circuit": "object", "day": "object", "pole position": "object", "fastest lap": "object", "driver": "object"}}'
s=s.lower()
s=json.loads(s)
input_schema=s['schema'];input_categories=-1;transformations=-1;stats=-1
if "categories" in s:
    input_categories=s["categories"]
if "transformations" in s:
    transformations=s["transformations"]
if "stats" in s:
    stats=s["stats"]
# display query
print("Input Schema : ",input_schema)
if input_categories==-1:
    print("Input Categories : None")
else:
    print("Input Categories : ",input_categories)
if transformations==-1:
    print("Input Transformations : None")
else:
    print("Input Transformations : ",transformations)
if stats==-1:
    print("Input Condition based conditions : None")
else:
    print("Input Content based Conditions : ",stats)
print()
comp_score={}
no_of_rows={}
no_of_rows={}
check_possible_extended_matches()
ranking_display(comp_score,no_of_rows)

Input Schema :  {'round': 'int64', 'round 1': 'object', 'circuit': 'object', 'day': 'object', 'pole position': 'object', 'fastest lap': 'object', 'driver': 'object'}
Input Categories : None
Input Transformations : None
Input Condition based conditions : None

Matching Tables
203_181.csv  :  1990 IndyCar season
203_408.csv  :  1989 Formula One season
203_514.csv  :  2008 Superleague Formula season
203_742.csv  :  1995 IndyCar season
204_253.csv  :  1990 Superbike World Championship season
204_40.csv  :  2008 Superbike World Championship season
204_455.csv  :  1989 Formula One season
204_569.csv  :  1998 Swedish Touring Car Championship season
204_63.csv  :  2002 Italian Formula Three season
204_845.csv  :  2003 Barber Dodge Pro Series season


203_181.csv(1990 IndyCar season)
Missing Values(NANs score):  {'date': '0/17', 'rnd': '0/17', 'race name': '0/17', 'circuit': '0/17', 'city/location': '0/17', 'pole position': '0/17', 'winning driver': '0/17', 'winning team': '0/17', 'report': '0/

Unnamed: 0,day,rnd,race name,circuit,city/location,pole position,driver,winning team,report
0,1,April 8,Autoworks 200,Phoenix International Raceway,"Phoenix, Arizona",Rick Mears,Rick Mears,Team Penske,Report
1,2,April 22,Toyota Long Beach Grand Prix,Streets of Long Beach,"Long Beach, California","Al Unser, Jr.","Al Unser, Jr.",Galles-Kraco Racing,Report
2,3,May 27,74th Indianapolis 500,Indianapolis Motor Speedway,"Speedway, Indiana",Emerson Fittipaldi,Arie Luyendyk,Doug Shierson Racing,Report
3,4,June 3,Miller Genuine Draft 200,Milwaukee Mile,"West Allis, Wisconsin",Rick Mears,"Al Unser, Jr.",Galles-Kraco Racing,Report
4,5,June 17,Valvoline Grand Prix of Detroit,Streets of Detroit,"Detroit, Michigan",Michael Andretti,Michael Andretti,Newman/Haas Racing,Report
5,6,June 24,Budweiser/G.I.Joe's 200,Portland International Raceway,"Portland, Oregon",Danny Sullivan,Michael Andretti,Newman/Haas Racing,Report
6,7,July 8,Budweiser Grand Prix of Cleveland,Cleveland Burke Lakefront Airport,"Cleveland, Ohio",Rick Mears,Danny Sullivan,Team Penske,Report
7,8,July 15,Marlboro Grand Prix at the Meadowlands,Meadowlands Sports Complex,"East Rutherford, New Jersey",Michael Andretti,Michael Andretti,Newman/Haas Racing,Report
8,9,July 22,Molson Indy Toronto,Exhibition Place,"Toronto, Ontario",Danny Sullivan,"Al Unser, Jr.",Galles-Kraco Racing,Report
9,10,August 5,Marlboro 500,Michigan International Speedway,"Brooklyn, Michigan",Emerson Fittipaldi,"Al Unser, Jr.",Galles-Kraco Racing,Report


203_408.csv(1989 Formula One season)
Missing Values(NANs score):  {'rd ': '0/16', 'grand prix': '0/16', 'date': '0/16', 'location': '0/16', 'pole position': '0/16', 'fastest lap': '0/16', 'winning driver': '0/16', 'constructor': '0/16', 'report': '0/16'}
Columns that match with input_schema:
 203_408.csv :  {'pole position': 'pole position', 'fastest lap': 'fastest lap', 'date': 'day', 'winning driver': 'driver'}
Coverage Score :  0.5714285714285714 	 Completeness Score :  0.5714285714285714



Unnamed: 0,rd,grand prix,day,location,pole position,fastest lap,driver,constructor,report
0,1,Brazilian Grand Prix,26 March,Jacarepaguá,Ayrton Senna,Riccardo Patrese,Nigel Mansell,Ferrari,Report
1,2,San Marino Grand Prix,23 April,Imola,Ayrton Senna,Alain Prost,Ayrton Senna,McLaren-Honda,Report
2,3,Monaco Grand Prix,7 May,Monaco,Ayrton Senna,Alain Prost,Ayrton Senna,McLaren-Honda,Report
3,4,Mexican Grand Prix,28 May,Hermanos Rodriguez,Ayrton Senna,Nigel Mansell,Ayrton Senna,McLaren-Honda,Report
4,5,United States Grand Prix,4 June,Phoenix,Ayrton Senna,Ayrton Senna,Alain Prost,McLaren-Honda,Report
5,6,Canadian Grand Prix,18 June,Circuit Gilles Villeneuve,Alain Prost,Jonathan Palmer,Thierry Boutsen,Williams-Renault,Report
6,7,French Grand Prix,9 July,Paul Ricard,Alain Prost,Mauricio Gugelmin,Alain Prost,McLaren-Honda,Report
7,8,British Grand Prix,16 July,Silverstone,Ayrton Senna,Nigel Mansell,Alain Prost,McLaren-Honda,Report
8,9,German Grand Prix,30 July,Hockenheimring,Ayrton Senna,Ayrton Senna,Ayrton Senna,McLaren-Honda,Report
9,10,Hungarian Grand Prix,13 August,Hungaroring,Riccardo Patrese,Nigel Mansell,Nigel Mansell,Ferrari,Report


203_514.csv(2008 Superleague Formula season)
Missing Values(NANs score):  {'round': '0/12', 'round 1': '0/12', 'race': '0/12', 'date': '0/12', 'pole position': '6/12', 'fastest lap': '0/12', 'winning club': '0/12', 'winning team': '0/12', 'report': '0/12'}
Columns that match with input_schema:
 203_514.csv :  {'round': 'round', 'round 1': 'round 1', 'pole position': 'pole position', 'fastest lap': 'fastest lap', 'date': 'day'}
Coverage Score :  0.7142857142857143 	 Completeness Score :  0.6428571428571429



Unnamed: 0,round,round 1,race,day,pole position,fastest lap,winning club,winning team,report
0,1,R1,Donington Park,August 31,Beijing Guoan,Beijing Guoan,Beijing Guoan,Zakspeed,Report
1,1,R2,Donington Park,August 31,,PSV Eindhoven,Sevilla FC,GTA Motor Competicion,Report
2,2,R1,Nurburgring,September 21,A.C. Milan,PSV Eindhoven,A.C. Milan,Scuderia Playteam,Report
3,2,R2,Nurburgring,September 21,,SC Corinthians,PSV Eindhoven,Azerti Motorsport,Report
4,3,R1,Zolder,October 5,Borussia Dortmund,Liverpool F.C.,Liverpool F.C.,Hitech Junior Team,Report
5,3,R2,Zolder,October 5,,Atletico Madrid,Beijing Guoan,Zakspeed,Report
6,4,R1,Estoril,October 19,A.S. Roma,Atletico Madrid,Liverpool F.C.,Hitech Junior Team,Report
7,4,R2,Estoril,October 19,,Borussia Dortmund,Al Ain,Azerti Motorsport,Report
8,5,R1,Vallelunga,November 2,Liverpool F.C.,Beijing Guoan,Beijing Guoan,Zakspeed,Report
9,5,R2,Vallelunga,November 2,,Atletico Madrid,F.C. Porto,Hitech Junior Team,Report


203_742.csv(1995 IndyCar season)
Missing Values(NANs score):  {'rnd': '0/17', 'date': '0/17', 'race name': '0/17', 'circuit': '0/17', 'city/location': '0/17', 'pole position': '0/17', 'fastest lap': '0/17', 'winning driver': '0/17', 'winning team': '0/17', 'report': '0/17'}
Columns that match with input_schema:
 203_742.csv :  {'circuit': 'circuit', 'pole position': 'pole position', 'fastest lap': 'fastest lap', 'date': 'day', 'winning driver': 'driver'}
Coverage Score :  0.7142857142857143 	 Completeness Score :  0.7142857142857143



Unnamed: 0,rnd,day,race name,circuit,city/location,pole position,fastest lap,driver,winning team,report
0,1,March 5,Marlboro Grand Prix of Miami Presented by Toyota,Bicentennial Park,"Miami, Florida",Michael Andretti,Scott Pruett,Jacques Villeneuve,Team Green,Report
1,2,March 19,Australian IndyCar Grand Prix,Surfers Paradise Street Circuit,"Surfers Paradise, Australia",Michael Andretti,Michael Andretti,Paul Tracy,Newman/Haas Racing,Report
2,3,April 2,Slick 50 200,Phoenix International Raceway,"Phoenix, Arizona",Bryan Herta,Emerson Fittipaldi,Robby Gordon,Walker Racing,Report
3,4,April 9,Toyota Grand Prix of Long Beach,Streets of Long Beach,"Long Beach, California",Michael Andretti,Michael Andretti,"Al Unser, Jr.",Marlboro Team Penske,Report
4,5,April 23,Bosch Spark Plug Grand Prix,Nazareth Speedway,"Nazareth, Pennsylvania",Robby Gordon,Emerson Fittipaldi,Emerson Fittipaldi,Marlboro Team Penske,Report
5,6,May 28,79th Indianapolis 500*,Indianapolis Motor Speedway,"Speedway, Indiana",Scott Brayton,Scott Goodyear,Jacques Villeneuve,Team Green,Report
6,7,June 4,Miller Genuine Draft 200,Milwaukee Mile,"West Allis, Wisconsin",Teo Fabi,Teo Fabi,Paul Tracy,Newman/Haas Racing,Report
7,8,June 11,ITT Automotive Grand Prix of Detroit,The Raceway on Belle Isle Park,"Detroit, Michigan",Robby Gordon,Michael Andretti,Robby Gordon,Walker Racing,Report
8,9,June 25,Budweiser/G. I. Joe's 200,Portland International Raceway,"Portland, Oregon",Jacques Villeneuve,"Al Unser, Jr.","Al Unser, Jr.",Marlboro Team Penske,Report
9,10,July 9,Texaco/Havoline 200,Road America,"Elkhart Lake, Wisconsin",Jacques Villeneuve,Jacques Villeneuve,Jacques Villeneuve,Team Green,Report


204_253.csv(1990 Superbike World Championship season)
Missing Values(NANs score):  {'round': '0/26', 'round 1': '0/26', 'circuit': '0/26', 'date': '0/26', 'pole position': '0/26', 'fastest lap': '0/26', 'winning rider': '0/26'}
Columns that match with input_schema:
 204_253.csv :  {'round': 'round', 'round 1': 'round 1', 'circuit': 'circuit', 'pole position': 'pole position', 'fastest lap': 'fastest lap', 'date': 'day'}
Coverage Score :  0.8571428571428571 	 Completeness Score :  0.8571428571428571



Unnamed: 0,round,round 1,circuit,day,pole position,fastest lap,winning rider
0,1,R1,Jerez,18 March,Raymond Roche,Stephane Mertens,Raymond Roche
1,1,R2,Jerez,18 March,Raymond Roche,Raymond Roche,Raymond Roche
2,2,R1,Donington,16 April,Giancarlo Falappa,Rob Phillis,Fred Merkel
3,2,R2,Donington,16 April,Giancarlo Falappa,Raymond Roche,Giancarlo Falappa
4,3,R1,Hungaroring,30 April,Malcolm Campbell,Raymond Roche,Fred Merkel
5,3,R2,Hungaroring,30 April,Malcolm Campbell,Fred Merkel,Raymond Roche
6,4,R1,Hockenheim,6 May,Raymond Roche,Fred Merkel,Fred Merkel
7,4,R2,Hockenheim,6 May,Raymond Roche,Raymond Roche,Stephane Mertens
8,5,R1,Mosport,3 June,Giancarlo Falappa,Raymond Roche,Raymond Roche
9,5,R2,Mosport,3 June,Giancarlo Falappa,Jamie James,Raymond Roche


204_40.csv(2008 Superbike World Championship season)
Missing Values(NANs score):  {'round': '0/28', 'round 1': '0/28', 'country': '0/28', 'circuit': '0/28', 'date': '0/28', 'pole position': '0/28', 'fastest lap': '0/28', 'winning rider': '0/28', 'winning team': '0/28', 'report': '0/28'}
Columns that match with input_schema:
 204_40.csv :  {'round': 'round', 'round 1': 'round 1', 'circuit': 'circuit', 'pole position': 'pole position', 'fastest lap': 'fastest lap', 'date': 'day'}
Coverage Score :  0.8571428571428571 	 Completeness Score :  0.8571428571428571



Unnamed: 0,round,round 1,country,circuit,day,pole position,fastest lap,winning rider,winning team,report
0,1,R1,Qatar,Losail,23 February,Troy Corser,Noriyuki Haga,Troy Bayliss,Xerox Ducati,Report
1,1,R2,Qatar,Losail,23 February,Troy Corser,Fonsi Nieto,Fonsi Nieto,Alstare Suzuki,Report
2,2,R1,Australia,Phillip Island,2 March,Troy Bayliss,Troy Bayliss,Troy Bayliss,Xerox Ducati,Report
3,2,R2,Australia,Phillip Island,2 March,Troy Bayliss,Max Biaggi,Troy Bayliss,Xerox Ducati,Report
4,3,R1,Spain,Valencia,6 April,Max Neukirchner,Noriyuki Haga,Lorenzo Lanzi,Team R.G,Report
5,3,R2,Spain,Valencia,6 April,Max Neukirchner,Carlos Checa,Noriyuki Haga,Yamaha Motor Italia,Report
6,4,R1,Netherlands,Assen,27 April,Troy Bayliss,Max Neukirchner,Troy Bayliss,Xerox Ducati,Report
7,4,R2,Netherlands,Assen,27 April,Troy Bayliss,Troy Bayliss,Troy Bayliss,Xerox Ducati,Report
8,5,R1,Italy,Monza,11 May,Troy Bayliss,Noriyuki Haga,Max Neukirchner,Alstare Suzuki,Report
9,5,R2,Italy,Monza,11 May,Troy Bayliss,Noriyuki Haga,Noriyuki Haga,Yamaha Motor Italia,Report


204_455.csv(1989 Formula One season)
Missing Values(NANs score):  {'rd ': '0/16', 'grand prix': '0/16', 'date': '0/16', 'location': '0/16', 'pole position': '0/16', 'fastest lap': '0/16', 'winning driver': '0/16', 'constructor': '0/16', 'report': '0/16'}
Columns that match with input_schema:
 204_455.csv :  {'pole position': 'pole position', 'fastest lap': 'fastest lap', 'date': 'day', 'winning driver': 'driver'}
Coverage Score :  0.5714285714285714 	 Completeness Score :  0.5714285714285714



Unnamed: 0,rd,grand prix,day,location,pole position,fastest lap,driver,constructor,report
0,1,Brazilian Grand Prix,26 March,Jacarepaguá,Ayrton Senna,Riccardo Patrese,Nigel Mansell,Ferrari,Report
1,2,San Marino Grand Prix,23 April,Imola,Ayrton Senna,Alain Prost,Ayrton Senna,McLaren-Honda,Report
2,3,Monaco Grand Prix,7 May,Monaco,Ayrton Senna,Alain Prost,Ayrton Senna,McLaren-Honda,Report
3,4,Mexican Grand Prix,28 May,Hermanos Rodriguez,Ayrton Senna,Nigel Mansell,Ayrton Senna,McLaren-Honda,Report
4,5,United States Grand Prix,4 June,Phoenix,Ayrton Senna,Ayrton Senna,Alain Prost,McLaren-Honda,Report
5,6,Canadian Grand Prix,18 June,Circuit Gilles Villeneuve,Alain Prost,Jonathan Palmer,Thierry Boutsen,Williams-Renault,Report
6,7,French Grand Prix,9 July,Paul Ricard,Alain Prost,Mauricio Gugelmin,Alain Prost,McLaren-Honda,Report
7,8,British Grand Prix,16 July,Silverstone,Ayrton Senna,Nigel Mansell,Alain Prost,McLaren-Honda,Report
8,9,German Grand Prix,30 July,Hockenheimring,Ayrton Senna,Ayrton Senna,Ayrton Senna,McLaren-Honda,Report
9,10,Hungarian Grand Prix,13 August,Hungaroring,Riccardo Patrese,Nigel Mansell,Nigel Mansell,Ferrari,Report


204_569.csv(1998 Swedish Touring Car Championship season)
Missing Values(NANs score):  {'round': '0/12', 'round 1': '0/12', 'circuit': '0/12', 'date': '0/12', 'pole position': '6/12', 'fastest lap': '0/12', 'winning driver': '0/12', 'winning team': '0/12', 'winning privateer': '0/12'}
Columns that match with input_schema:
 204_569.csv :  {'round': 'round', 'round 1': 'round 1', 'circuit': 'circuit', 'pole position': 'pole position', 'fastest lap': 'fastest lap', 'date': 'day', 'winning driver': 'driver'}
Coverage Score :  1.0 	 Completeness Score :  0.9285714285714286



Unnamed: 0,round,round 1,circuit,day,pole position,fastest lap,driver,winning team,winning privateer
0,1,R1,Mantorp Park,10 May,Mattias Ekström,Fredrik Ekblom,Mats Linden,Kristoffersson Motorsport,Richard Göransson
1,1,R2,Mantorp Park,10 May,,Peggen Andersson,Fredrik Ekblom,BMW Dealer Team,Pontus Mörth
2,2,R3,Karlskoga-Gelleråsen,31 May,Jan Nilsson,Jan Nilsson,Jan Nilsson,Flash Engineering,Pontus Mörth
3,2,R4,Karlskoga-Gelleråsen,31 May,,Fredrik Ekblom,Peggen Andersson,BMW Dealer Team,Pontus Mörth
4,3,R5,Anderstorp,28 June,Mattias Ekström,Fredrik Ekblom,Jan Nilsson,Flash Engineering,Pontus Mörth
5,3,R6,Anderstorp,28 June,,Peggen Andersson,Peggen Andersson,BMW Dealer Team,Georg Bakajev
6,4,R7,Falkenberg,9 July,Jens Edman,Jens Edman,Jan Nilsson,Flash Engineering,Pontus Mörth
7,4,R8,Falkenberg,9 July,,Mattias Ekström,Jens Edman,Flash Engineering,Pontus Mörth
8,5,R9,Ring Knutstorp,6 September,Fredrik Ekblom,Fredrik Ekblom,Fredrik Ekblom,BMW Dealer Team,Pontus Mörth
9,5,R10,Ring Knutstorp,6 September,,Mats Linden,Jan Nilsson,Flash Engineering,Georg Bakajev


204_63.csv(2002 Italian Formula Three season)
Missing Values(NANs score):  {'round': '0/9', 'circuit': '0/9', 'date': '0/9', 'pole position': '0/9', 'winning driver': '0/9', 'winning team': '0/9', 'trophy winner': '0/9'}
Columns that match with input_schema:
 204_63.csv :  {'round': 'round', 'circuit': 'circuit', 'pole position': 'pole position', 'date': 'day', 'winning driver': 'driver'}
Coverage Score :  0.7142857142857143 	 Completeness Score :  0.7142857142857143



Unnamed: 0,round,circuit,day,pole position,driver,winning team,trophy winner
0,1,"ACI Vallelunga Circuit, Campagnano di Roma",7 April,Miloš Pavlović,Miloš Pavlović,Target Racing,Nino Famà
1,2,Misano World Circuit,28 April,Miloš Pavlović,Miloš Pavlović,Target Racing,Giovanni Faraonio
2,3,"Autodromo di Pergusa, Enna",19 May,Miloš Pavlović,Philip Cloostermans,Azeta Racing,Carmine Tancredi
3,4,Autodromo Nazionale Monza,30 June,Philip Cloostermans,Philip Cloostermans,Azeta Racing,Carmine Tancredi
4,5,"Autodromo Riccardo Paletti, Varano",21 July,Miloš Pavlović,Miloš Pavlović,Target Racing,Carmine Tancredi
5,6,"Autodromo Enzo e Dino Ferrari, Imola",1 September,Vitantonio Liuzzi,Vitantonio Liuzzi,Bertram Schäfer Racing,Silvio Alberti
6,7,"Autodromo del Levante, Binetto",8 September,Christiano Citron,Miloš Pavlović,Target Racing,Carmine Tancredi
7,8,"Mugello Circuit, Scarperia",6 October,Miloš Pavlović,Miloš Pavlović,Target Racing,Dino Lusuardi
8,9,"Autodromo dell'Umbria, Magione",20 October,Andreas Zuber,Christiano Citron,Target Racing,Alberto Morelli


204_845.csv(2003 Barber Dodge Pro Series season)
Missing Values(NANs score):  {'round': '0/10', 'circuit': '0/10', 'location': '0/10', 'date': '0/10', 'pole position': '0/10', 'fastest lap': '0/10', 'winning driver': '0/10', 'headline event': '0/10'}
Columns that match with input_schema:
 204_845.csv :  {'round': 'round', 'circuit': 'circuit', 'pole position': 'pole position', 'fastest lap': 'fastest lap', 'date': 'day', 'winning driver': 'driver'}
Coverage Score :  0.8571428571428571 	 Completeness Score :  0.8571428571428571



Unnamed: 0,round,circuit,location,day,pole position,fastest lap,driver,headline event
0,1,Albert Whitted Airport,"St. Petersburg, Florida",February 23,Leonardo Maia,Dan Di Leo,Leonardo Maia,Grand Prix of St. Petersburg
1,2,Fundidora park,Monterrey,March 23,Leonardo Maia,David Martinez,David Martinez,Monterrey Grand Prix
2,3,Milwaukee Mile,"West Allis, Wisconsin",June 1,"Victor Gonzalez, Jr.",Scott Poirier,Leonardo Maia,Milwaukee Mile Centennial 250
3,4,Mazda Raceway Laguna Seca,"Monterey, California",June 15,Memo Rojas,Leonardo Maia,Dan Di Leo,Grand Prix of Monterey
4,5,Portland International Raceway,"Portland, Oregon",June 22,Leonardo Maia,Leonardo Maia,Leonardo Maia,G.I. Joe's 200
5,6,Burke Lakefront Airport,"Cleveland, Ohio",July 5,Leonardo Maia,Leonardo Maia,Leonardo Maia,U.S. Bank Cleveland Grand Prix
6,7,Exhibition Place,Toronto,July 13,Leonardo Maia,Leonardo Maia,Memo Rojas,Molson Indy Toronto
7,8,Concord Pacific Place,Vancouver,July 27,Leonardo Maia,David Martinez,Leonardo Maia,Molson Indy Vancouver
8,9,Mid-Ohio Sports Car Course,"Lexington, Ohio",August 3,Leonardo Maia,Leonardo Maia,Leonardo Maia,Champ Car Grand Prix of Mid-Ohio
9,10,Circuit Gilles Villeneuve,Montreal,August 24,Memo Rojas,Leonardo Maia,Memo Rojas,Molson Indy Montreal



Rank  1   :  204_569.csv          completeness score : 0.9285714285714286 		number of rows:  12
Rank  2   :  204_40.csv           completeness score : 0.8571428571428571 		number of rows:  28
Rank  3   :  204_253.csv          completeness score : 0.8571428571428571 		number of rows:  26
Rank  4   :  204_845.csv          completeness score : 0.8571428571428571 		number of rows:  10
Rank  5   :  203_742.csv          completeness score : 0.7142857142857143 		number of rows:  17
Rank  6   :  204_63.csv           completeness score : 0.7142857142857143 		number of rows:  9
Rank  7   :  203_514.csv          completeness score : 0.6428571428571429 		number of rows:  12
Rank  8   :  203_181.csv          completeness score : 0.5714285714285714 		number of rows:  17
Rank  9   :  203_408.csv          completeness score : 0.5714285714285714 		number of rows:  16
Rank  10  :  204_455.csv          completeness score : 0.5714285714285714 		number of rows:  16

Ranking Complete!!



In [26]:
s='{"schema": {"round": "int64", "round 1": "object", "circuit": "object", "day": "object", "pole position": "object", "fastest lap": "object", "driver": "object"},"categories":["motorsport", "car", "seasons"]}'
s=s.lower()
s=json.loads(s)
input_schema=s['schema'];input_categories=-1;transformations=-1;stats=-1
if "categories" in s:
    input_categories=s["categories"]
if "transformations" in s:
    transformations=s["transformations"]
if "stats" in s:
    stats=s["stats"]
# display query
print("Input Schema : ",input_schema)
if input_categories==-1:
    print("Input Categories : None")
else:
    print("Input Categories : ",input_categories)
if transformations==-1:
    print("Input Transformations : None")
else:
    print("Input Transformations : ",transformations)
if stats==-1:
    print("Input Condition based conditions : None")
else:
    print("Input Content based Conditions : ",stats)
print()
comp_score={}
no_of_rows={}
no_of_rows={}
check_possible_category_included_matches()
ranking_display(comp_score,no_of_rows)

Input Schema :  {'round': 'int64', 'round 1': 'object', 'circuit': 'object', 'day': 'object', 'pole position': 'object', 'fastest lap': 'object', 'driver': 'object'}
Input Categories :  ['motorsport', 'car', 'seasons']
Input Transformations : None
Input Condition based conditions : None

Matching Tables
203_181.csv  :  1990 IndyCar season
203_514.csv  :  2008 Superleague Formula season
203_742.csv  :  1995 IndyCar season
204_253.csv  :  1990 Superbike World Championship season
204_569.csv  :  1998 Swedish Touring Car Championship season
204_63.csv  :  2002 Italian Formula Three season


203_181.csv(1990 IndyCar season)
Missing Values(NANs score):  {'date': '0/17', 'rnd': '0/17', 'race name': '0/17', 'circuit': '0/17', 'city/location': '0/17', 'pole position': '0/17', 'winning driver': '0/17', 'winning team': '0/17', 'report': '0/17'}
Columns that match with input_schema:
 203_181.csv :  {'circuit': 'circuit', 'pole position': 'pole position', 'date': 'day', 'winning driver': 'driver'}


In [45]:
# schema, transformations and content based conditions
s='{"schema": {"nationality": "object", "tonnage (grt)": "int64", "fate": "object","ship":"object"}}'
s=s.lower()
s=json.loads(s)
input_schema=s["schema"];input_categories=-1;transformations=-1;stats=-1
if "categories" in s:
    input_categories=s["categories"]
if "transformations" in s:
    transformations=s["transformations"]
if "stats" in s:
    stats=s["stats"]
# display query
print("Input Schema : ",input_schema)
if input_categories==-1:
    print("Input Categories : None")
else:
    print("Input Categories : ",input_categories)
if transformations==-1:
    print("Input Transformations : None")
else:
    print("Input Transformations : ",transformations)
if stats==-1:
    print("Input Condition based conditions : None")
else:
    print("Input Content based Conditions : ",stats)
print()
get_matches()

Input Schema :  {'nationality': 'object', 'tonnage (grt)': 'int64', 'fate': 'object', 'ship': 'object'}
Input Categories : None
Input Transformations : None
Input Condition based conditions : None

only schema
Matching Tables
202_117.csv  :  German submarine U-559
203_148.csv  :  German submarine U-9 (1935)
203_268.csv  :  German submarine U-502
204_100.csv  :  Hans-Rudolf Rosing


202_117.csv(German submarine U-559)
Missing Values(NANs score):  {'date': '0/6', 'ship': '0/6', 'nationality': '0/6', 'tonnage': '0/6', 'fate': '0/6'}
Columns that match with input_schema:
 202_117.csv :  {'ship': 'ship', 'nationality': 'nationality', 'fate': 'fate', 'tonnage': 'tonnage (grt)'}
Coverage Score :  1.0 	 Completeness Score :  1.0



Unnamed: 0,date,ship,nationality,tonnage (grt),fate
0,19 August 1941,SS Aguila,United Kingdom,3255,Sunk
1,27 November 1941,HMAS Parramatta,Royal Australian Navy,1060,Sunk
2,23 December 1941,SS Shuntien,United Kingdom,3059,Sunk
3,26 December 1941,SS Warszawa,Poland,2487,Sunk
4,10 June 1942,MV Athene,Norway,4681,Sunk
5,10 June 1942,SS Brambleleaf,United Kingdom,5917,Damaged


203_148.csv(German submarine U-9 (1935))
Missing Values(NANs score):  {'date': '0/9', 'name': '0/9', 'nationality': '0/9', 'tonnage (grt)': '0/9', 'fate': '0/9'}
Columns that match with input_schema:
 203_148.csv :  {'nationality': 'nationality', 'tonnage (grt)': 'tonnage (grt)', 'fate': 'fate'}
Coverage Score :  0.75 	 Completeness Score :  0.75



Unnamed: 0,date,name,nationality,tonnage (grt),fate
0,18 January 1940,Flandria,Sweden,1179,Sunk
1,19 January 1940,Patria,Sweden,1188,Sunk
2,11 February 1940,Linda,Estonia,1213,Sunk
3,4 May 1940,San Tiburcio,United Kingdom,5995,Sunk (mine)
4,9 May 1940,Doris,French Navy,552,Sunk
5,11 May 1940,Tringa,United Kingdom,1930,Sunk
6,11 May 1940,Viiu,Estonia,1908,Sunk
7,23 May 1940,Sigurd Faulbaum,Belgium,3256,Sunk
8,11 May 1944,Shtorm,Soviet Union,412,Damaged


203_268.csv(German submarine U-502)
Missing Values(NANs score):  {'date': '0/16', 'name': '0/16', 'nationality': '0/16', 'tonnage (grt)': '0/16', 'fate': '0/16'}
Columns that match with input_schema:
 203_268.csv :  {'nationality': 'nationality', 'tonnage (grt)': 'tonnage (grt)', 'fate': 'fate'}
Coverage Score :  0.75 	 Completeness Score :  0.75



Unnamed: 0,date,name,nationality,tonnage (grt),fate
0,7 October 1941,Svend Foyn,United Kingdom,14795,Damaged
1,16 February 1942,Monagas,Venezuela,2650,Sunk
2,16 February 1942,San Nicholas,United Kingdom,2391,Sunk
3,16 February 1942,Tia Juana,United Kingdom,2395,Sunk
4,22 February 1942,J.N.Pew,United States,9033,Sunk
5,23 February 1942,Sun,United States,9002,Damaged
6,23 February 1942,Thalia,Panama,8329,Sunk
7,11 May 1942,Cape of Good Hope,United Kingdom,4963,Sunk
8,24 May 1942,Gonçalves Dias,Brazil,4996,Sunk
9,28 May 1942,Alcoa Pilgrim,United States,6759,Sunk


204_100.csv(Hans-Rudolf Rosing)
Missing Values(NANs score):  {'date': '0/13', 'name of ship': '0/13', 'nationality': '0/13', 'tonnage': '0/13', 'fate': '0/13'}
Columns that match with input_schema:
 204_100.csv :  {'nationality': 'nationality', 'fate': 'fate', 'tonnage': 'tonnage (grt)', 'name of ship': 'ship'}
Coverage Score :  1.0 	 Completeness Score :  1.0



Unnamed: 0,date,ship,nationality,tonnage (grt),fate
0,5 June 1940,SS Stancor,United Kingdom,798,Sunk at 58°48′N 08°45′W﻿ / ﻿58.800°N 8.750°W
1,7 June 1940,SS Frances Massey,United Kingdom,4212,Sunk at 55°33′N 08°26′W﻿ / ﻿55.550°N 8.433°W
2,7 June 1940,SS Eros,United Kingdom,5888,Damaged at 55°33′N 08°26′W﻿ / ﻿55.550°N 8.433°W
3,11 June 1940,SS Violando N Goulandris,Greece,2375,Sunk at 44°04′N 12°30′W﻿ / ﻿44.067°N 12.500°W
4,19 June 1940,MV Tudor,Norway,6607,Sunk at 45°10′N 11°50′W﻿ / ﻿45.167°N 11.833°W
5,19 June 1940,SS Baron Loudoun,United Kingdom,3164,Sunk at 45°00′N 11°21′W﻿ / ﻿45.000°N 11.350°W
6,19 June 1940,SS British Monarch,United Kingdom,5661,Sunk at 45°00′N 11°21′W﻿ / ﻿45.000°N 11.350°W
7,20 June 1940,MV Moerdrecht,Netherlands,7493,Sunk at 43°34′N 14°20′W﻿ / ﻿43.567°N 14.333°W
8,16 August 1940,SS Hedrun,Sweden,2325,Sunk at 57°10′N 16°37′W﻿ / ﻿57.167°N 16.617°W
9,19 August 1940,SS Ville de Gand,Belgium,7590,Sunk at 55°28′N 15°10′W﻿ / ﻿55.467°N 15.167°W



1.csv : 202_117.csv		203_148.csv		203_268.csv		
Missing Values(NANs):  {'date': '0/31', 'ship': '25/31', 'nationality': '0/31', 'tonnage (grt)': '0/31', 'fate': '0/31', 'name': '6/31'}
Columns that match with input_schema: 
202_117.csv :  {'ship': 'ship', 'nationality': 'nationality', 'fate': 'fate', 'tonnage': 'tonnage (grt)'}
203_148.csv :  {'nationality': 'nationality', 'tonnage (grt)': 'tonnage (grt)', 'fate': 'fate'}
203_268.csv :  {'nationality': 'nationality', 'tonnage (grt)': 'tonnage (grt)', 'fate': 'fate'}
Coverage Score :  1.0 	 Completeness Score :  0.7983870967741935


Unnamed: 0,date,ship,nationality,tonnage (grt),fate,name
0,19 August 1941,SS Aguila,United Kingdom,3255,Sunk,
1,27 November 1941,HMAS Parramatta,Royal Australian Navy,1060,Sunk,
2,23 December 1941,SS Shuntien,United Kingdom,3059,Sunk,
3,26 December 1941,SS Warszawa,Poland,2487,Sunk,
4,10 June 1942,MV Athene,Norway,4681,Sunk,
5,10 June 1942,SS Brambleleaf,United Kingdom,5917,Damaged,
6,18 January 1940,,Sweden,1179,Sunk,Flandria
7,19 January 1940,,Sweden,1188,Sunk,Patria
8,11 February 1940,,Estonia,1213,Sunk,Linda
9,4 May 1940,,United Kingdom,5995,Sunk (mine),San Tiburcio



2.csv : 202_117.csv		203_148.csv		
Missing Values(NANs):  {'date': '0/15', 'ship': '9/15', 'nationality': '0/15', 'tonnage (grt)': '0/15', 'fate': '0/15'}
Columns that match with input_schema: 
202_117.csv :  {'ship': 'ship', 'nationality': 'nationality', 'fate': 'fate', 'tonnage': 'tonnage (grt)'}
203_148.csv :  {'nationality': 'nationality', 'tonnage (grt)': 'tonnage (grt)', 'fate': 'fate'}
Coverage Score :  1.0 	 Completeness Score :  0.85


Unnamed: 0,date,ship,nationality,tonnage (grt),fate
0,19 August 1941,SS Aguila,United Kingdom,3255,Sunk
1,27 November 1941,HMAS Parramatta,Royal Australian Navy,1060,Sunk
2,23 December 1941,SS Shuntien,United Kingdom,3059,Sunk
3,26 December 1941,SS Warszawa,Poland,2487,Sunk
4,10 June 1942,MV Athene,Norway,4681,Sunk
5,10 June 1942,SS Brambleleaf,United Kingdom,5917,Damaged
6,18 January 1940,,Sweden,1179,Sunk
7,19 January 1940,,Sweden,1188,Sunk
8,11 February 1940,,Estonia,1213,Sunk
9,4 May 1940,,United Kingdom,5995,Sunk (mine)



3.csv : 202_117.csv		203_268.csv		
Missing Values(NANs):  {'date': '0/22', 'ship': '16/22', 'nationality': '0/22', 'tonnage (grt)': '0/22', 'fate': '0/22', 'name': '6/22'}
Columns that match with input_schema: 
202_117.csv :  {'ship': 'ship', 'nationality': 'nationality', 'fate': 'fate', 'tonnage': 'tonnage (grt)'}
203_268.csv :  {'nationality': 'nationality', 'tonnage (grt)': 'tonnage (grt)', 'fate': 'fate'}
Coverage Score :  1.0 	 Completeness Score :  0.8181818181818181


Unnamed: 0,date,ship,nationality,tonnage (grt),fate,name
0,19 August 1941,SS Aguila,United Kingdom,3255,Sunk,
1,27 November 1941,HMAS Parramatta,Royal Australian Navy,1060,Sunk,
2,23 December 1941,SS Shuntien,United Kingdom,3059,Sunk,
3,26 December 1941,SS Warszawa,Poland,2487,Sunk,
4,10 June 1942,MV Athene,Norway,4681,Sunk,
5,10 June 1942,SS Brambleleaf,United Kingdom,5917,Damaged,
6,7 October 1941,,United Kingdom,14795,Damaged,Svend Foyn
7,16 February 1942,,Venezuela,2650,Sunk,Monagas
8,16 February 1942,,United Kingdom,2391,Sunk,San Nicholas
9,16 February 1942,,United Kingdom,2395,Sunk,Tia Juana



4.csv : 203_148.csv		203_268.csv		
Missing Values(NANs):  {'date': '0/25', 'name': '0/25', 'nationality': '0/25', 'tonnage (grt)': '0/25', 'fate': '0/25'}
Columns that match with input_schema: 
203_148.csv :  {'nationality': 'nationality', 'tonnage (grt)': 'tonnage (grt)', 'fate': 'fate'}
203_268.csv :  {'nationality': 'nationality', 'tonnage (grt)': 'tonnage (grt)', 'fate': 'fate'}
Coverage Score :  0.75 	 Completeness Score :  0.75


Unnamed: 0,date,name,nationality,tonnage (grt),fate
0,18 January 1940,Flandria,Sweden,1179,Sunk
1,19 January 1940,Patria,Sweden,1188,Sunk
2,11 February 1940,Linda,Estonia,1213,Sunk
3,4 May 1940,San Tiburcio,United Kingdom,5995,Sunk (mine)
4,9 May 1940,Doris,French Navy,552,Sunk
5,11 May 1940,Tringa,United Kingdom,1930,Sunk
6,11 May 1940,Viiu,Estonia,1908,Sunk
7,23 May 1940,Sigurd Faulbaum,Belgium,3256,Sunk
8,11 May 1944,Shtorm,Soviet Union,412,Damaged
9,7 October 1941,Svend Foyn,United Kingdom,14795,Damaged




Rank  1   :  204_100.csv          completeness score : 1.0000000000000000 		number of rows:  13
Rank  2   :  202_117.csv          completeness score : 1.0000000000000000 		number of rows:  6
Rank  3   :  output_2.csv         completeness score : 0.8500000000000000 		number of rows:  15
Rank  4   :  output_3.csv         completeness score : 0.8181818181818181 		number of rows:  22
Rank  5   :  output_1.csv         completeness score : 0.7983870967741935 		number of rows:  31
Rank  6   :  output_4.csv         completeness score : 0.7500000000000000 		number of rows:  25
Rank  7   :  203_268.csv          completeness score : 0.7500000000000000 		number of rows:  16
Rank  8   :  203_148.csv          completeness score : 0.7500000000000000 		number of rows:  9

Ranking Complete!!



In [52]:
# schema, transformations and content based conditions
s='{"schema": {"nationality": "object", "tonnage (grt)": "int64", "fate": "object","ship":"object"}}'
s=s.lower()
s=json.loads(s)
input_schema=s["schema"];input_categories=-1;transformations=-1;stats=-1
if "categories" in s:
    input_categories=s["categories"]
if "transformations" in s:
    transformations=s["transformations"]
if "stats" in s:
    stats=s["stats"]
# display query
print("Input Schema : ",input_schema)
if input_categories==-1:
    print("Input Categories : None")
else:
    print("Input Categories : ",input_categories)
if transformations==-1:
    print("Input Transformations : None")
else:
    print("Input Transformations : ",transformations)
if stats==-1:
    print("Input Condition based conditions : None")
else:
    print("Input Content based Conditions : ",stats)
print()
get_matches(kg=1)

Input Schema :  {'nationality': 'object', 'tonnage (grt)': 'int64', 'fate': 'object', 'ship': 'object'}
Input Categories : None
Input Transformations : None
Input Condition based conditions : None

only schema
Matching Tables
202_117.csv  :  German submarine U-559
203_148.csv  :  German submarine U-9 (1935)
203_268.csv  :  German submarine U-502
204_100.csv  :  Hans-Rudolf Rosing


202_117.csv(German submarine U-559)
Missing Values(NANs score):  {'date': '0/6', 'ship': '0/6', 'nationality': '0/6', 'tonnage': '0/6', 'fate': '0/6'}
Columns that match with input_schema:
 202_117.csv :  {'ship': 'ship', 'nationality': 'nationality', 'fate': 'fate', 'tonnage': 'tonnage (grt)'}
Coverage Score :  1.0 	 Completeness Score :  1.0



Unnamed: 0,date,ship,nationality,tonnage (grt),fate
0,19 August 1941,SS Aguila,United Kingdom,3255,Sunk
1,27 November 1941,HMAS Parramatta,Royal Australian Navy,1060,Sunk
2,23 December 1941,SS Shuntien,United Kingdom,3059,Sunk
3,26 December 1941,SS Warszawa,Poland,2487,Sunk
4,10 June 1942,MV Athene,Norway,4681,Sunk
5,10 June 1942,SS Brambleleaf,United Kingdom,5917,Damaged


203_148.csv(German submarine U-9 (1935))
Missing Values(NANs score):  {'date': '0/9', 'name': '0/9', 'nationality': '0/9', 'tonnage (grt)': '0/9', 'fate': '0/9'}
Columns that match with input_schema:
 203_148.csv :  {'nationality': 'nationality', 'tonnage (grt)': 'tonnage (grt)', 'fate': 'fate'}
Coverage Score :  0.75 	 Completeness Score :  0.75



Unnamed: 0,date,name,nationality,tonnage (grt),fate
0,18 January 1940,Flandria,Sweden,1179,Sunk
1,19 January 1940,Patria,Sweden,1188,Sunk
2,11 February 1940,Linda,Estonia,1213,Sunk
3,4 May 1940,San Tiburcio,United Kingdom,5995,Sunk (mine)
4,9 May 1940,Doris,French Navy,552,Sunk
5,11 May 1940,Tringa,United Kingdom,1930,Sunk
6,11 May 1940,Viiu,Estonia,1908,Sunk
7,23 May 1940,Sigurd Faulbaum,Belgium,3256,Sunk
8,11 May 1944,Shtorm,Soviet Union,412,Damaged


203_268.csv(German submarine U-502)
Missing Values(NANs score):  {'date': '0/16', 'name': '0/16', 'nationality': '0/16', 'tonnage (grt)': '0/16', 'fate': '0/16'}
Columns that match with input_schema:
 203_268.csv :  {'nationality': 'nationality', 'tonnage (grt)': 'tonnage (grt)', 'fate': 'fate'}
Coverage Score :  0.75 	 Completeness Score :  0.75



Unnamed: 0,date,name,nationality,tonnage (grt),fate
0,7 October 1941,Svend Foyn,United Kingdom,14795,Damaged
1,16 February 1942,Monagas,Venezuela,2650,Sunk
2,16 February 1942,San Nicholas,United Kingdom,2391,Sunk
3,16 February 1942,Tia Juana,United Kingdom,2395,Sunk
4,22 February 1942,J.N.Pew,United States,9033,Sunk
5,23 February 1942,Sun,United States,9002,Damaged
6,23 February 1942,Thalia,Panama,8329,Sunk
7,11 May 1942,Cape of Good Hope,United Kingdom,4963,Sunk
8,24 May 1942,Gonçalves Dias,Brazil,4996,Sunk
9,28 May 1942,Alcoa Pilgrim,United States,6759,Sunk


204_100.csv(Hans-Rudolf Rosing)
Missing Values(NANs score):  {'date': '0/13', 'name of ship': '0/13', 'nationality': '0/13', 'tonnage': '0/13', 'fate': '0/13'}
Columns that match with input_schema:
 204_100.csv :  {'nationality': 'nationality', 'fate': 'fate', 'tonnage': 'tonnage (grt)', 'name of ship': 'ship'}
Coverage Score :  1.0 	 Completeness Score :  1.0



Unnamed: 0,date,ship,nationality,tonnage (grt),fate
0,5 June 1940,SS Stancor,United Kingdom,798,Sunk at 58°48′N 08°45′W﻿ / ﻿58.800°N 8.750°W
1,7 June 1940,SS Frances Massey,United Kingdom,4212,Sunk at 55°33′N 08°26′W﻿ / ﻿55.550°N 8.433°W
2,7 June 1940,SS Eros,United Kingdom,5888,Damaged at 55°33′N 08°26′W﻿ / ﻿55.550°N 8.433°W
3,11 June 1940,SS Violando N Goulandris,Greece,2375,Sunk at 44°04′N 12°30′W﻿ / ﻿44.067°N 12.500°W
4,19 June 1940,MV Tudor,Norway,6607,Sunk at 45°10′N 11°50′W﻿ / ﻿45.167°N 11.833°W
5,19 June 1940,SS Baron Loudoun,United Kingdom,3164,Sunk at 45°00′N 11°21′W﻿ / ﻿45.000°N 11.350°W
6,19 June 1940,SS British Monarch,United Kingdom,5661,Sunk at 45°00′N 11°21′W﻿ / ﻿45.000°N 11.350°W
7,20 June 1940,MV Moerdrecht,Netherlands,7493,Sunk at 43°34′N 14°20′W﻿ / ﻿43.567°N 14.333°W
8,16 August 1940,SS Hedrun,Sweden,2325,Sunk at 57°10′N 16°37′W﻿ / ﻿57.167°N 16.617°W
9,19 August 1940,SS Ville de Gand,Belgium,7590,Sunk at 55°28′N 15°10′W﻿ / ﻿55.467°N 15.167°W



1
1.csv : 202_117.csv		203_148.csv		203_268.csv		
Missing Values(NANs):  {'date': '0/31', 'ship': '0/31', 'nationality': '0/31', 'tonnage (grt)': '0/31', 'fate': '0/31'}
Columns that match with input_schema: 
202_117.csv :  {'ship': 'ship', 'nationality': 'nationality', 'fate': 'fate', 'tonnage': 'tonnage (grt)'}
203_148.csv :  {'nationality': 'nationality', 'tonnage (grt)': 'tonnage (grt)', 'fate': 'fate'}
203_268.csv :  {'nationality': 'nationality', 'tonnage (grt)': 'tonnage (grt)', 'fate': 'fate'}
Coverage Score :  1.0 	 Completeness Score :  1.0


Unnamed: 0,date,ship,nationality,tonnage (grt),fate
0,19 August 1941,SS Aguila,United Kingdom,3255,Sunk
1,27 November 1941,HMAS Parramatta,Royal Australian Navy,1060,Sunk
2,23 December 1941,SS Shuntien,United Kingdom,3059,Sunk
3,26 December 1941,SS Warszawa,Poland,2487,Sunk
4,10 June 1942,MV Athene,Norway,4681,Sunk
5,10 June 1942,SS Brambleleaf,United Kingdom,5917,Damaged
6,18 January 1940,Flandria,Sweden,1179,Sunk
7,19 January 1940,Patria,Sweden,1188,Sunk
8,11 February 1940,Linda,Estonia,1213,Sunk
9,4 May 1940,San Tiburcio,United Kingdom,5995,Sunk (mine)



2
2.csv : 202_117.csv		203_148.csv		
Missing Values(NANs):  {'date': '0/15', 'ship': '0/15', 'nationality': '0/15', 'tonnage (grt)': '0/15', 'fate': '0/15'}
Columns that match with input_schema: 
202_117.csv :  {'ship': 'ship', 'nationality': 'nationality', 'fate': 'fate', 'tonnage': 'tonnage (grt)'}
203_148.csv :  {'nationality': 'nationality', 'tonnage (grt)': 'tonnage (grt)', 'fate': 'fate'}
Coverage Score :  1.0 	 Completeness Score :  1.0


Unnamed: 0,date,ship,nationality,tonnage (grt),fate
0,19 August 1941,SS Aguila,United Kingdom,3255,Sunk
1,27 November 1941,HMAS Parramatta,Royal Australian Navy,1060,Sunk
2,23 December 1941,SS Shuntien,United Kingdom,3059,Sunk
3,26 December 1941,SS Warszawa,Poland,2487,Sunk
4,10 June 1942,MV Athene,Norway,4681,Sunk
5,10 June 1942,SS Brambleleaf,United Kingdom,5917,Damaged
6,18 January 1940,Flandria,Sweden,1179,Sunk
7,19 January 1940,Patria,Sweden,1188,Sunk
8,11 February 1940,Linda,Estonia,1213,Sunk
9,4 May 1940,San Tiburcio,United Kingdom,5995,Sunk (mine)



3
3.csv : 202_117.csv		203_268.csv		
Missing Values(NANs):  {'date': '0/22', 'ship': '0/22', 'nationality': '0/22', 'tonnage (grt)': '0/22', 'fate': '0/22'}
Columns that match with input_schema: 
202_117.csv :  {'ship': 'ship', 'nationality': 'nationality', 'fate': 'fate', 'tonnage': 'tonnage (grt)'}
203_268.csv :  {'nationality': 'nationality', 'tonnage (grt)': 'tonnage (grt)', 'fate': 'fate'}
Coverage Score :  1.0 	 Completeness Score :  1.0


Unnamed: 0,date,ship,nationality,tonnage (grt),fate
0,19 August 1941,SS Aguila,United Kingdom,3255,Sunk
1,27 November 1941,HMAS Parramatta,Royal Australian Navy,1060,Sunk
2,23 December 1941,SS Shuntien,United Kingdom,3059,Sunk
3,26 December 1941,SS Warszawa,Poland,2487,Sunk
4,10 June 1942,MV Athene,Norway,4681,Sunk
5,10 June 1942,SS Brambleleaf,United Kingdom,5917,Damaged
6,7 October 1941,Svend Foyn,United Kingdom,14795,Damaged
7,16 February 1942,Monagas,Venezuela,2650,Sunk
8,16 February 1942,San Nicholas,United Kingdom,2391,Sunk
9,16 February 1942,Tia Juana,United Kingdom,2395,Sunk



4
4.csv : 203_148.csv		203_268.csv		
Missing Values(NANs):  {'date': '0/25', 'name': '0/25', 'nationality': '0/25', 'tonnage (grt)': '0/25', 'fate': '0/25'}
Columns that match with input_schema: 
203_148.csv :  {'nationality': 'nationality', 'tonnage (grt)': 'tonnage (grt)', 'fate': 'fate'}
203_268.csv :  {'nationality': 'nationality', 'tonnage (grt)': 'tonnage (grt)', 'fate': 'fate'}
Coverage Score :  0.75 	 Completeness Score :  0.75


Unnamed: 0,date,name,nationality,tonnage (grt),fate
0,18 January 1940,Flandria,Sweden,1179,Sunk
1,19 January 1940,Patria,Sweden,1188,Sunk
2,11 February 1940,Linda,Estonia,1213,Sunk
3,4 May 1940,San Tiburcio,United Kingdom,5995,Sunk (mine)
4,9 May 1940,Doris,French Navy,552,Sunk
5,11 May 1940,Tringa,United Kingdom,1930,Sunk
6,11 May 1940,Viiu,Estonia,1908,Sunk
7,23 May 1940,Sigurd Faulbaum,Belgium,3256,Sunk
8,11 May 1944,Shtorm,Soviet Union,412,Damaged
9,7 October 1941,Svend Foyn,United Kingdom,14795,Damaged




Rank  1   :  output_1.csv         completeness score : 1.0000000000000000 		number of rows:  31
Rank  2   :  output_3.csv         completeness score : 1.0000000000000000 		number of rows:  22
Rank  3   :  output_2.csv         completeness score : 1.0000000000000000 		number of rows:  15
Rank  4   :  204_100.csv          completeness score : 1.0000000000000000 		number of rows:  13
Rank  5   :  202_117.csv          completeness score : 1.0000000000000000 		number of rows:  6
Rank  6   :  output_4.csv         completeness score : 0.7500000000000000 		number of rows:  25
Rank  7   :  203_268.csv          completeness score : 0.7500000000000000 		number of rows:  16
Rank  8   :  203_148.csv          completeness score : 0.7500000000000000 		number of rows:  9

Ranking Complete!!



In [29]:
x={"filename": "204_649.csv", "tablename": "List of tallest buildings in Cleveland", "schema": {"rank": "object", "name": "object", "height ft (m)": "object", "floors": "int64", "year": "int64", "notes": "object"}, "categories": ["ohio", "tallest", "city", "united", "clevelandrelated", "structures", "lists", "buildings", "states", "cleveland", "skyscrapers"]}
print(x["filename"])
print(x["tablename"])
print(x["schema"])
print(x["categories"])

204_649.csv
List of tallest buildings in Cleveland
{'rank': 'object', 'name': 'object', 'height ft (m)': 'object', 'floors': 'int64', 'year': 'int64', 'notes': 'object'}
['ohio', 'tallest', 'city', 'united', 'clevelandrelated', 'structures', 'lists', 'buildings', 'states', 'cleveland', 'skyscrapers']


In [64]:
s='{"schema": {"rank": "object", "name": "object", "height": "object", "no of floors": "int64", "year": "int64"}}'
s=s.lower()
s=json.loads(s)
input_schema=s['schema'];input_categories=-1;transformations=-1;stats=-1
if "categories" in s:
    input_categories=s["categories"]
if "transformations" in s:
    transformations=s["transformations"]
if "stats" in s:
    stats=s["stats"]
# display query
print("Input Schema : ",input_schema)
if input_categories==-1:
    print("Input Categories : None")
else:
    print("Input Categories : ",input_categories)
if transformations==-1:
    print("Input Transformations : None")
else:
    print("Input Transformations : ",transformations)
if stats==-1:
    print("Input Condition based conditions : None")
else:
    print("Input Content based Conditions : ",stats)
print()
comp_score={}
no_of_rows={}
no_of_rows={}
get_matches(1)

Input Schema :  {'rank': 'object', 'name': 'object', 'height': 'object', 'no of floors': 'int64', 'year': 'int64'}
Input Categories : None
Input Transformations : None
Input Condition based conditions : None

only schema
Matching Tables
203_837.csv  :  List of tallest buildings in Columbus, Ohio
203_84.csv  :  List of tallest buildings in Oakland, California
204_541.csv  :  List of tallest buildings in Kanpur
204_649.csv  :  List of tallest buildings in Cleveland


203_837.csv(List of tallest buildings in Columbus, Ohio)
Missing Values(NANs score):  {'rank': '0/30', 'name': '0/30', 'height ft / m': '0/30', 'floors': '0/30', 'year': '0/30', 'notes': '24/30'}
Columns that match with input_schema:
 203_837.csv :  {'rank': 'rank', 'name': 'name', 'year': 'year', 'height ft / m': 'height', 'floors': 'no of floors'}
Coverage Score :  1.0 	 Completeness Score :  1.0



Unnamed: 0,rank,name,height,no of floors,year,notes
0,1,Rhodes State Office Tower,629 / 192,41,1973,Has been the tallest building in Columbus and ...
1,2,LeVeque Tower,555 / 169,47,1927,Tallest building constructed in Columbus in th...
2,3,William Green Building,530 / 162,33,1990,Tallest building constructed in Columbus in th...
3,4,Huntington Center,512 / 156,37,1984,Tallest building constructed in Columbus in th...
4,5,Vern Riffe State Office Tower,503 / 153,32,1988,
5,6,One Nationwide Plaza,485 / 148,40,1976,
6,7,Franklin County Courthouse,464 / 141,27,1991,
7,8,AEP Building,456 / 139,31,1983,
8,9,Borden Building,438 / 134,34,1974,
9,10,Three Nationwide Plaza,408 / 124,27,1989,


203_84.csv(List of tallest buildings in Oakland, California)
Missing Values(NANs score):  {'rank': '0/21', 'name': '0/21', 'height ft (m)': '0/21', 'floors': '0/21', 'year': '2/21', 'coordinates': '0/21', 'notes': '14/21'}
Columns that match with input_schema:
 203_84.csv :  {'rank': 'rank', 'name': 'name', 'height ft (m)': 'height', 'floors': 'no of floors'}
Coverage Score :  1.0 	 Completeness Score :  0.980952380952381



Unnamed: 0,rank,name,height,no of floors,year,coordinates,notes
0,1,Ordway Building,404 (123),28,1970.0,37°48′36″N 122°15′51″W﻿ / ﻿37.81000°N 122.26417°W,Tallest building in both Oakland and in the Ba...
1,2,Kaiser Center,390 (119),28,1960.0,37°48′32″N 122°15′52″W﻿ / ﻿37.80889°N 122.26444°W,Tallest building constructed in the 1960s
2,3,Lake Merritt Plaza,371 (113),27,1988.0,37°48′27″N 122°15′54″W﻿ / ﻿37.80750°N 122.26500°W,Tallest building constructed in the 1980s
3,4,1111 Broadway,360 (110),24,1990.0,37°48′10″N 122°16′22″W﻿ / ﻿37.80278°N 122.27278°W,Tallest building constructed in the 1990s
4,5,Kaiser Engineering Building,336 (102),25,1984.0,37°48′21″N 122°15′54″W﻿ / ﻿37.80583°N 122.26500°W,
5,6,Clorox Building,330 (101),24,1976.0,37°48′12″N 122°16′20″W﻿ / ﻿37.80333°N 122.27222°W,
6,7=,EM Harris State Office Building,328 (100),22,1998.0,37°48′23″N 122°16′24″W﻿ / ﻿37.80639°N 122.27333°W,
7,7=,Ronald V. Dellums Federal Building South,328 (100),18,1994.0,37°48′16″N 122°16′29″W﻿ / ﻿37.80444°N 122.27472°W,
8,7=,Ronald V. Dellums Federal Building North,328 (100),18,1994.0,37°48′18″N 122°16′28″W﻿ / ﻿37.80500°N 122.27444°W,
9,10,Oakland City Hall,320 (98),14,1914.0,37°48′19″N 122°16′21″W﻿ / ﻿37.80528°N 122.27250°W,Tallest building constructed in the 1910s; Nat...


204_541.csv(List of tallest buildings in Kanpur)
Missing Values(NANs score):  {'name': '0/7', 'area': '0/7', 'height': '0/7', 'floors': '0/7', 'year': '0/7'}
Columns that match with input_schema:
 204_541.csv :  {'name': 'name', 'height': 'height', 'year': 'year', 'floors': 'no of floors'}
Coverage Score :  0.8 	 Completeness Score :  0.8



Unnamed: 0,name,area,height,no of floors,year
0,Ratan Planet,"Opp. ALIMCO, Naramau, G.T Road, Kalianpur",90 metres (295 ft),16,2013
1,The Landmark Hotel,"Near Navin Market, Mall Road, Downtown",80 metres (262 ft),14,2000
2,Ratan Orbit,"Mukherjee Vihar, Kalianpur",70 metres (230 ft),14,2011
3,BSNL Tower,"Opp. Resrve Bank of India, Mall Road, Downtown",65 metres (213 ft),15,2001
4,Som Business Square (EL Mart),"Mega Mall Crossing, The Mall, Downtown",80 metres (262 ft),15,2005
5,Akashganga Heights,"City Bypass, Chakeri",60 metres (197 ft),15,1995
6,Krishna Tower,"Opp. Green Park Stadium, Parwati Bagla Road, C...",48 metres (157 ft),10,2003


204_649.csv(List of tallest buildings in Cleveland)
Missing Values(NANs score):  {'rank': '0/32', 'name': '0/32', 'height ft (m)': '0/32', 'floors': '0/32', 'year': '0/32', 'notes': '15/32'}
Columns that match with input_schema:
 204_649.csv :  {'rank': 'rank', 'name': 'name', 'year': 'year', 'height ft (m)': 'height', 'floors': 'no of floors'}
Coverage Score :  1.0 	 Completeness Score :  1.0



Unnamed: 0,rank,name,height,no of floors,year,notes
0,1,Key Tower,947 (289),57,1991,104th-tallest building in the world 20th-talle...
1,2,Terminal Tower,723 (220),52,1930,114th-tallest building in the United States St...
2,3,200 Public Square,658 (201),45,1985,Also known as the BP Building Tallest building...
3,4,Tower at Erieview,529 (161),40,1964,Tallest building constructed in Cleveland in t...
4,5,One Cleveland Center,450 (137),31,1983,
5,6,Fifth Third Center,446 (136),27,1992,
6,7,Federal Court House Tower,430 (131),23,2002,Tallest building constructed in the city in th...
7,8,Justice Center Complex,420 (128),26,1977,Tallest building constructed in the city in th...
8,9,Anthony J. Celebrezze Federal Building,419 (128),31,1967,
9,10,PNC Center,410 (125),35,1980,Originally known as the National City Center; ...



1.csv : 203_837.csv		203_84.csv		204_541.csv		204_649.csv		
Missing Values(NANs):  {'rank': '7/90', 'name': '0/90', 'height': '0/90', 'no of floors': '0/90', 'year': '2/90'}
Columns that match with input_schema: 
203_837.csv :  {'rank': 'rank', 'name': 'name', 'year': 'year', 'height ft / m': 'height', 'floors': 'no of floors'}
203_84.csv :  {'rank': 'rank', 'name': 'name', 'height ft (m)': 'height', 'floors': 'no of floors'}
204_541.csv :  {'name': 'name', 'height': 'height', 'year': 'year', 'floors': 'no of floors'}
204_649.csv :  {'rank': 'rank', 'name': 'name', 'year': 'year', 'height ft (m)': 'height', 'floors': 'no of floors'}
Coverage Score :  1.0 	 Completeness Score :  0.9800000000000001


Unnamed: 0,rank,name,height,no of floors,year
0,1,Rhodes State Office Tower,629 / 192,41,1973.0
1,2,LeVeque Tower,555 / 169,47,1927.0
2,3,William Green Building,530 / 162,33,1990.0
3,4,Huntington Center,512 / 156,37,1984.0
4,5,Vern Riffe State Office Tower,503 / 153,32,1988.0
5,6,One Nationwide Plaza,485 / 148,40,1976.0
6,7,Franklin County Courthouse,464 / 141,27,1991.0
7,8,AEP Building,456 / 139,31,1983.0
8,9,Borden Building,438 / 134,34,1974.0
9,10,Three Nationwide Plaza,408 / 124,27,1989.0



2.csv : 203_837.csv		203_84.csv		204_541.csv		
Missing Values(NANs):  {'rank': '7/58', 'name': '0/58', 'height': '0/58', 'no of floors': '0/58', 'year': '2/58'}
Columns that match with input_schema: 
203_837.csv :  {'rank': 'rank', 'name': 'name', 'year': 'year', 'height ft / m': 'height', 'floors': 'no of floors'}
203_84.csv :  {'rank': 'rank', 'name': 'name', 'height ft (m)': 'height', 'floors': 'no of floors'}
204_541.csv :  {'name': 'name', 'height': 'height', 'year': 'year', 'floors': 'no of floors'}
Coverage Score :  1.0 	 Completeness Score :  0.9689655172413794


Unnamed: 0,rank,name,height,no of floors,year
0,1,Rhodes State Office Tower,629 / 192,41,1973.0
1,2,LeVeque Tower,555 / 169,47,1927.0
2,3,William Green Building,530 / 162,33,1990.0
3,4,Huntington Center,512 / 156,37,1984.0
4,5,Vern Riffe State Office Tower,503 / 153,32,1988.0
5,6,One Nationwide Plaza,485 / 148,40,1976.0
6,7,Franklin County Courthouse,464 / 141,27,1991.0
7,8,AEP Building,456 / 139,31,1983.0
8,9,Borden Building,438 / 134,34,1974.0
9,10,Three Nationwide Plaza,408 / 124,27,1989.0



3.csv : 203_837.csv		203_84.csv		204_649.csv		
Missing Values(NANs):  {'rank': '0/83', 'name': '0/83', 'height': '0/83', 'no of floors': '0/83', 'year': '2/83'}
Columns that match with input_schema: 
203_837.csv :  {'rank': 'rank', 'name': 'name', 'year': 'year', 'height ft / m': 'height', 'floors': 'no of floors'}
203_84.csv :  {'rank': 'rank', 'name': 'name', 'height ft (m)': 'height', 'floors': 'no of floors'}
204_649.csv :  {'rank': 'rank', 'name': 'name', 'year': 'year', 'height ft (m)': 'height', 'floors': 'no of floors'}
Coverage Score :  1.0 	 Completeness Score :  0.9951807228915662


Unnamed: 0,rank,name,height,no of floors,year
0,1,Rhodes State Office Tower,629 / 192,41,1973.0
1,2,LeVeque Tower,555 / 169,47,1927.0
2,3,William Green Building,530 / 162,33,1990.0
3,4,Huntington Center,512 / 156,37,1984.0
4,5,Vern Riffe State Office Tower,503 / 153,32,1988.0
5,6,One Nationwide Plaza,485 / 148,40,1976.0
6,7,Franklin County Courthouse,464 / 141,27,1991.0
7,8,AEP Building,456 / 139,31,1983.0
8,9,Borden Building,438 / 134,34,1974.0
9,10,Three Nationwide Plaza,408 / 124,27,1989.0



4.csv : 203_837.csv		204_541.csv		204_649.csv		
Missing Values(NANs):  {'rank': '7/69', 'name': '0/69', 'height': '0/69', 'no of floors': '0/69', 'year': '0/69'}
Columns that match with input_schema: 
203_837.csv :  {'rank': 'rank', 'name': 'name', 'year': 'year', 'height ft / m': 'height', 'floors': 'no of floors'}
204_541.csv :  {'name': 'name', 'height': 'height', 'year': 'year', 'floors': 'no of floors'}
204_649.csv :  {'rank': 'rank', 'name': 'name', 'year': 'year', 'height ft (m)': 'height', 'floors': 'no of floors'}
Coverage Score :  1.0 	 Completeness Score :  0.9797101449275363


Unnamed: 0,rank,name,height,no of floors,year
0,1,Rhodes State Office Tower,629 / 192,41,1973
1,2,LeVeque Tower,555 / 169,47,1927
2,3,William Green Building,530 / 162,33,1990
3,4,Huntington Center,512 / 156,37,1984
4,5,Vern Riffe State Office Tower,503 / 153,32,1988
5,6,One Nationwide Plaza,485 / 148,40,1976
6,7,Franklin County Courthouse,464 / 141,27,1991
7,8,AEP Building,456 / 139,31,1983
8,9,Borden Building,438 / 134,34,1974
9,10,Three Nationwide Plaza,408 / 124,27,1989



5.csv : 203_84.csv		204_541.csv		204_649.csv		
Missing Values(NANs):  {'rank': '7/60', 'name': '0/60', 'height': '0/60', 'no of floors': '0/60', 'year': '2/60'}
Columns that match with input_schema: 
203_84.csv :  {'rank': 'rank', 'name': 'name', 'height ft (m)': 'height', 'floors': 'no of floors'}
204_541.csv :  {'name': 'name', 'height': 'height', 'year': 'year', 'floors': 'no of floors'}
204_649.csv :  {'rank': 'rank', 'name': 'name', 'year': 'year', 'height ft (m)': 'height', 'floors': 'no of floors'}
Coverage Score :  1.0 	 Completeness Score :  0.97


Unnamed: 0,rank,name,height,no of floors,year
0,1,Ordway Building,404 (123),28,1970.0
1,2,Kaiser Center,390 (119),28,1960.0
2,3,Lake Merritt Plaza,371 (113),27,1988.0
3,4,1111 Broadway,360 (110),24,1990.0
4,5,Kaiser Engineering Building,336 (102),25,1984.0
5,6,Clorox Building,330 (101),24,1976.0
6,7=,EM Harris State Office Building,328 (100),22,1998.0
7,7=,Ronald V. Dellums Federal Building South,328 (100),18,1994.0
8,7=,Ronald V. Dellums Federal Building North,328 (100),18,1994.0
9,10,Oakland City Hall,320 (98),14,1914.0



6.csv : 203_837.csv		203_84.csv		
Missing Values(NANs):  {'rank': '0/51', 'name': '0/51', 'height': '0/51', 'no of floors': '0/51', 'year': '2/51'}
Columns that match with input_schema: 
203_837.csv :  {'rank': 'rank', 'name': 'name', 'year': 'year', 'height ft / m': 'height', 'floors': 'no of floors'}
203_84.csv :  {'rank': 'rank', 'name': 'name', 'height ft (m)': 'height', 'floors': 'no of floors'}
Coverage Score :  1.0 	 Completeness Score :  0.9921568627450981


Unnamed: 0,rank,name,height,no of floors,year
0,1,Rhodes State Office Tower,629 / 192,41,1973.0
1,2,LeVeque Tower,555 / 169,47,1927.0
2,3,William Green Building,530 / 162,33,1990.0
3,4,Huntington Center,512 / 156,37,1984.0
4,5,Vern Riffe State Office Tower,503 / 153,32,1988.0
5,6,One Nationwide Plaza,485 / 148,40,1976.0
6,7,Franklin County Courthouse,464 / 141,27,1991.0
7,8,AEP Building,456 / 139,31,1983.0
8,9,Borden Building,438 / 134,34,1974.0
9,10,Three Nationwide Plaza,408 / 124,27,1989.0



7.csv : 203_837.csv		204_541.csv		
Missing Values(NANs):  {'rank': '7/37', 'name': '0/37', 'height': '0/37', 'no of floors': '0/37', 'year': '0/37'}
Columns that match with input_schema: 
203_837.csv :  {'rank': 'rank', 'name': 'name', 'year': 'year', 'height ft / m': 'height', 'floors': 'no of floors'}
204_541.csv :  {'name': 'name', 'height': 'height', 'year': 'year', 'floors': 'no of floors'}
Coverage Score :  1.0 	 Completeness Score :  0.9621621621621621


Unnamed: 0,rank,name,height,no of floors,year
0,1,Rhodes State Office Tower,629 / 192,41,1973
1,2,LeVeque Tower,555 / 169,47,1927
2,3,William Green Building,530 / 162,33,1990
3,4,Huntington Center,512 / 156,37,1984
4,5,Vern Riffe State Office Tower,503 / 153,32,1988
5,6,One Nationwide Plaza,485 / 148,40,1976
6,7,Franklin County Courthouse,464 / 141,27,1991
7,8,AEP Building,456 / 139,31,1983
8,9,Borden Building,438 / 134,34,1974
9,10,Three Nationwide Plaza,408 / 124,27,1989



8.csv : 203_837.csv		204_649.csv		
Missing Values(NANs):  {'rank': '0/62', 'name': '0/62', 'height': '0/62', 'no of floors': '0/62', 'year': '0/62'}
Columns that match with input_schema: 
203_837.csv :  {'rank': 'rank', 'name': 'name', 'year': 'year', 'height ft / m': 'height', 'floors': 'no of floors'}
204_649.csv :  {'rank': 'rank', 'name': 'name', 'year': 'year', 'height ft (m)': 'height', 'floors': 'no of floors'}
Coverage Score :  1.0 	 Completeness Score :  1.0


Unnamed: 0,rank,name,height,no of floors,year
0,1,Rhodes State Office Tower,629 / 192,41,1973
1,2,LeVeque Tower,555 / 169,47,1927
2,3,William Green Building,530 / 162,33,1990
3,4,Huntington Center,512 / 156,37,1984
4,5,Vern Riffe State Office Tower,503 / 153,32,1988
5,6,One Nationwide Plaza,485 / 148,40,1976
6,7,Franklin County Courthouse,464 / 141,27,1991
7,8,AEP Building,456 / 139,31,1983
8,9,Borden Building,438 / 134,34,1974
9,10,Three Nationwide Plaza,408 / 124,27,1989



9.csv : 203_84.csv		204_541.csv		
Missing Values(NANs):  {'rank': '7/28', 'name': '0/28', 'height': '0/28', 'no of floors': '0/28', 'year': '2/28', 'coordinates': '7/28'}
Columns that match with input_schema: 
203_84.csv :  {'rank': 'rank', 'name': 'name', 'height ft (m)': 'height', 'floors': 'no of floors'}
204_541.csv :  {'name': 'name', 'height': 'height', 'year': 'year', 'floors': 'no of floors'}
Coverage Score :  1.0 	 Completeness Score :  0.9357142857142857


Unnamed: 0,rank,name,height,no of floors,year,coordinates
0,1,Ordway Building,404 (123),28,1970.0,37°48′36″N 122°15′51″W﻿ / ﻿37.81000°N 122.26417°W
1,2,Kaiser Center,390 (119),28,1960.0,37°48′32″N 122°15′52″W﻿ / ﻿37.80889°N 122.26444°W
2,3,Lake Merritt Plaza,371 (113),27,1988.0,37°48′27″N 122°15′54″W﻿ / ﻿37.80750°N 122.26500°W
3,4,1111 Broadway,360 (110),24,1990.0,37°48′10″N 122°16′22″W﻿ / ﻿37.80278°N 122.27278°W
4,5,Kaiser Engineering Building,336 (102),25,1984.0,37°48′21″N 122°15′54″W﻿ / ﻿37.80583°N 122.26500°W
5,6,Clorox Building,330 (101),24,1976.0,37°48′12″N 122°16′20″W﻿ / ﻿37.80333°N 122.27222°W
6,7=,EM Harris State Office Building,328 (100),22,1998.0,37°48′23″N 122°16′24″W﻿ / ﻿37.80639°N 122.27333°W
7,7=,Ronald V. Dellums Federal Building South,328 (100),18,1994.0,37°48′16″N 122°16′29″W﻿ / ﻿37.80444°N 122.27472°W
8,7=,Ronald V. Dellums Federal Building North,328 (100),18,1994.0,37°48′18″N 122°16′28″W﻿ / ﻿37.80500°N 122.27444°W
9,10,Oakland City Hall,320 (98),14,1914.0,37°48′19″N 122°16′21″W﻿ / ﻿37.80528°N 122.27250°W



10.csv : 203_84.csv		204_649.csv		
Missing Values(NANs):  {'rank': '0/53', 'name': '0/53', 'height': '0/53', 'no of floors': '0/53', 'year': '2/53'}
Columns that match with input_schema: 
203_84.csv :  {'rank': 'rank', 'name': 'name', 'height ft (m)': 'height', 'floors': 'no of floors'}
204_649.csv :  {'rank': 'rank', 'name': 'name', 'year': 'year', 'height ft (m)': 'height', 'floors': 'no of floors'}
Coverage Score :  1.0 	 Completeness Score :  0.9924528301886791


Unnamed: 0,rank,name,height,no of floors,year
0,1,Ordway Building,404 (123),28,1970.0
1,2,Kaiser Center,390 (119),28,1960.0
2,3,Lake Merritt Plaza,371 (113),27,1988.0
3,4,1111 Broadway,360 (110),24,1990.0
4,5,Kaiser Engineering Building,336 (102),25,1984.0
5,6,Clorox Building,330 (101),24,1976.0
6,7=,EM Harris State Office Building,328 (100),22,1998.0
7,7=,Ronald V. Dellums Federal Building South,328 (100),18,1994.0
8,7=,Ronald V. Dellums Federal Building North,328 (100),18,1994.0
9,10,Oakland City Hall,320 (98),14,1914.0



11.csv : 204_541.csv		204_649.csv		
Missing Values(NANs):  {'name': '0/39', 'height': '0/39', 'no of floors': '0/39', 'year': '0/39', 'rank': '7/39'}
Columns that match with input_schema: 
204_541.csv :  {'name': 'name', 'height': 'height', 'year': 'year', 'floors': 'no of floors'}
204_649.csv :  {'rank': 'rank', 'name': 'name', 'year': 'year', 'height ft (m)': 'height', 'floors': 'no of floors'}
Coverage Score :  1.0 	 Completeness Score :  0.964102564102564


Unnamed: 0,name,height,no of floors,year,rank
0,Ratan Planet,90 metres (295 ft),16,2013,
1,The Landmark Hotel,80 metres (262 ft),14,2000,
2,Ratan Orbit,70 metres (230 ft),14,2011,
3,BSNL Tower,65 metres (213 ft),15,2001,
4,Som Business Square (EL Mart),80 metres (262 ft),15,2005,
5,Akashganga Heights,60 metres (197 ft),15,1995,
6,Krishna Tower,48 metres (157 ft),10,2003,
7,Key Tower,947 (289),57,1991,1
8,Terminal Tower,723 (220),52,1930,2
9,200 Public Square,658 (201),45,1985,3




Rank  1   :  output_8.csv         completeness score : 1.0000000000000000 		number of rows:  62
Rank  2   :  204_649.csv          completeness score : 1.0000000000000000 		number of rows:  32
Rank  3   :  203_837.csv          completeness score : 1.0000000000000000 		number of rows:  30
Rank  4   :  output_3.csv         completeness score : 0.9951807228915662 		number of rows:  83
Rank  5   :  output_10.csv        completeness score : 0.9924528301886791 		number of rows:  53
Rank  6   :  output_6.csv         completeness score : 0.9921568627450981 		number of rows:  51
Rank  7   :  203_84.csv           completeness score : 0.9809523809523810 		number of rows:  21
Rank  8   :  output_1.csv         completeness score : 0.9800000000000001 		number of rows:  90
Rank  9   :  output_4.csv         completeness score : 0.9797101449275363 		number of rows:  69
Rank  10  :  output_5.csv         completeness score : 0.9700000000000000 		number of rows:  60
Rank  11  :  output_2.csv         comp

In [80]:
s='{"schema":{"phase": "object", "date": "object", "terrain": "object", "length": "object", "race winner": "object", "leader": "object"},"categories":["france","cycling","sports","events"]}'
s=s.lower()
s=json.loads(s)
input_schema=s['schema'];input_categories=-1;transformations=-1;stats=-1
if "categories" in s:
    input_categories=s["categories"]
if "transformations" in s:
    transformations=s["transformations"]
if "stats" in s:
    stats=s["stats"]
# display query
print("Input Schema : ",input_schema)
if input_categories==-1:
    print("Input Categories : None")
else:
    print("Input Categories : ",input_categories)
if transformations==-1:
    print("Input Transformations : None")
else:
    print("Input Transformations : ",transformations)
if stats==-1:
    print("Input Condition based conditions : None")
else:
    print("Input Content based Conditions : ",stats)
print()
comp_score={}
no_of_rows={}
no_of_rows={}
get_matches()

Input Schema :  {'phase': 'object', 'date': 'object', 'terrain': 'object', 'length': 'object', 'race winner': 'object', 'leader': 'object'}
Input Categories :  ['france', 'cycling', 'sports', 'events']
Input Transformations : None
Input Condition based conditions : None

category and schema
Matching Tables
203_25.csv  :  1983 Tour de France
203_44.csv  :  1964 Tour de France
203_475.csv  :  1995 Tour de France
204_312.csv  :  1978 Tour de France
204_406.csv  :  1931 Tour de France


203_25.csv(1983 Tour de France)
Missing Values(NANs score):  {'stage': '0/23', 'date': '0/23', 'route': '21/23', 'terrain': '0/23', 'length': '0/23', 'winner': '0/23'}
Columns that match with input_schema:
 203_25.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
Coverage Score :  0.8333333333333334 	 Completeness Score :  0.8333333333333334



Unnamed: 0,phase,date,route,terrain,length,race winner
0,P,1 July,Fontenay-sous-Bois,Individual time trial,6 km (3.7 mi),Eric Vanderaerden (BEL)
1,1,2 July,,Plain stage,163 km (101 mi),Frits Pirard (NED)
2,2,3 July,,Team time trial,100 km (62 mi),Mercier
3,3,4 July,,Hilly stage,152 km (94 mi),Rudy Matthijs (BEL)
4,4,5 July,,Plain stage,300 km (190 mi),Serge Demierre (SUI)
5,5,6 July,,Plain stage,257 km (160 mi),Dominique Gaigne (FRA)
6,6,7 July,,Individual time trial,58 km (36 mi),Bert Oosterbosch (NED)
7,7,8 July,,Plain stage,216 km (134 mi),Riccardo Magrini (ITA)
8,8,9 July,,Plain stage,222 km (138 mi),Bert Oosterbosch (NED)
9,9,10 July,,Plain stage,207 km (129 mi),Philippe Chevallier (FRA)


203_44.csv(1964 Tour de France)
Missing Values(NANs score):  {'stage': '0/25', 'date': '0/25', 'terrain': '0/25', 'length': '0/25', 'winner': '0/25'}
Columns that match with input_schema:
 203_44.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
Coverage Score :  0.8333333333333334 	 Completeness Score :  0.8333333333333334



Unnamed: 0,phase,date,terrain,length,race winner
0,1,22 June,Plain stage,215 km (134 mi),Edward Sels (BEL)
1,2,23 June,Plain stage,208 km (129 mi),Andre Darrigade (FRA)
2,3A,24 June,Plain stage,197 km (122 mi),Bernard Vandekerkhove (BEL)
3,3B,24 June,Team time trial,21 km (13 mi),Kas-Kaskol
4,4,25 June,Plain stage,292 km (181 mi),Rudi Altig (GER)
5,5,26 June,Plain stage,161 km (100 mi),Willy Derboven (BEL)
6,6,27 June,Plain stage,200 km (120 mi),Henk Nijdam (NED)
7,7,28 June,Plain stage,195 km (121 mi),Jan Janssen (NED)
8,8,29 June,Stage with mountain(s),249 km (155 mi),Federico Bahamontes (ESP)
9,9,30 June,Stage with mountain(s),239 km (149 mi),Jacques Anquetil (FRA)


203_475.csv(1995 Tour de France)
Missing Values(NANs score):  {'stage': '0/21', 'date': '0/21', 'route': '20/21', 'terrain': '0/21', 'length': '0/21', 'winner': '0/21'}
Columns that match with input_schema:
 203_475.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
Coverage Score :  0.8333333333333334 	 Completeness Score :  0.8333333333333334



Unnamed: 0,phase,date,route,terrain,length,race winner
0,P,1 July,Saint-Brieuc,Individual time trial,7.3 km (4.5 mi),Jacky Durand (FRA)
1,1,2 July,,Plain stage,233.5 km (145.1 mi),Fabio Baldato (ITA)
2,2,3 July,,Plain stage,235.5 km (146.3 mi),Mario Cipollini (ITA)
3,3,4 July,,Team time trial,67.0 km (41.6 mi),Gewiss-Ballan (ITA)
4,4,5 July,,Plain stage,162.0 km (100.7 mi),Mario Cipollini (ITA)
5,5,6 July,,Plain stage,261.0 km (162.2 mi),Jeroen Blijlevens (NED)
6,6,7 July,,Plain stage,202.0 km (125.5 mi),Erik Zabel (GER)
7,7,8 July,,Hilly stage,203.0 km (126.1 mi),Johan Bruyneel (BEL)
8,8,9 July,,Individual time trial,54.0 km (33.6 mi),Miguel Indurain (ESP)
9,9,11 July,,Stage with mountain(s),160.0 km (99.4 mi),Alex Zulle (SUI)


204_312.csv(1978 Tour de France)
Missing Values(NANs score):  {'stage': '0/25', 'date': '0/25', 'route': '24/25', 'terrain': '0/25', 'length': '0/25', 'winner': '0/25'}
Columns that match with input_schema:
 204_312.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
Coverage Score :  0.8333333333333334 	 Completeness Score :  0.8333333333333334



Unnamed: 0,phase,date,route,terrain,length,race winner
0,P,29 June,Leiden,Individual time trial,5 km (3.1 mi),Jan Raas (NED)
1,1A,30 June,,Plain stage,135 km (84 mi),Jan Raas (NED)
2,1B,30 June,,Plain stage,100 km (62 mi),Walter Planckaert (BEL)
3,2,1 July,,Plain stage,199 km (124 mi),Jacques Esclassan (FRA)
4,3,2 July,,Plain stage,244 km (152 mi),Klaus-Peter Thaler (GER)
5,4,3 July,,Team time trial,153 km (95 mi),TI-Raleigh
6,5,4 July,,Plain stage,244 km (152 mi),Freddy Maertens (BEL)
7,6,5 July,,Plain stage,162 km (101 mi),Sean Kelly (IRE)
8,7,6 July,,Plain stage,242 km (150 mi),Freddy Maertens (BEL)
9,8,7 July,,Individual time trial,59 km (37 mi),Bernard Hinault (FRA)


204_406.csv(1931 Tour de France)
Missing Values(NANs score):  {'stage': '0/24', 'date': '0/24', 'terrain': '0/24', 'length': '0/24', 'winner': '0/24', 'race leader': '0/24'}
Columns that match with input_schema:
 204_406.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'winner': 'race winner', 'race leader': 'leader'}
Coverage Score :  0.8333333333333334 	 Completeness Score :  0.8333333333333334



Unnamed: 0,stage,date,terrain,length,race winner,leader
0,1,30 June,Plain stage,208 km (129 mi),Alfred Haemerlinck (BEL),Alfred Haemerlinck (BEL)
1,2,1 July,Plain stage,212 km (132 mi),Max Bulla (AUT),Max Bulla (AUT)
2,3,2 July,Plain stage,206 km (128 mi),Fabio Battesini (ITA),Leon Le Calvez (FRA)
3,4,3 July,Plain stage,211 km (131 mi),Andre Godinat (FRA),Rafaele di Paco (ITA)
4,5,4 July,Plain stage,202 km (126 mi),Charles Pelissier (FRA),Charles Pelissier (FRA) Rafaele di Paco (ITA)
5,6,5 July,Plain stage,338 km (210 mi),Alfred Haemerlinck (BEL),Rafaele di Paco (ITA)
6,7,6 July,Plain stage,180 km (110 mi),Gerard Loncke (BEL),Rafaele di Paco (ITA)
7,8,7 July,Plain stage,106 km (66 mi),Charles Pelissier (FRA),Charles Pelissier (FRA)
8,9,8 July,Stage with mountain(s),231 km (144 mi),Antonin Magne (FRA),Antonin Magne (FRA)
9,10,10 July,Stage with mountain(s),322 km (200 mi),Rafaele di Paco (ITA),Antonin Magne (FRA)



1.csv : 203_25.csv		203_44.csv		203_475.csv		204_312.csv		204_406.csv		
Missing Values(NANs):  {'phase': '24/118', 'date': '0/118', 'terrain': '0/118', 'length': '0/118', 'race winner': '0/118', 'leader': '94/118'}
Columns that match with input_schema: 
203_25.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_44.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_475.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_312.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_406.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'winner': 'race winner', 'race leader': 'leader'}
Coverage Score :  1.0 	 Completeness Score :  0.8333333333333335


Unnamed: 0,phase,date,terrain,length,race winner,leader
0,P,1 July,Individual time trial,6 km (3.7 mi),Eric Vanderaerden (BEL),
1,1,2 July,Plain stage,163 km (101 mi),Frits Pirard (NED),
2,2,3 July,Team time trial,100 km (62 mi),Mercier,
3,3,4 July,Hilly stage,152 km (94 mi),Rudy Matthijs (BEL),
4,4,5 July,Plain stage,300 km (190 mi),Serge Demierre (SUI),
5,5,6 July,Plain stage,257 km (160 mi),Dominique Gaigne (FRA),
6,6,7 July,Individual time trial,58 km (36 mi),Bert Oosterbosch (NED),
7,7,8 July,Plain stage,216 km (134 mi),Riccardo Magrini (ITA),
8,8,9 July,Plain stage,222 km (138 mi),Bert Oosterbosch (NED),
9,9,10 July,Plain stage,207 km (129 mi),Philippe Chevallier (FRA),



2.csv : 203_25.csv		203_44.csv		203_475.csv		204_312.csv		
Missing Values(NANs):  {'phase': '0/94', 'date': '0/94', 'terrain': '0/94', 'length': '0/94', 'race winner': '0/94'}
Columns that match with input_schema: 
203_25.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_44.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_475.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_312.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
Coverage Score :  0.8333333333333334 	 Completeness Score :  0.8333333333333334


Unnamed: 0,phase,date,terrain,length,race winner
0,P,1 July,Individual time trial,6 km (3.7 mi),Eric Vanderaerden (BEL)
1,1,2 July,Plain stage,163 km (101 mi),Frits Pirard (NED)
2,2,3 July,Team time trial,100 km (62 mi),Mercier
3,3,4 July,Hilly stage,152 km (94 mi),Rudy Matthijs (BEL)
4,4,5 July,Plain stage,300 km (190 mi),Serge Demierre (SUI)
5,5,6 July,Plain stage,257 km (160 mi),Dominique Gaigne (FRA)
6,6,7 July,Individual time trial,58 km (36 mi),Bert Oosterbosch (NED)
7,7,8 July,Plain stage,216 km (134 mi),Riccardo Magrini (ITA)
8,8,9 July,Plain stage,222 km (138 mi),Bert Oosterbosch (NED)
9,9,10 July,Plain stage,207 km (129 mi),Philippe Chevallier (FRA)



3.csv : 203_25.csv		203_44.csv		203_475.csv		204_406.csv		
Missing Values(NANs):  {'phase': '24/93', 'date': '0/93', 'terrain': '0/93', 'length': '0/93', 'race winner': '0/93', 'leader': '69/93'}
Columns that match with input_schema: 
203_25.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_44.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_475.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_406.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'winner': 'race winner', 'race leader': 'leader'}
Coverage Score :  1.0 	 Completeness Score :  0.8333333333333334


Unnamed: 0,phase,date,terrain,length,race winner,leader
0,P,1 July,Individual time trial,6 km (3.7 mi),Eric Vanderaerden (BEL),
1,1,2 July,Plain stage,163 km (101 mi),Frits Pirard (NED),
2,2,3 July,Team time trial,100 km (62 mi),Mercier,
3,3,4 July,Hilly stage,152 km (94 mi),Rudy Matthijs (BEL),
4,4,5 July,Plain stage,300 km (190 mi),Serge Demierre (SUI),
5,5,6 July,Plain stage,257 km (160 mi),Dominique Gaigne (FRA),
6,6,7 July,Individual time trial,58 km (36 mi),Bert Oosterbosch (NED),
7,7,8 July,Plain stage,216 km (134 mi),Riccardo Magrini (ITA),
8,8,9 July,Plain stage,222 km (138 mi),Bert Oosterbosch (NED),
9,9,10 July,Plain stage,207 km (129 mi),Philippe Chevallier (FRA),



4.csv : 203_25.csv		203_44.csv		204_312.csv		204_406.csv		
Missing Values(NANs):  {'phase': '24/97', 'date': '0/97', 'terrain': '0/97', 'length': '0/97', 'race winner': '0/97', 'leader': '73/97'}
Columns that match with input_schema: 
203_25.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_44.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_312.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_406.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'winner': 'race winner', 'race leader': 'leader'}
Coverage Score :  1.0 	 Completeness Score :  0.8333333333333334


Unnamed: 0,phase,date,terrain,length,race winner,leader
0,P,1 July,Individual time trial,6 km (3.7 mi),Eric Vanderaerden (BEL),
1,1,2 July,Plain stage,163 km (101 mi),Frits Pirard (NED),
2,2,3 July,Team time trial,100 km (62 mi),Mercier,
3,3,4 July,Hilly stage,152 km (94 mi),Rudy Matthijs (BEL),
4,4,5 July,Plain stage,300 km (190 mi),Serge Demierre (SUI),
5,5,6 July,Plain stage,257 km (160 mi),Dominique Gaigne (FRA),
6,6,7 July,Individual time trial,58 km (36 mi),Bert Oosterbosch (NED),
7,7,8 July,Plain stage,216 km (134 mi),Riccardo Magrini (ITA),
8,8,9 July,Plain stage,222 km (138 mi),Bert Oosterbosch (NED),
9,9,10 July,Plain stage,207 km (129 mi),Philippe Chevallier (FRA),



5.csv : 203_25.csv		203_475.csv		204_312.csv		204_406.csv		
Missing Values(NANs):  {'phase': '24/93', 'date': '0/93', 'terrain': '0/93', 'length': '0/93', 'race winner': '0/93', 'leader': '69/93'}
Columns that match with input_schema: 
203_25.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_475.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_312.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_406.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'winner': 'race winner', 'race leader': 'leader'}
Coverage Score :  1.0 	 Completeness Score :  0.8333333333333334


Unnamed: 0,phase,date,terrain,length,race winner,leader
0,P,1 July,Individual time trial,6 km (3.7 mi),Eric Vanderaerden (BEL),
1,1,2 July,Plain stage,163 km (101 mi),Frits Pirard (NED),
2,2,3 July,Team time trial,100 km (62 mi),Mercier,
3,3,4 July,Hilly stage,152 km (94 mi),Rudy Matthijs (BEL),
4,4,5 July,Plain stage,300 km (190 mi),Serge Demierre (SUI),
5,5,6 July,Plain stage,257 km (160 mi),Dominique Gaigne (FRA),
6,6,7 July,Individual time trial,58 km (36 mi),Bert Oosterbosch (NED),
7,7,8 July,Plain stage,216 km (134 mi),Riccardo Magrini (ITA),
8,8,9 July,Plain stage,222 km (138 mi),Bert Oosterbosch (NED),
9,9,10 July,Plain stage,207 km (129 mi),Philippe Chevallier (FRA),



6.csv : 203_44.csv		203_475.csv		204_312.csv		204_406.csv		
Missing Values(NANs):  {'phase': '24/95', 'date': '0/95', 'terrain': '0/95', 'length': '0/95', 'race winner': '0/95', 'leader': '71/95'}
Columns that match with input_schema: 
203_44.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_475.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_312.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_406.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'winner': 'race winner', 'race leader': 'leader'}
Coverage Score :  1.0 	 Completeness Score :  0.8333333333333334


Unnamed: 0,phase,date,terrain,length,race winner,leader
0,1,22 June,Plain stage,215 km (134 mi),Edward Sels (BEL),
1,2,23 June,Plain stage,208 km (129 mi),Andre Darrigade (FRA),
2,3A,24 June,Plain stage,197 km (122 mi),Bernard Vandekerkhove (BEL),
3,3B,24 June,Team time trial,21 km (13 mi),Kas-Kaskol,
4,4,25 June,Plain stage,292 km (181 mi),Rudi Altig (GER),
5,5,26 June,Plain stage,161 km (100 mi),Willy Derboven (BEL),
6,6,27 June,Plain stage,200 km (120 mi),Henk Nijdam (NED),
7,7,28 June,Plain stage,195 km (121 mi),Jan Janssen (NED),
8,8,29 June,Stage with mountain(s),249 km (155 mi),Federico Bahamontes (ESP),
9,9,30 June,Stage with mountain(s),239 km (149 mi),Jacques Anquetil (FRA),



7.csv : 203_25.csv		203_44.csv		203_475.csv		
Missing Values(NANs):  {'phase': '0/69', 'date': '0/69', 'terrain': '0/69', 'length': '0/69', 'race winner': '0/69'}
Columns that match with input_schema: 
203_25.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_44.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_475.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
Coverage Score :  0.8333333333333334 	 Completeness Score :  0.8333333333333334


Unnamed: 0,phase,date,terrain,length,race winner
0,P,1 July,Individual time trial,6 km (3.7 mi),Eric Vanderaerden (BEL)
1,1,2 July,Plain stage,163 km (101 mi),Frits Pirard (NED)
2,2,3 July,Team time trial,100 km (62 mi),Mercier
3,3,4 July,Hilly stage,152 km (94 mi),Rudy Matthijs (BEL)
4,4,5 July,Plain stage,300 km (190 mi),Serge Demierre (SUI)
5,5,6 July,Plain stage,257 km (160 mi),Dominique Gaigne (FRA)
6,6,7 July,Individual time trial,58 km (36 mi),Bert Oosterbosch (NED)
7,7,8 July,Plain stage,216 km (134 mi),Riccardo Magrini (ITA)
8,8,9 July,Plain stage,222 km (138 mi),Bert Oosterbosch (NED)
9,9,10 July,Plain stage,207 km (129 mi),Philippe Chevallier (FRA)



8.csv : 203_25.csv		203_44.csv		204_312.csv		
Missing Values(NANs):  {'phase': '0/73', 'date': '0/73', 'terrain': '0/73', 'length': '0/73', 'race winner': '0/73'}
Columns that match with input_schema: 
203_25.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_44.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_312.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
Coverage Score :  0.8333333333333334 	 Completeness Score :  0.8333333333333334


Unnamed: 0,phase,date,terrain,length,race winner
0,P,1 July,Individual time trial,6 km (3.7 mi),Eric Vanderaerden (BEL)
1,1,2 July,Plain stage,163 km (101 mi),Frits Pirard (NED)
2,2,3 July,Team time trial,100 km (62 mi),Mercier
3,3,4 July,Hilly stage,152 km (94 mi),Rudy Matthijs (BEL)
4,4,5 July,Plain stage,300 km (190 mi),Serge Demierre (SUI)
5,5,6 July,Plain stage,257 km (160 mi),Dominique Gaigne (FRA)
6,6,7 July,Individual time trial,58 km (36 mi),Bert Oosterbosch (NED)
7,7,8 July,Plain stage,216 km (134 mi),Riccardo Magrini (ITA)
8,8,9 July,Plain stage,222 km (138 mi),Bert Oosterbosch (NED)
9,9,10 July,Plain stage,207 km (129 mi),Philippe Chevallier (FRA)



9.csv : 203_25.csv		203_44.csv		204_406.csv		
Missing Values(NANs):  {'phase': '24/72', 'date': '0/72', 'terrain': '0/72', 'length': '0/72', 'race winner': '0/72', 'leader': '48/72'}
Columns that match with input_schema: 
203_25.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_44.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_406.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'winner': 'race winner', 'race leader': 'leader'}
Coverage Score :  1.0 	 Completeness Score :  0.8333333333333331


Unnamed: 0,phase,date,terrain,length,race winner,leader
0,P,1 July,Individual time trial,6 km (3.7 mi),Eric Vanderaerden (BEL),
1,1,2 July,Plain stage,163 km (101 mi),Frits Pirard (NED),
2,2,3 July,Team time trial,100 km (62 mi),Mercier,
3,3,4 July,Hilly stage,152 km (94 mi),Rudy Matthijs (BEL),
4,4,5 July,Plain stage,300 km (190 mi),Serge Demierre (SUI),
5,5,6 July,Plain stage,257 km (160 mi),Dominique Gaigne (FRA),
6,6,7 July,Individual time trial,58 km (36 mi),Bert Oosterbosch (NED),
7,7,8 July,Plain stage,216 km (134 mi),Riccardo Magrini (ITA),
8,8,9 July,Plain stage,222 km (138 mi),Bert Oosterbosch (NED),
9,9,10 July,Plain stage,207 km (129 mi),Philippe Chevallier (FRA),



10.csv : 203_25.csv		203_475.csv		204_312.csv		
Missing Values(NANs):  {'phase': '0/69', 'date': '0/69', 'terrain': '0/69', 'length': '0/69', 'race winner': '0/69'}
Columns that match with input_schema: 
203_25.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_475.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_312.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
Coverage Score :  0.8333333333333334 	 Completeness Score :  0.8333333333333334


Unnamed: 0,phase,date,terrain,length,race winner
0,P,1 July,Individual time trial,6 km (3.7 mi),Eric Vanderaerden (BEL)
1,1,2 July,Plain stage,163 km (101 mi),Frits Pirard (NED)
2,2,3 July,Team time trial,100 km (62 mi),Mercier
3,3,4 July,Hilly stage,152 km (94 mi),Rudy Matthijs (BEL)
4,4,5 July,Plain stage,300 km (190 mi),Serge Demierre (SUI)
5,5,6 July,Plain stage,257 km (160 mi),Dominique Gaigne (FRA)
6,6,7 July,Individual time trial,58 km (36 mi),Bert Oosterbosch (NED)
7,7,8 July,Plain stage,216 km (134 mi),Riccardo Magrini (ITA)
8,8,9 July,Plain stage,222 km (138 mi),Bert Oosterbosch (NED)
9,9,10 July,Plain stage,207 km (129 mi),Philippe Chevallier (FRA)



11.csv : 203_25.csv		203_475.csv		204_406.csv		
Missing Values(NANs):  {'phase': '24/68', 'date': '0/68', 'terrain': '0/68', 'length': '0/68', 'race winner': '0/68', 'leader': '44/68'}
Columns that match with input_schema: 
203_25.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_475.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_406.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'winner': 'race winner', 'race leader': 'leader'}
Coverage Score :  1.0 	 Completeness Score :  0.8333333333333331


Unnamed: 0,phase,date,terrain,length,race winner,leader
0,P,1 July,Individual time trial,6 km (3.7 mi),Eric Vanderaerden (BEL),
1,1,2 July,Plain stage,163 km (101 mi),Frits Pirard (NED),
2,2,3 July,Team time trial,100 km (62 mi),Mercier,
3,3,4 July,Hilly stage,152 km (94 mi),Rudy Matthijs (BEL),
4,4,5 July,Plain stage,300 km (190 mi),Serge Demierre (SUI),
5,5,6 July,Plain stage,257 km (160 mi),Dominique Gaigne (FRA),
6,6,7 July,Individual time trial,58 km (36 mi),Bert Oosterbosch (NED),
7,7,8 July,Plain stage,216 km (134 mi),Riccardo Magrini (ITA),
8,8,9 July,Plain stage,222 km (138 mi),Bert Oosterbosch (NED),
9,9,10 July,Plain stage,207 km (129 mi),Philippe Chevallier (FRA),



12.csv : 203_25.csv		204_312.csv		204_406.csv		
Missing Values(NANs):  {'phase': '24/72', 'date': '0/72', 'terrain': '0/72', 'length': '0/72', 'race winner': '0/72', 'leader': '48/72'}
Columns that match with input_schema: 
203_25.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_312.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_406.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'winner': 'race winner', 'race leader': 'leader'}
Coverage Score :  1.0 	 Completeness Score :  0.8333333333333331


Unnamed: 0,phase,date,terrain,length,race winner,leader
0,P,1 July,Individual time trial,6 km (3.7 mi),Eric Vanderaerden (BEL),
1,1,2 July,Plain stage,163 km (101 mi),Frits Pirard (NED),
2,2,3 July,Team time trial,100 km (62 mi),Mercier,
3,3,4 July,Hilly stage,152 km (94 mi),Rudy Matthijs (BEL),
4,4,5 July,Plain stage,300 km (190 mi),Serge Demierre (SUI),
5,5,6 July,Plain stage,257 km (160 mi),Dominique Gaigne (FRA),
6,6,7 July,Individual time trial,58 km (36 mi),Bert Oosterbosch (NED),
7,7,8 July,Plain stage,216 km (134 mi),Riccardo Magrini (ITA),
8,8,9 July,Plain stage,222 km (138 mi),Bert Oosterbosch (NED),
9,9,10 July,Plain stage,207 km (129 mi),Philippe Chevallier (FRA),



13.csv : 203_44.csv		203_475.csv		204_312.csv		
Missing Values(NANs):  {'phase': '0/71', 'date': '0/71', 'terrain': '0/71', 'length': '0/71', 'race winner': '0/71'}
Columns that match with input_schema: 
203_44.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_475.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_312.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
Coverage Score :  0.8333333333333334 	 Completeness Score :  0.8333333333333334


Unnamed: 0,phase,date,terrain,length,race winner
0,1,22 June,Plain stage,215 km (134 mi),Edward Sels (BEL)
1,2,23 June,Plain stage,208 km (129 mi),Andre Darrigade (FRA)
2,3A,24 June,Plain stage,197 km (122 mi),Bernard Vandekerkhove (BEL)
3,3B,24 June,Team time trial,21 km (13 mi),Kas-Kaskol
4,4,25 June,Plain stage,292 km (181 mi),Rudi Altig (GER)
5,5,26 June,Plain stage,161 km (100 mi),Willy Derboven (BEL)
6,6,27 June,Plain stage,200 km (120 mi),Henk Nijdam (NED)
7,7,28 June,Plain stage,195 km (121 mi),Jan Janssen (NED)
8,8,29 June,Stage with mountain(s),249 km (155 mi),Federico Bahamontes (ESP)
9,9,30 June,Stage with mountain(s),239 km (149 mi),Jacques Anquetil (FRA)



14.csv : 203_44.csv		203_475.csv		204_406.csv		
Missing Values(NANs):  {'phase': '24/70', 'date': '0/70', 'terrain': '0/70', 'length': '0/70', 'race winner': '0/70', 'leader': '46/70'}
Columns that match with input_schema: 
203_44.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_475.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_406.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'winner': 'race winner', 'race leader': 'leader'}
Coverage Score :  1.0 	 Completeness Score :  0.8333333333333331


Unnamed: 0,phase,date,terrain,length,race winner,leader
0,1,22 June,Plain stage,215 km (134 mi),Edward Sels (BEL),
1,2,23 June,Plain stage,208 km (129 mi),Andre Darrigade (FRA),
2,3A,24 June,Plain stage,197 km (122 mi),Bernard Vandekerkhove (BEL),
3,3B,24 June,Team time trial,21 km (13 mi),Kas-Kaskol,
4,4,25 June,Plain stage,292 km (181 mi),Rudi Altig (GER),
5,5,26 June,Plain stage,161 km (100 mi),Willy Derboven (BEL),
6,6,27 June,Plain stage,200 km (120 mi),Henk Nijdam (NED),
7,7,28 June,Plain stage,195 km (121 mi),Jan Janssen (NED),
8,8,29 June,Stage with mountain(s),249 km (155 mi),Federico Bahamontes (ESP),
9,9,30 June,Stage with mountain(s),239 km (149 mi),Jacques Anquetil (FRA),



15.csv : 203_44.csv		204_312.csv		204_406.csv		
Missing Values(NANs):  {'phase': '24/74', 'date': '0/74', 'terrain': '0/74', 'length': '0/74', 'race winner': '0/74', 'leader': '50/74'}
Columns that match with input_schema: 
203_44.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_312.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_406.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'winner': 'race winner', 'race leader': 'leader'}
Coverage Score :  1.0 	 Completeness Score :  0.8333333333333334


Unnamed: 0,phase,date,terrain,length,race winner,leader
0,1,22 June,Plain stage,215 km (134 mi),Edward Sels (BEL),
1,2,23 June,Plain stage,208 km (129 mi),Andre Darrigade (FRA),
2,3A,24 June,Plain stage,197 km (122 mi),Bernard Vandekerkhove (BEL),
3,3B,24 June,Team time trial,21 km (13 mi),Kas-Kaskol,
4,4,25 June,Plain stage,292 km (181 mi),Rudi Altig (GER),
5,5,26 June,Plain stage,161 km (100 mi),Willy Derboven (BEL),
6,6,27 June,Plain stage,200 km (120 mi),Henk Nijdam (NED),
7,7,28 June,Plain stage,195 km (121 mi),Jan Janssen (NED),
8,8,29 June,Stage with mountain(s),249 km (155 mi),Federico Bahamontes (ESP),
9,9,30 June,Stage with mountain(s),239 km (149 mi),Jacques Anquetil (FRA),



16.csv : 203_475.csv		204_312.csv		204_406.csv		
Missing Values(NANs):  {'phase': '24/70', 'date': '0/70', 'terrain': '0/70', 'length': '0/70', 'race winner': '0/70', 'leader': '46/70'}
Columns that match with input_schema: 
203_475.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_312.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_406.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'winner': 'race winner', 'race leader': 'leader'}
Coverage Score :  1.0 	 Completeness Score :  0.8333333333333331


Unnamed: 0,phase,date,terrain,length,race winner,leader
0,P,1 July,Individual time trial,7.3 km (4.5 mi),Jacky Durand (FRA),
1,1,2 July,Plain stage,233.5 km (145.1 mi),Fabio Baldato (ITA),
2,2,3 July,Plain stage,235.5 km (146.3 mi),Mario Cipollini (ITA),
3,3,4 July,Team time trial,67.0 km (41.6 mi),Gewiss-Ballan (ITA),
4,4,5 July,Plain stage,162.0 km (100.7 mi),Mario Cipollini (ITA),
5,5,6 July,Plain stage,261.0 km (162.2 mi),Jeroen Blijlevens (NED),
6,6,7 July,Plain stage,202.0 km (125.5 mi),Erik Zabel (GER),
7,7,8 July,Hilly stage,203.0 km (126.1 mi),Johan Bruyneel (BEL),
8,8,9 July,Individual time trial,54.0 km (33.6 mi),Miguel Indurain (ESP),
9,9,11 July,Stage with mountain(s),160.0 km (99.4 mi),Alex Zulle (SUI),



17.csv : 203_25.csv		203_44.csv		
Missing Values(NANs):  {'phase': '0/48', 'date': '0/48', 'terrain': '0/48', 'length': '0/48', 'race winner': '0/48'}
Columns that match with input_schema: 
203_25.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_44.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
Coverage Score :  0.8333333333333334 	 Completeness Score :  0.8333333333333334


Unnamed: 0,phase,date,terrain,length,race winner
0,P,1 July,Individual time trial,6 km (3.7 mi),Eric Vanderaerden (BEL)
1,1,2 July,Plain stage,163 km (101 mi),Frits Pirard (NED)
2,2,3 July,Team time trial,100 km (62 mi),Mercier
3,3,4 July,Hilly stage,152 km (94 mi),Rudy Matthijs (BEL)
4,4,5 July,Plain stage,300 km (190 mi),Serge Demierre (SUI)
5,5,6 July,Plain stage,257 km (160 mi),Dominique Gaigne (FRA)
6,6,7 July,Individual time trial,58 km (36 mi),Bert Oosterbosch (NED)
7,7,8 July,Plain stage,216 km (134 mi),Riccardo Magrini (ITA)
8,8,9 July,Plain stage,222 km (138 mi),Bert Oosterbosch (NED)
9,9,10 July,Plain stage,207 km (129 mi),Philippe Chevallier (FRA)



18.csv : 203_25.csv		203_475.csv		
Missing Values(NANs):  {'phase': '0/44', 'date': '0/44', 'terrain': '0/44', 'length': '0/44', 'race winner': '0/44'}
Columns that match with input_schema: 
203_25.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_475.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
Coverage Score :  0.8333333333333334 	 Completeness Score :  0.8333333333333334


Unnamed: 0,phase,date,terrain,length,race winner
0,P,1 July,Individual time trial,6 km (3.7 mi),Eric Vanderaerden (BEL)
1,1,2 July,Plain stage,163 km (101 mi),Frits Pirard (NED)
2,2,3 July,Team time trial,100 km (62 mi),Mercier
3,3,4 July,Hilly stage,152 km (94 mi),Rudy Matthijs (BEL)
4,4,5 July,Plain stage,300 km (190 mi),Serge Demierre (SUI)
5,5,6 July,Plain stage,257 km (160 mi),Dominique Gaigne (FRA)
6,6,7 July,Individual time trial,58 km (36 mi),Bert Oosterbosch (NED)
7,7,8 July,Plain stage,216 km (134 mi),Riccardo Magrini (ITA)
8,8,9 July,Plain stage,222 km (138 mi),Bert Oosterbosch (NED)
9,9,10 July,Plain stage,207 km (129 mi),Philippe Chevallier (FRA)



19.csv : 203_25.csv		204_312.csv		
Missing Values(NANs):  {'phase': '0/48', 'date': '0/48', 'terrain': '0/48', 'length': '0/48', 'race winner': '0/48'}
Columns that match with input_schema: 
203_25.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_312.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
Coverage Score :  0.8333333333333334 	 Completeness Score :  0.8333333333333334


Unnamed: 0,phase,date,terrain,length,race winner
0,P,1 July,Individual time trial,6 km (3.7 mi),Eric Vanderaerden (BEL)
1,1,2 July,Plain stage,163 km (101 mi),Frits Pirard (NED)
2,2,3 July,Team time trial,100 km (62 mi),Mercier
3,3,4 July,Hilly stage,152 km (94 mi),Rudy Matthijs (BEL)
4,4,5 July,Plain stage,300 km (190 mi),Serge Demierre (SUI)
5,5,6 July,Plain stage,257 km (160 mi),Dominique Gaigne (FRA)
6,6,7 July,Individual time trial,58 km (36 mi),Bert Oosterbosch (NED)
7,7,8 July,Plain stage,216 km (134 mi),Riccardo Magrini (ITA)
8,8,9 July,Plain stage,222 km (138 mi),Bert Oosterbosch (NED)
9,9,10 July,Plain stage,207 km (129 mi),Philippe Chevallier (FRA)



20.csv : 203_25.csv		204_406.csv		
Missing Values(NANs):  {'phase': '24/47', 'date': '0/47', 'terrain': '0/47', 'length': '0/47', 'race winner': '0/47', 'leader': '23/47'}
Columns that match with input_schema: 
203_25.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_406.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'winner': 'race winner', 'race leader': 'leader'}
Coverage Score :  1.0 	 Completeness Score :  0.8333333333333334


Unnamed: 0,phase,date,terrain,length,race winner,leader
0,P,1 July,Individual time trial,6 km (3.7 mi),Eric Vanderaerden (BEL),
1,1,2 July,Plain stage,163 km (101 mi),Frits Pirard (NED),
2,2,3 July,Team time trial,100 km (62 mi),Mercier,
3,3,4 July,Hilly stage,152 km (94 mi),Rudy Matthijs (BEL),
4,4,5 July,Plain stage,300 km (190 mi),Serge Demierre (SUI),
5,5,6 July,Plain stage,257 km (160 mi),Dominique Gaigne (FRA),
6,6,7 July,Individual time trial,58 km (36 mi),Bert Oosterbosch (NED),
7,7,8 July,Plain stage,216 km (134 mi),Riccardo Magrini (ITA),
8,8,9 July,Plain stage,222 km (138 mi),Bert Oosterbosch (NED),
9,9,10 July,Plain stage,207 km (129 mi),Philippe Chevallier (FRA),



21.csv : 203_44.csv		203_475.csv		
Missing Values(NANs):  {'phase': '0/46', 'date': '0/46', 'terrain': '0/46', 'length': '0/46', 'race winner': '0/46'}
Columns that match with input_schema: 
203_44.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
203_475.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
Coverage Score :  0.8333333333333334 	 Completeness Score :  0.8333333333333334


Unnamed: 0,phase,date,terrain,length,race winner
0,1,22 June,Plain stage,215 km (134 mi),Edward Sels (BEL)
1,2,23 June,Plain stage,208 km (129 mi),Andre Darrigade (FRA)
2,3A,24 June,Plain stage,197 km (122 mi),Bernard Vandekerkhove (BEL)
3,3B,24 June,Team time trial,21 km (13 mi),Kas-Kaskol
4,4,25 June,Plain stage,292 km (181 mi),Rudi Altig (GER)
5,5,26 June,Plain stage,161 km (100 mi),Willy Derboven (BEL)
6,6,27 June,Plain stage,200 km (120 mi),Henk Nijdam (NED)
7,7,28 June,Plain stage,195 km (121 mi),Jan Janssen (NED)
8,8,29 June,Stage with mountain(s),249 km (155 mi),Federico Bahamontes (ESP)
9,9,30 June,Stage with mountain(s),239 km (149 mi),Jacques Anquetil (FRA)



22.csv : 203_44.csv		204_312.csv		
Missing Values(NANs):  {'phase': '0/50', 'date': '0/50', 'terrain': '0/50', 'length': '0/50', 'race winner': '0/50'}
Columns that match with input_schema: 
203_44.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_312.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
Coverage Score :  0.8333333333333334 	 Completeness Score :  0.8333333333333334


Unnamed: 0,phase,date,terrain,length,race winner
0,1,22 June,Plain stage,215 km (134 mi),Edward Sels (BEL)
1,2,23 June,Plain stage,208 km (129 mi),Andre Darrigade (FRA)
2,3A,24 June,Plain stage,197 km (122 mi),Bernard Vandekerkhove (BEL)
3,3B,24 June,Team time trial,21 km (13 mi),Kas-Kaskol
4,4,25 June,Plain stage,292 km (181 mi),Rudi Altig (GER)
5,5,26 June,Plain stage,161 km (100 mi),Willy Derboven (BEL)
6,6,27 June,Plain stage,200 km (120 mi),Henk Nijdam (NED)
7,7,28 June,Plain stage,195 km (121 mi),Jan Janssen (NED)
8,8,29 June,Stage with mountain(s),249 km (155 mi),Federico Bahamontes (ESP)
9,9,30 June,Stage with mountain(s),239 km (149 mi),Jacques Anquetil (FRA)



23.csv : 203_44.csv		204_406.csv		
Missing Values(NANs):  {'phase': '24/49', 'date': '0/49', 'terrain': '0/49', 'length': '0/49', 'race winner': '0/49', 'leader': '25/49'}
Columns that match with input_schema: 
203_44.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_406.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'winner': 'race winner', 'race leader': 'leader'}
Coverage Score :  1.0 	 Completeness Score :  0.8333333333333331


Unnamed: 0,phase,date,terrain,length,race winner,leader
0,1,22 June,Plain stage,215 km (134 mi),Edward Sels (BEL),
1,2,23 June,Plain stage,208 km (129 mi),Andre Darrigade (FRA),
2,3A,24 June,Plain stage,197 km (122 mi),Bernard Vandekerkhove (BEL),
3,3B,24 June,Team time trial,21 km (13 mi),Kas-Kaskol,
4,4,25 June,Plain stage,292 km (181 mi),Rudi Altig (GER),
5,5,26 June,Plain stage,161 km (100 mi),Willy Derboven (BEL),
6,6,27 June,Plain stage,200 km (120 mi),Henk Nijdam (NED),
7,7,28 June,Plain stage,195 km (121 mi),Jan Janssen (NED),
8,8,29 June,Stage with mountain(s),249 km (155 mi),Federico Bahamontes (ESP),
9,9,30 June,Stage with mountain(s),239 km (149 mi),Jacques Anquetil (FRA),



24.csv : 203_475.csv		204_312.csv		
Missing Values(NANs):  {'phase': '0/46', 'date': '0/46', 'terrain': '0/46', 'length': '0/46', 'race winner': '0/46'}
Columns that match with input_schema: 
203_475.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_312.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
Coverage Score :  0.8333333333333334 	 Completeness Score :  0.8333333333333334


Unnamed: 0,phase,date,terrain,length,race winner
0,P,1 July,Individual time trial,7.3 km (4.5 mi),Jacky Durand (FRA)
1,1,2 July,Plain stage,233.5 km (145.1 mi),Fabio Baldato (ITA)
2,2,3 July,Plain stage,235.5 km (146.3 mi),Mario Cipollini (ITA)
3,3,4 July,Team time trial,67.0 km (41.6 mi),Gewiss-Ballan (ITA)
4,4,5 July,Plain stage,162.0 km (100.7 mi),Mario Cipollini (ITA)
5,5,6 July,Plain stage,261.0 km (162.2 mi),Jeroen Blijlevens (NED)
6,6,7 July,Plain stage,202.0 km (125.5 mi),Erik Zabel (GER)
7,7,8 July,Hilly stage,203.0 km (126.1 mi),Johan Bruyneel (BEL)
8,8,9 July,Individual time trial,54.0 km (33.6 mi),Miguel Indurain (ESP)
9,9,11 July,Stage with mountain(s),160.0 km (99.4 mi),Alex Zulle (SUI)



25.csv : 203_475.csv		204_406.csv		
Missing Values(NANs):  {'phase': '24/45', 'date': '0/45', 'terrain': '0/45', 'length': '0/45', 'race winner': '0/45', 'leader': '21/45'}
Columns that match with input_schema: 
203_475.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_406.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'winner': 'race winner', 'race leader': 'leader'}
Coverage Score :  1.0 	 Completeness Score :  0.8333333333333334


Unnamed: 0,phase,date,terrain,length,race winner,leader
0,P,1 July,Individual time trial,7.3 km (4.5 mi),Jacky Durand (FRA),
1,1,2 July,Plain stage,233.5 km (145.1 mi),Fabio Baldato (ITA),
2,2,3 July,Plain stage,235.5 km (146.3 mi),Mario Cipollini (ITA),
3,3,4 July,Team time trial,67.0 km (41.6 mi),Gewiss-Ballan (ITA),
4,4,5 July,Plain stage,162.0 km (100.7 mi),Mario Cipollini (ITA),
5,5,6 July,Plain stage,261.0 km (162.2 mi),Jeroen Blijlevens (NED),
6,6,7 July,Plain stage,202.0 km (125.5 mi),Erik Zabel (GER),
7,7,8 July,Hilly stage,203.0 km (126.1 mi),Johan Bruyneel (BEL),
8,8,9 July,Individual time trial,54.0 km (33.6 mi),Miguel Indurain (ESP),
9,9,11 July,Stage with mountain(s),160.0 km (99.4 mi),Alex Zulle (SUI),



26.csv : 204_312.csv		204_406.csv		
Missing Values(NANs):  {'phase': '24/49', 'date': '0/49', 'terrain': '0/49', 'length': '0/49', 'race winner': '0/49', 'leader': '25/49'}
Columns that match with input_schema: 
204_312.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'stage': 'phase', 'winner': 'race winner'}
204_406.csv :  {'date': 'date', 'terrain': 'terrain', 'length': 'length', 'winner': 'race winner', 'race leader': 'leader'}
Coverage Score :  1.0 	 Completeness Score :  0.8333333333333331


Unnamed: 0,phase,date,terrain,length,race winner,leader
0,P,29 June,Individual time trial,5 km (3.1 mi),Jan Raas (NED),
1,1A,30 June,Plain stage,135 km (84 mi),Jan Raas (NED),
2,1B,30 June,Plain stage,100 km (62 mi),Walter Planckaert (BEL),
3,2,1 July,Plain stage,199 km (124 mi),Jacques Esclassan (FRA),
4,3,2 July,Plain stage,244 km (152 mi),Klaus-Peter Thaler (GER),
5,4,3 July,Team time trial,153 km (95 mi),TI-Raleigh,
6,5,4 July,Plain stage,244 km (152 mi),Freddy Maertens (BEL),
7,6,5 July,Plain stage,162 km (101 mi),Sean Kelly (IRE),
8,7,6 July,Plain stage,242 km (150 mi),Freddy Maertens (BEL),
9,8,7 July,Individual time trial,59 km (37 mi),Bernard Hinault (FRA),




Rank  1   :  output_1.csv         completeness score : 0.8333333333333335 		number of rows:  118
Rank  2   :  output_4.csv         completeness score : 0.8333333333333334 		number of rows:  97
Rank  3   :  output_6.csv         completeness score : 0.8333333333333334 		number of rows:  95
Rank  4   :  output_2.csv         completeness score : 0.8333333333333334 		number of rows:  94
Rank  5   :  output_3.csv         completeness score : 0.8333333333333334 		number of rows:  93
Rank  6   :  output_5.csv         completeness score : 0.8333333333333334 		number of rows:  93
Rank  7   :  output_15.csv        completeness score : 0.8333333333333334 		number of rows:  74
Rank  8   :  output_8.csv         completeness score : 0.8333333333333334 		number of rows:  73
Rank  9   :  output_13.csv        completeness score : 0.8333333333333334 		number of rows:  71
Rank  10  :  output_7.csv         completeness score : 0.8333333333333334 		number of rows:  69
Rank  11  :  output_10.csv        com

In [88]:
s='{"schema": {"team 1": "object", "team 2": "object"}, "categories": ["football"]}'
s=s.lower()
s=json.loads(s)
input_schema=s['schema'];input_categories=-1;transformations=-1;stats=-1
if "categories" in s:
    input_categories=s["categories"]
if "transformations" in s:
    transformations=s["transformations"]
if "stats" in s:
    stats=s["stats"]
# display query
print("Input Schema : ",input_schema)
if input_categories==-1:
    print("Input Categories : None")
else:
    print("Input Categories : ",input_categories)
if transformations==-1:
    print("Input Transformations : None")
else:
    print("Input Transformations : ",transformations)
if stats==-1:
    print("Input Condition based conditions : None")
else:
    print("Input Content based Conditions : ",stats)
print()
comp_score={}
no_of_rows={}
no_of_rows={}
get_matches()

Input Schema :  {'team 1': 'object', 'team 2': 'object'}
Input Categories :  ['football']
Input Transformations : None
Input Condition based conditions : None

category and schema
Matching Tables
203_414.csv  :  2008-09 Copa del Rey
203_526.csv  :  2006 UEFA European Under-21 Championship qualification
204_405.csv  :  2011-12 Taca da Liga
204_510.csv  :  2010-11 UEFA Europa League
204_702.csv  :  1977-78 Copa del Rey


203_414.csv(2008-09 Copa del Rey)
Missing Values(NANs score):  {'team 1': '0/16', 'team 2': '0/16'}
Columns that match with input_schema:
 203_414.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
Coverage Score :  1.0 	 Completeness Score :  1.0



Unnamed: 0,team 1,team 2
0,Real Union,Real Madrid
1,Portugalete,Valencia
2,Ponferradina,Sevilla
3,Orihuela,Atletico
4,Poli Ejido,Villarreal
5,Hercules,Valladolid
6,Rayo Vallecano,Almeria
7,Celta,Espanyol
8,Castellon,Betis
9,Real Murcia,Racing


203_526.csv(2006 UEFA European Under-21 Championship qualification)
Missing Values(NANs score):  {'team 1': '0/8', 'team 2': '0/8'}
Columns that match with input_schema:
 203_526.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
Coverage Score :  1.0 	 Completeness Score :  1.0



Unnamed: 0,team 1,team 2
0,England,France
1,Czech Republic,Germany
2,Hungary,Italy
3,Serbia and Montenegro,Croatia
4,Ukraine,Belgium
5,Russia,Denmark
6,Switzerland,Portugal
7,Slovenia,Netherlands


204_405.csv(2011-12 Taca da Liga)
Missing Values(NANs score):  {'team 1': '0/8', 'team 2': '0/8'}
Columns that match with input_schema:
 204_405.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
Coverage Score :  1.0 	 Completeness Score :  1.0



Unnamed: 0,team 1,team 2
0,União da Madeira,Maritimo
1,Penafiel,Academica
2,Santa Clara,União de Leiria
3,Naval,Vitoria de Setúbal
4,Moreirense,Beira-Mar
5,Portimonense,Feirense
6,Estoril,Olhanense
7,Belenenses,Gil Vicente


204_510.csv(2010-11 UEFA Europa League)
Missing Values(NANs score):  {'team 1': '0/8', 'team 2': '0/8'}
Columns that match with input_schema:
 204_510.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
Coverage Score :  1.0 	 Completeness Score :  1.0



Unnamed: 0,team 1,team 2
0,Benfica,Paris Saint-Germain
1,Dynamo Kyiv,Manchester City
2,Twente,Zenit Saint Petersburg
3,CSKA Moscow,Porto
4,PSV Eindhoven,Rangers
5,Bayer Leverkusen,Villarreal
6,Ajax,Spartak Moscow
7,Braga,Liverpool


204_702.csv(1977-78 Copa del Rey)
Missing Values(NANs score):  {'team 1': '0/42', 'team 2': '0/42'}
Columns that match with input_schema:
 204_702.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
Coverage Score :  1.0 	 Completeness Score :  1.0



Unnamed: 0,team 1,team 2
0,Valladolid Promesas,Elche
1,Pena Sport,Valladolid
2,Algeciras,Real Madrid
3,Real Madrid Castilla,Sabadell
4,Leganes,Eldense
5,Ciempozuelos,Lleida
6,Alcalá,Valencia
7,Burgos,Huesca
8,Ceuta,Espanyol
9,Atletico Baleares,Cádiz



1.csv : 204_405.csv(2011-12 Taca da Liga)		204_510.csv(2010-11 UEFA Europa League)		204_702.csv(1977-78 Copa del Rey)		
Missing Values(NANs):  {'team 1': '0/58', 'team 2': '0/58'}
Columns that match with input_schema: 
204_405.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
204_510.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
204_702.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
Coverage Score :  1.0 	 Completeness Score :  1.0


Unnamed: 0,team 1,team 2
0,União da Madeira,Maritimo
1,Penafiel,Academica
2,Santa Clara,União de Leiria
3,Naval,Vitoria de Setúbal
4,Moreirense,Beira-Mar
5,Portimonense,Feirense
6,Estoril,Olhanense
7,Belenenses,Gil Vicente
8,Benfica,Paris Saint-Germain
9,Dynamo Kyiv,Manchester City



2.csv : 203_414.csv(2008-09 Copa del Rey)		204_702.csv(1977-78 Copa del Rey)		
Missing Values(NANs):  {'team 1': '0/58', 'team 2': '0/58'}
Columns that match with input_schema: 
203_414.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
204_702.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
Coverage Score :  1.0 	 Completeness Score :  1.0


Unnamed: 0,team 1,team 2
0,Real Union,Real Madrid
1,Portugalete,Valencia
2,Ponferradina,Sevilla
3,Orihuela,Atletico
4,Poli Ejido,Villarreal
5,Hercules,Valladolid
6,Rayo Vallecano,Almeria
7,Celta,Espanyol
8,Castellon,Betis
9,Real Murcia,Racing



3.csv : 203_526.csv(2006 UEFA European Under-21 Championship qualification)		204_510.csv(2010-11 UEFA Europa League)		
Missing Values(NANs):  {'team 1': '0/16', 'team 2': '0/16'}
Columns that match with input_schema: 
203_526.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
204_510.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
Coverage Score :  1.0 	 Completeness Score :  1.0


Unnamed: 0,team 1,team 2
0,England,France
1,Czech Republic,Germany
2,Hungary,Italy
3,Serbia and Montenegro,Croatia
4,Ukraine,Belgium
5,Russia,Denmark
6,Switzerland,Portugal
7,Slovenia,Netherlands
8,Benfica,Paris Saint-Germain
9,Dynamo Kyiv,Manchester City



4.csv : 204_405.csv(2011-12 Taca da Liga)		204_510.csv(2010-11 UEFA Europa League)		
Missing Values(NANs):  {'team 1': '0/16', 'team 2': '0/16'}
Columns that match with input_schema: 
204_405.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
204_510.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
Coverage Score :  1.0 	 Completeness Score :  1.0


Unnamed: 0,team 1,team 2
0,União da Madeira,Maritimo
1,Penafiel,Academica
2,Santa Clara,União de Leiria
3,Naval,Vitoria de Setúbal
4,Moreirense,Beira-Mar
5,Portimonense,Feirense
6,Estoril,Olhanense
7,Belenenses,Gil Vicente
8,Benfica,Paris Saint-Germain
9,Dynamo Kyiv,Manchester City



5.csv : 204_405.csv(2011-12 Taca da Liga)		204_702.csv(1977-78 Copa del Rey)		
Missing Values(NANs):  {'team 1': '0/50', 'team 2': '0/50'}
Columns that match with input_schema: 
204_405.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
204_702.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
Coverage Score :  1.0 	 Completeness Score :  1.0


Unnamed: 0,team 1,team 2
0,União da Madeira,Maritimo
1,Penafiel,Academica
2,Santa Clara,União de Leiria
3,Naval,Vitoria de Setúbal
4,Moreirense,Beira-Mar
5,Portimonense,Feirense
6,Estoril,Olhanense
7,Belenenses,Gil Vicente
8,Valladolid Promesas,Elche
9,Pena Sport,Valladolid



6.csv : 204_510.csv(2010-11 UEFA Europa League)		204_702.csv(1977-78 Copa del Rey)		
Missing Values(NANs):  {'team 1': '0/50', 'team 2': '0/50'}
Columns that match with input_schema: 
204_510.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
204_702.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
Coverage Score :  1.0 	 Completeness Score :  1.0


Unnamed: 0,team 1,team 2
0,Benfica,Paris Saint-Germain
1,Dynamo Kyiv,Manchester City
2,Twente,Zenit Saint Petersburg
3,CSKA Moscow,Porto
4,PSV Eindhoven,Rangers
5,Bayer Leverkusen,Villarreal
6,Ajax,Spartak Moscow
7,Braga,Liverpool
8,Valladolid Promesas,Elche
9,Pena Sport,Valladolid




Rank  1   :  output_1.csv         completeness score : 1.0000000000000000 		number of rows:  58
Rank  2   :  output_2.csv         completeness score : 1.0000000000000000 		number of rows:  58
Rank  3   :  output_5.csv         completeness score : 1.0000000000000000 		number of rows:  50
Rank  4   :  output_6.csv         completeness score : 1.0000000000000000 		number of rows:  50
Rank  5   :  204_702.csv          completeness score : 1.0000000000000000 		number of rows:  42
Rank  6   :  203_414.csv          completeness score : 1.0000000000000000 		number of rows:  16
Rank  7   :  output_3.csv         completeness score : 1.0000000000000000 		number of rows:  16
Rank  8   :  output_4.csv         completeness score : 1.0000000000000000 		number of rows:  16
Rank  9   :  203_526.csv          completeness score : 1.0000000000000000 		number of rows:  8
Rank  10  :  204_405.csv          completeness score : 1.0000000000000000 		number of rows:  8
Rank  11  :  204_510.csv          comple

In [83]:
s='{"schema": {"team 1": "object", "team 2": "object"}, "categories": ["copa", "cups", "rey", "football", "del", "spanish"]}'
s=s.lower()
s=json.loads(s)
input_schema=s['schema'];input_categories=-1;transformations=-1;stats=-1
if "categories" in s:
    input_categories=s["categories"]
if "transformations" in s:
    transformations=s["transformations"]
if "stats" in s:
    stats=s["stats"]
# display query
print("Input Schema : ",input_schema)
if input_categories==-1:
    print("Input Categories : None")
else:
    print("Input Categories : ",input_categories)
if transformations==-1:
    print("Input Transformations : None")
else:
    print("Input Transformations : ",transformations)
if stats==-1:
    print("Input Condition based conditions : None")
else:
    print("Input Content based Conditions : ",stats)
print()
comp_score={}
no_of_rows={}
no_of_rows={}
get_matches()

Input Schema :  {'team 1': 'object', 'team 2': 'object'}
Input Categories :  ['copa', 'cups', 'rey', 'football', 'del', 'spanish']
Input Transformations : None
Input Condition based conditions : None

category and schema
Matching Tables
203_414.csv  :  2008-09 Copa del Rey
204_702.csv  :  1977-78 Copa del Rey


203_414.csv(2008-09 Copa del Rey)
Missing Values(NANs score):  {'team 1': '0/16', 'team 2': '0/16'}
Columns that match with input_schema:
 203_414.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
Coverage Score :  1.0 	 Completeness Score :  1.0



Unnamed: 0,team 1,team 2
0,Real Union,Real Madrid
1,Portugalete,Valencia
2,Ponferradina,Sevilla
3,Orihuela,Atletico
4,Poli Ejido,Villarreal
5,Hercules,Valladolid
6,Rayo Vallecano,Almeria
7,Celta,Espanyol
8,Castellon,Betis
9,Real Murcia,Racing


204_702.csv(1977-78 Copa del Rey)
Missing Values(NANs score):  {'team 1': '0/42', 'team 2': '0/42'}
Columns that match with input_schema:
 204_702.csv :  {'team 1': 'team 1', 'team 2': 'team 2'}
Coverage Score :  1.0 	 Completeness Score :  1.0



Unnamed: 0,team 1,team 2
0,Valladolid Promesas,Elche
1,Pena Sport,Valladolid
2,Algeciras,Real Madrid
3,Real Madrid Castilla,Sabadell
4,Leganes,Eldense
5,Ciempozuelos,Lleida
6,Alcalá,Valencia
7,Burgos,Huesca
8,Ceuta,Espanyol
9,Atletico Baleares,Cádiz





Unnamed: 0,team 1,team 2
0,Real Union,Real Madrid
1,Portugalete,Valencia
2,Ponferradina,Sevilla
3,Orihuela,Atletico
4,Poli Ejido,Villarreal
5,Hercules,Valladolid
6,Rayo Vallecano,Almeria
7,Celta,Espanyol
8,Castellon,Betis
9,Real Murcia,Racing



Rank  1   :  output_0.csv         completeness score : 1.0000000000000000 		number of rows:  58
Rank  2   :  204_702.csv          completeness score : 1.0000000000000000 		number of rows:  42
Rank  3   :  203_414.csv          completeness score : 1.0000000000000000 		number of rows:  16

Ranking Complete!!



In [99]:
#     , "categories": ["classes", "destroyers", "j", "destroyer", "k", "nclass"]}
s='{"schema": {"name": "object", "builder": "object", "launched": "object", "completed": "object", "fate": "object"}}'
s=s.lower()
s=json.loads(s)
input_schema=s['schema'];input_categories=-1;transformations=-1;stats=-1
if "categories" in s:
    input_categories=s["categories"]
if "transformations" in s:
    transformations=s["transformations"]
if "stats" in s:
    stats=s["stats"]
# display query
print("Input Schema : ",input_schema)
if input_categories==-1:
    print("Input Categories : None")
else:
    print("Input Categories : ",input_categories)
if transformations==-1:
    print("Input Transformations : None")
else:
    print("Input Transformations : ",transformations)
if stats==-1:
    print("Input Condition based conditions : None")
else:
    print("Input Content based Conditions : ",stats)
print()
comp_score={}
no_of_rows={}
no_of_rows={}
get_matches()

Input Schema :  {'name': 'object', 'builder': 'object', 'launched': 'object', 'completed': 'object', 'fate': 'object'}
Input Categories : None
Input Transformations : None
Input Condition based conditions : None

only schema
Matching Tables
203_404.csv  :  J, K and N-class destroyer
204_522.csv  :  Ha-201-class submarine


203_404.csv(J, K and N-class destroyer)
Missing Values(NANs score):  {'pennant number': '0/9', 'name': '0/9', 'builder': '0/9', 'laid down': '0/9', 'launched': '0/9', 'completed': '0/9', 'fate': '0/9'}
Columns that match with input_schema:
 203_404.csv :  {'name': 'name', 'builder': 'builder', 'launched': 'launched', 'completed': 'completed', 'fate': 'fate'}
Coverage Score :  1.0 	 Completeness Score :  1.0



Unnamed: 0,pennant number,name,builder,laid down,launched,completed,fate
0,F00,Jervis ‡,"Hawthorn Leslie & Company, Hebburn",26 August 1937,9 September 1938,12 May 1939,Sold for scrap 1949
1,F22,Jackal,"John Brown & Company, Clydebank",24 September 1937,25 October 1938,31 March 1939,Bombed off Mersa Matruh on 11 May 1942 and scu...
2,F34,Jaguar,"William Denny & Brothers, Dumbarton",25 November 1937,22 November 1938,12 September 1939,"Torpedoed by German U-boat U.652 off Sollum, 2..."
3,F46,Juno (ex-Jamaica),"Fairfield Shipbuilding & Engineering Company, ...",15 October 1937,8 December 1938,25 August 1939,"Bombed and sunk south of Crete, 21 May 1941"
4,F53,Janus,"Swan Hunter & Wigham Richardson, Wallsend",29 September 1937,11 October 1938,5 August 1939,Torpedoed and sunk off Anzio by German aircraf...
5,F61,Javelin (ex-Kashmir),John Brown,11 October 1937,21 December 1938,10 June 1939,Sold for scrap 1949
6,F72,Jersey,"J. Samuel White, Cowes",1937,26 September 1938,28 April 1939,"Mined off Valletta 2 May 1941, broke in two an..."
7,F85,Jupiter,"Yarrow & Company, Scotstoun",28 September 1937,27 October 1938,25 June 1939,Hit a Dutch mine during the battle of the Java...
8,-,Jubilant,-,-,-,-,"Ordered March 1937, cancelled December 1937"


204_522.csv(Ha-201-class submarine)
Missing Values(NANs score):  {'boat num': '0/44', 'name': '2/44', 'builder': '2/44', 'laid down': '2/44', 'launched': '22/44', 'completed': '34/44', 'fate': '0/44'}
Columns that match with input_schema:
 204_522.csv :  {'name': 'name', 'builder': 'builder', 'launched': 'launched', 'completed': 'completed', 'fate': 'fate'}
Coverage Score :  1.0 	 Completeness Score :  0.7272727272727273



Unnamed: 0,boat num,name,builder,laid down,launched,completed,fate
0,4911,Ha-201,Sasebo Naval Arsenal,01-03-1945,23-04-1945,31-05-1945,Decommissioned 30-11-1945. Scuttled off Goto I...
1,4912,Ha-202,Sasebo Naval Arsenal,01-03-1945,23-04-1945,31-05-1945,Decommissioned 30-11-1945. Scuttled off Goto I...
2,4913,Ha-203,Sasebo Naval Arsenal,05-04-1945,25-05-1945,26-06-1945,Decommissioned 30-11-1945. Scuttled off Goto I...
3,4914,Ha-204,Sasebo Naval Arsenal,05-04-1945,01-06-1945,25-06-1945,Decommissioned 30-11-1945. Grounded at Aburats...
4,4915,Ha-205,Sasebo Naval Arsenal,17-04-1945,14-05-1945,03-07-1945,Decommissioned 30-11-1945. Scuttled at Iyo Nad...
5,4916,Ha-206,Kawasaki-Senshū Shipyard,19-03-1945,10-07-1945,,"Incomplete until the end of war (95%), sunk by..."
6,4917,Ha-207,Sasebo Naval Arsenal,23-04-1945,26-05-1945,14-08-1945,Decommissioned 30-11-1945. Scuttled off Sasebo...
7,4918,Ha-208,Sasebo Naval Arsenal,01-05-1945,26-05-1945,04-08-1945,Decommissioned 30-11-1945. Scuttled off Goto I...
8,4919,Ha-209,Sasebo Naval Arsenal,07-05-1945,31-05-1945,04-08-1945,Decommissioned 30-11-1945. Scrapped August 1946.
9,4920,Ha-210,Sasebo Naval Arsenal,14-05-1945,10-06-1945,11-08-1945,Decommissioned 30-11-1945. Scuttled off Sasebo...





KeyError: '204_522.csv : 203_404.csv'