In [1]:
import json
import math
import pandas as pd
import numpy as np
from nltk.corpus import wordnet
from IPython.display import display

In [2]:
#preloading all schemas and categories
all_schemas={}
all_categories={}
with open("final_schema.txt") as ip_file:
    for line in ip_file.readlines():
        json_obj=json.loads(line)
        all_schemas[json_obj["filename"]]=json_obj["schema"]
        all_categories[json_obj["filename"]]=json_obj["categories"]
        
#preloading the candidate keys
with open("Candidate_key_dict.txt",'r') as ip_file:
    cand_key=json.load(ip_file)

#preloading column and category similarity values of tables
with open("cos_similarity.txt",'r') as ip_file:
    json_object=json.load(ip_file)
col_sim = json_object["column_similarity"]
cat_sim = json_object["category_similarity"]

In [3]:
# all functions needed to generate ontologies
def get_synonyms(word):
    meanings=set()
    for synset in wordnet.synsets(word,pos=wordnet.NOUN):
        for lemma in synset.lemmas():
            meanings.add(lemma.name())
    for synset in wordnet.synsets(word,pos=wordnet.NOUN):
        for hypernym in synset.hypernyms():
            meanings.add(hypernym.lemma_names()[0])
    meanings.add(word)
    return list(meanings)

def generate_list_ontology(list1):
    ontology={}
    for word in list1:
        ontology[word]=get_synonyms(word)
    return ontology

def generate_schema_ontology(input_schema):
    ontology={}
    for col in input_schema:
        ontology[col]=get_synonyms(col)
    return ontology

In [4]:
# to generate cos similarity between two lists
def cos_sim(list1, list2):
    terms = set(list1).union(list2)
    intersect = set(list1) & set(list2)
    others = (set(list1)-intersect).union(set(list2)-intersect)
    product=0
    for word in terms:
        if word in intersect:
            product+=1
    l1mag = math.sqrt(len(list1))
    l2mag = math.sqrt(len(list2))
    if len(list1)==0 or len(list2)==0:
        return 0.0
    else:
        return product / (l1mag * l2mag)

In [5]:
# if input has only schema(columns and their dataypes)
def col_only_list(input_schema,input_sch_onto):
    possible_tables={}
    for file in all_schemas:
        schema=all_schemas[file]
        for col,d_type in schema.items():
            if (col in input_schema) and (input_schema[col]==d_type):
                if file in possible_tables:
                    possible_tables[file].append(col)
                else:
                    possible_tables[file]=[]
                    possible_tables[file].append(col)
            else:
                for a in input_sch_onto:
                    if (col in input_sch_onto[a]) and (input_schema[a]==d_type):
                        if file in possible_tables:
                            possible_tables[file].append(a)
                        else:
                            possible_tables[file]=[]
                            possible_tables[file].append(a)
    return possible_tables

In [6]:
# if input has categories as well as schema(columns and their dataypes)
# we consider it a match under the assumption that at least 75% category match exists
def cat_and_col_list(input_categories,input_cat_onto,input_schema,input_sch_onto):
    possible_tables={}
    for file in all_categories:
        cat_list=[]
        category=all_categories[file]
        for cat in category:
            if cat in input_categories:
                cat_list.append(cat)
            else:
                for cat1 in input_cat_onto:
                    if cat in input_cat_onto[cat1]:
                        cat_list.append(cat1)
        cos_val=cos_sim(cat_list,input_categories)
        if cos_val > 0.75 :
            schema=all_schemas[file]
            for col,d_type in schema.items():
                if (col in input_schema) and (input_schema[col]==d_type):
                    if file in possible_tables:
                        possible_tables[file].append(col)
                    else:
                        possible_tables[file]=[]
                        possible_tables[file].append(col)
                else:
                    for a in input_sch_onto:
                        if (col in input_sch_onto[a]) and (input_schema[a]==d_type):
                            if file in possible_tables:
                                possible_tables[file].append(a)
                            else:
                                possible_tables[file]=[]
                                possible_tables[file].append(a)
    return possible_tables

In [7]:
#For the given input file names , find the columns that match if any.
#If no colmns match , then don't do anything.
#If columns match , then check if the columns are present in the list of candidate keys.
#If not present in list of candidate keys , just check data.
#If present in list of candidate keys and data of those columns match , append the others columns to any of the table to extend the table schema.
#If present in list of candidate keys but data dont match , then increse volume by adding the rows to one of the table , but will create a lot of NULL values. Undesirable.
#If all the columns match , then to each table just add another column with the table name and append the two tables.

def merge(fname1,fname2):
    matching_columns = {}  #resultant list containing the matching columns.
    #getting schema for tables.
    f1 = all_schemas[fname1]
    f2 = all_schemas[fname2]
    #Retrieving the column names and generating the ontologies for one of the tables columns
    f1_cols = list(f1)
    f1_onto = generate_list_ontology(f1_cols)
    #finding columns that match.     
    for col , d_type in f2.items():
        if (col in f1_cols) and (d_type==f1[col]):
            matching_columns[col] = col
        else:
            for col_t1 in f1_onto:
                if (col in f1_onto[col_t1]) and (d_type==f1[col_t1]):
                    matching_columns[col_t1] = col
                    break
    #Now the varaible matching_columns contains a list of names of columns that match between the two tables.
    t1 = cand_key[fname1]
    t2 = cand_key[fname2]
    mat_cols={}
    for key , value in matching_columns.items():
        if(key in t1 or value in t2):
            mat_cols[key]=value
    return merge_tables(fname1,fname2,mat_cols)

In [8]:
def merge_tables(fname1,fname2,cols):
    t1 = pd.read_csv(fname1)
    t2 = pd.read_csv(fname2)
    display(t1)
    display(t2)
    print(fname1+' and '+fname2+' gives : ')
    l=len(cols)
    if l!=0:
        for name1,name2 in cols.items():
            t2.rename(columns = {name2:name1},inplace=True)
    t3=t1.merge(t2,how='outer')
    display(t3)
    return t3

In [10]:
def check_possible_matches():
    with open("input1.txt",'r') as ip_file:
        json_object=json.load(ip_file)
    input_schema=json_object["schema"]
    input_sch_onto=generate_schema_ontology(input_schema)
    if "categories" in json_object:
        print('category and schema')
        input_categories=json_object["categories"]
        input_cat_onto=generate_list_ontology(input_categories)
        possible_tables=cat_and_col_list(input_categories,input_cat_onto,input_schema,input_sch_onto)
    else:
        print('only schema')
        possible_tables=col_only_list(input_schema,input_sch_onto)
    print("possible tables: ",possible_tables)
    matching_tables=[]
    for i in possible_tables:
        cos_val=cos_sim(possible_tables[i],list(input_schema))
        if cos_val>0.75:
            matching_tables.append(i)
    return matching_tables

In [11]:
def get_matches():
    matching_tables=check_possible_matches()
    op_str1='output_folder1/'
    op_str2='.csv'
    x=len(matching_tables)
    print(matching_tables)
    if x==0:
        with open("output_folder1/results.txt",'a') as f:
            print("NO MATCHES FOUND",file=f)
    elif x==1:
        with open("output_folder1/results.txt",'a') as f:
            print(matching_tables[0],file=f)
    else:
        ctr=0
        for i in range(x-1):
            for j in range(i+1,x):
                a=matching_tables[i]+' : '+matching_tables[j]
                b=matching_tables[j]+' : '+matching_tables[i]
                if (a in cat_sim):
                    if cat_sim[a]>=.50 and col_sim[a]>=0.50:
                        res=merge(matching_tables[i],matching_tables[j])
                        ctr+=1
                        op_string=op_str1+str(ctr)+op_str2
                        res.to_csv(op_string,sep=',', index=False)
                        with open("output_folder1/results.txt",'a') as f:
                            print(matching_tables[i]+' and '+matching_tables[j],file=f)
                elif (b in cat_sim):
                    if cat_sim[b]>=.50 and col_sim[b]>=0.50:
                        res=merge(matching_tables[i],matching_tables[j])
                        ctr+=1
                        op_string=op_str1+str(ctr)+op_str2
                        res.to_csv(op_string,sep=',', index=False)
                        with open("output_folder1/results.txt",'a') as f:
                            print(matching_tables[i]+' and '+matching_tables[j],file=f)
                else:
                    pass
        with open("output_folder1/results.txt",'a') as f:
            for i in matching_tables:
                print(i,file=f)

In [12]:
get_matches()

category and schema
possible tables:  {'201_26.csv': ['Club'], '203_125.csv': ['Club'], '203_242.csv': ['Season', 'Competition', 'Round', 'Club'], '203_269.csv': ['Season', 'Competition', 'Round', 'Club', 'Home'], '203_5.csv': ['Competition', 'Round', 'Home'], '203_741.csv': ['Season', 'Competition', 'Round', 'Home'], '203_776.csv': ['Season', 'Competition', 'Round', 'Club', 'Home'], '203_801.csv': ['Season'], '203_843.csv': ['Competition'], '204_542.csv': ['Season', 'Competition', 'Round', 'Club', 'Home'], '204_790.csv': ['Round'], '204_885.csv': ['Season', 'Competition', 'Round', 'Home'], '204_985.csv': ['Competition', 'Round', 'Club'], '204_99.csv': ['Club']}
['203_242.csv', '203_269.csv', '203_5.csv', '203_741.csv', '203_776.csv', '204_542.csv', '204_885.csv', '204_985.csv']


Unnamed: 0,Season,Competition,Round,Club,1st Leg,2nd Leg
0,,UEFA Cup,QR,CSKA Sofia,,
1,2003.0,UEFA Intertoto Cup,1R,Omagh Town,,
2,2003.0,UEFA Intertoto Cup,2R,Cibalia,,
3,,UEFA Cup,1Q,Nistru Otaci,,
4,,UEFA Champions League,1Q,Široki Brijeg,,
5,2007.0,UEFA Intertoto Cup,1R,Ararat Yerevan,,
6,2007.0,UEFA Intertoto Cup,2R,Chornomorets Odesa,,
7,2008.0,UEFA Intertoto Cup,1R,Cracovia Krakow,,
8,2008.0,UEFA Intertoto Cup,2R,Sturm Graz,,
9,,UEFA Europa League,2Q,Ventspils,,


Unnamed: 0,Season,Competition,Round,Club,Home,Away,Aggregate
0,,UEFA Cup Winners' Cup,Qualifying round,Kotayk Abovian,,,
1,,UEFA Cup Winners' Cup,First round,Barcelona,,,
2,,UEFA Cup,Second qualifying round,Maccabi Petah Tikva,,,
3,,UEFA Europa League,Second qualifying round,Floriana,,,
4,,UEFA Europa League,Third qualifying round,Mladá Boleslav,,,
5,,UEFA Europa League,Play-off round,Rosenborg,,,
6,,UEFA Europa League,Group stage (J),Maccabi Haifa,,,4th place
7,,UEFA Europa League,Group stage (J),Steaua Bucureşti,,,4th place
8,,UEFA Europa League,Group stage (J),Schalke 04,,,4th place


203_242.csv and 203_269.csv gives : 


Unnamed: 0,Season,Competition,Round,Club,1st Leg,2nd Leg,Home,Away,Aggregate
0,,UEFA Cup,QR,CSKA Sofia,,,,,
1,2003.0,UEFA Intertoto Cup,1R,Omagh Town,,,,,
2,2003.0,UEFA Intertoto Cup,2R,Cibalia,,,,,
3,,UEFA Cup,1Q,Nistru Otaci,,,,,
4,,UEFA Champions League,1Q,Široki Brijeg,,,,,
5,2007.0,UEFA Intertoto Cup,1R,Ararat Yerevan,,,,,
6,2007.0,UEFA Intertoto Cup,2R,Chornomorets Odesa,,,,,
7,2008.0,UEFA Intertoto Cup,1R,Cracovia Krakow,,,,,
8,2008.0,UEFA Intertoto Cup,2R,Sturm Graz,,,,,
9,,UEFA Europa League,2Q,Ventspils,,,,,


Unnamed: 0,Season,Competition,Round,Club,1st Leg,2nd Leg
0,,UEFA Cup,QR,CSKA Sofia,,
1,2003.0,UEFA Intertoto Cup,1R,Omagh Town,,
2,2003.0,UEFA Intertoto Cup,2R,Cibalia,,
3,,UEFA Cup,1Q,Nistru Otaci,,
4,,UEFA Champions League,1Q,Široki Brijeg,,
5,2007.0,UEFA Intertoto Cup,1R,Ararat Yerevan,,
6,2007.0,UEFA Intertoto Cup,2R,Chornomorets Odesa,,
7,2008.0,UEFA Intertoto Cup,1R,Cracovia Krakow,,
8,2008.0,UEFA Intertoto Cup,2R,Sturm Graz,,
9,,UEFA Europa League,2Q,Ventspils,,


Unnamed: 0,Season,Competition,Round,Club,Home,Away
0,,UEFA Cup Winners' Cup,1R,HJK Helsinki,,
1,,UEFA Cup,1R,FC Barcelona,,
2,,UEFA Cup,1R,FK Partizan Beograd,,
3,,UEFA Cup,2R,Wismut Aue,,
4,,UEFA Cup,1/16,FC Barcelona,,
5,,UEFA Cup Winners' Cup,1R,Lech Poznań,,
6,,UEFA Cup Winners' Cup,1R,Olympiacos Piraeus,,
7,,UEFA European Cup,1R,IFK Göteborg,,
8,,UEFA Cup Winners' Cup,QR,Humenne,,
9,,UEFA Europa League,2QR,Motherwell,,


203_242.csv and 203_776.csv gives : 


Unnamed: 0,Season,Competition,Round,Club,1st Leg,2nd Leg,Home,Away
0,,UEFA Cup,QR,CSKA Sofia,,,,
1,2003.0,UEFA Intertoto Cup,1R,Omagh Town,,,,
2,2003.0,UEFA Intertoto Cup,2R,Cibalia,,,,
3,,UEFA Cup,1Q,Nistru Otaci,,,,
4,,UEFA Champions League,1Q,Široki Brijeg,,,,
5,2007.0,UEFA Intertoto Cup,1R,Ararat Yerevan,,,,
6,2007.0,UEFA Intertoto Cup,2R,Chornomorets Odesa,,,,
7,2008.0,UEFA Intertoto Cup,1R,Cracovia Krakow,,,,
8,2008.0,UEFA Intertoto Cup,2R,Sturm Graz,,,,
9,,UEFA Europa League,2Q,Ventspils,,,,


Unnamed: 0,Season,Competition,Round,Club,1st Leg,2nd Leg
0,,UEFA Cup,QR,CSKA Sofia,,
1,2003.0,UEFA Intertoto Cup,1R,Omagh Town,,
2,2003.0,UEFA Intertoto Cup,2R,Cibalia,,
3,,UEFA Cup,1Q,Nistru Otaci,,
4,,UEFA Champions League,1Q,Široki Brijeg,,
5,2007.0,UEFA Intertoto Cup,1R,Ararat Yerevan,,
6,2007.0,UEFA Intertoto Cup,2R,Chornomorets Odesa,,
7,2008.0,UEFA Intertoto Cup,1R,Cracovia Krakow,,
8,2008.0,UEFA Intertoto Cup,2R,Sturm Graz,,
9,,UEFA Europa League,2Q,Ventspils,,


Unnamed: 0,Season,Competition,Round,Club,Home,Away,Aggregate
0,,UEFA Europa League,Q3,Motherwell,,,
1,,UEFA Europa League,Q1,Neath,,,
2,,UEFA Europa League,Q2,Ferencváros,,,
3,,UEFA Europa League,Q3,Elfsborg,,,
4,,UEFA Europa League,Play-off,AZ,,,
5,,UEFA Europa League,Q2,Tirana,,,
6,,UEFA Europa League,Q3,APOEL,,,


203_242.csv and 204_542.csv gives : 


Unnamed: 0,Season,Competition,Round,Club,1st Leg,2nd Leg,Home,Away,Aggregate
0,,UEFA Cup,QR,CSKA Sofia,,,,,
1,2003.0,UEFA Intertoto Cup,1R,Omagh Town,,,,,
2,2003.0,UEFA Intertoto Cup,2R,Cibalia,,,,,
3,,UEFA Cup,1Q,Nistru Otaci,,,,,
4,,UEFA Champions League,1Q,Široki Brijeg,,,,,
5,2007.0,UEFA Intertoto Cup,1R,Ararat Yerevan,,,,,
6,2007.0,UEFA Intertoto Cup,2R,Chornomorets Odesa,,,,,
7,2008.0,UEFA Intertoto Cup,1R,Cracovia Krakow,,,,,
8,2008.0,UEFA Intertoto Cup,2R,Sturm Graz,,,,,
9,,UEFA Europa League,2Q,Ventspils,,,,,


Unnamed: 0,Season,Competition,Round,Club,1st Leg,2nd Leg
0,,UEFA Cup,QR,CSKA Sofia,,
1,2003.0,UEFA Intertoto Cup,1R,Omagh Town,,
2,2003.0,UEFA Intertoto Cup,2R,Cibalia,,
3,,UEFA Cup,1Q,Nistru Otaci,,
4,,UEFA Champions League,1Q,Široki Brijeg,,
5,2007.0,UEFA Intertoto Cup,1R,Ararat Yerevan,,
6,2007.0,UEFA Intertoto Cup,2R,Chornomorets Odesa,,
7,2008.0,UEFA Intertoto Cup,1R,Cracovia Krakow,,
8,2008.0,UEFA Intertoto Cup,2R,Sturm Graz,,
9,,UEFA Europa League,2Q,Ventspils,,


Unnamed: 0,Season,Competition,Round,Club,Home,Away,Aggregate
0,2003,UEFA Intertoto Cup,R1,WIT Georgia,1-0,1-2,2-2
1,,,R2,FK Pobeda,2-1,1-1,3-2
2,,,R3,Tobol Kostanay,3-0,1-0,4-0
3,,,Semifinals,Werder Bremen,4-0,1-1,5-1
4,,,Finals,FC Schalke 04,0-2,0-0,0-2
5,2004/05,UEFA Cup,Q2,Zenit St. Petersburg,3-1,0-2,3-3
6,2005/06,UEFA Cup,Q2,Zenit St. Petersburg,2-2,1-1,3-3
7,2006/07,UEFA Cup,1,Livorno,0-1,0-2,0-3


203_242.csv and 204_985.csv gives : 


ValueError: You are trying to merge on float64 and object columns. If you wish to proceed you should use pd.concat