In [13]:
import json
import math
import os

import numpy as np
import pandas as pd
from IPython.display import display
from nltk.corpus import wordnet

In [14]:
#preloading all schemas and categories
all_schemas={}
all_categories={}
with open("final_schema.txt") as ip_file:
    for line in ip_file.readlines():
        json_obj=json.loads(line)
        all_schemas[json_obj["filename"]]=json_obj["schema"]
        all_categories[json_obj["filename"]]=json_obj["categories"]
        
#preloading the candidate keys
with open("Candidate_key_dict.txt",'r') as ip_file:
    cand_key=json.load(ip_file)

#preloading column and category similarity values of tables
with open("cos_similarity.txt",'r') as ip_file:
    json_object=json.load(ip_file)
col_sim = json_object["column_similarity"]
cat_sim = json_object["category_similarity"]

In [15]:
# all functions needed to generate ontologies
def get_synonyms(word):
    meanings=set()
    for synset in wordnet.synsets(word,pos=wordnet.NOUN):
        for lemma in synset.lemmas():
            meanings.add(lemma.name())
    for synset in wordnet.synsets(word,pos=wordnet.NOUN):
        for hypernym in synset.hypernyms():
            meanings.add(hypernym.lemma_names()[0])
    meanings.add(word)
    return list(meanings)

def generate_list_ontology(list1):
    ontology={}
    for word in list1:
        ontology[word]=get_synonyms(word)
    return ontology

def generate_schema_ontology(input_schema):
    ontology={}
    for col in input_schema:
        ontology[col]=get_synonyms(col)
    return ontology

In [16]:
# to generate cos similarity between two lists
def cos_sim(list1, list2):
    terms = set(list1).union(list2)
    intersect = set(list1) & set(list2)
    others = (set(list1)-intersect).union(set(list2)-intersect)
    product=0
    for word in terms:
        if word in intersect:
            product+=1
    l1mag = math.sqrt(len(list1))
    l2mag = math.sqrt(len(list2))
    if len(list1)==0 or len(list2)==0:
        return 0.0
    else:
        return product / (l1mag * l2mag)

In [17]:
# if input has only schema (columns and their datatypes)
def col_only_list(input_schema,input_sch_onto):
    possible_tables={}
    for file in all_schemas:
        schema=all_schemas[file]
        for col,d_type in schema.items():
            if (col in input_schema) and (input_schema[col]==d_type):
                if file in possible_tables:
                    possible_tables[file].append(col)
                else:
                    possible_tables[file]=[]
                    possible_tables[file].append(col)
            else:
                for a in input_sch_onto:
                    if (col in input_sch_onto[a]) and (input_schema[a]==d_type):
                        if file in possible_tables:
                            possible_tables[file].append(a)
                        else:
                            possible_tables[file]=[]
                            possible_tables[file].append(a)
    return possible_tables

In [18]:
# if input has categories as well as schema (columns and their datatypes)
# we consider it a match only if the category cosine similarity exceeds 0.75
def cat_and_col_list(input_categories,input_cat_onto,input_schema,input_sch_onto):
    possible_tables={}
    for file in all_categories:
        cat_list=[]
        category=all_categories[file]
        for cat in category:
            if cat in input_categories:
                cat_list.append(cat)
            else:
                for cat1 in input_cat_onto:
                    if cat in input_cat_onto[cat1]:
                        cat_list.append(cat1)
        cos_val=cos_sim(cat_list,input_categories)
        if cos_val > 0.75 :
            schema=all_schemas[file]
            for col,d_type in schema.items():
                if (col in input_schema) and (input_schema[col]==d_type):
                    if file in possible_tables:
                        possible_tables[file].append(col)
                    else:
                        possible_tables[file]=[]
                        possible_tables[file].append(col)
                else:
                    for a in input_sch_onto:
                        if (col in input_sch_onto[a]) and (input_schema[a]==d_type):
                            if file in possible_tables:
                                possible_tables[file].append(a)
                            else:
                                possible_tables[file]=[]
                                possible_tables[file].append(a)
    return possible_tables

In [19]:
#For the given input file names, find the columns that match, if any.
#If no columns match, then don't do anything.
#If columns match, then check if the columns are present in the list of candidate keys.
#If not present in the list of candidate keys, just check the data.
#If present in the list of candidate keys and the data of those columns match, append the other columns to either table to extend the table schema.
#If present in the list of candidate keys but the data don't match, then increase volume by adding the rows to one of the tables, but this will create a lot of NULL values. Undesirable.
#If all the columns match, then just add another column with the table name to each table and append the two tables.

def merge(fname1,fname2):
    matching_columns = {}  #resultant list containing the matching columns.
    #getting schema for tables.
    f1 = all_schemas[fname1]
    f2 = all_schemas[fname2]
    #Retrieving the column names and generating the ontologies for one of the tables columns
    f1_cols = list(f1)
    f1_onto = generate_list_ontology(f1_cols)
    #finding columns that match.     
    for col , d_type in f2.items():
        if(col in f1_cols):
            matching_columns[col] = col
        else:
            for col_t1 in f1_onto:
                if(col in f1_onto[col_t1]):
                    matching_columns[col_t1] = col
                    break
    #Now the varaible matching_columns contains a list of names of columns that match between the two tables.
    t1 = cand_key[fname1]
    t2 = cand_key[fname2]
    mat_cols={}
    for key , value in matching_columns.items():
        if(key in t1 or value in t2):
            mat_cols[key]=value
    return merge_tables(fname1,fname2,mat_cols)

In [20]:
def merge_tables(fname1,fname2,cols):
    t1 = pd.read_csv(fname1)
    t2 = pd.read_csv(fname2)
    print(fname1+' and '+fname2+' gives : ')
    l=len(cols)
    if l!=0:
        for name1,name2 in cols.items():
            t2.rename(columns = {name2:name1},inplace=True)
    t3=t1.merge(t2,how='outer')
    display(t3)
    return t3

In [21]:
def check_possible_matches():
    with open("input.txt",'r') as ip_file:
        json_object=json.load(ip_file)
    input_schema=json_object["schema"]
    input_sch_onto=generate_schema_ontology(input_schema)
    if "categories" in json_object:
        print('category and schema')
        input_categories=json_object["categories"]
        input_cat_onto=generate_list_ontology(input_categories)
        possible_tables=cat_and_col_list(input_categories,input_cat_onto,input_schema,input_sch_onto)
    else:
        print('only schema')
        possible_tables=col_only_list(input_schema,input_sch_onto)
    matching_tables=[]
    for i in possible_tables:
        cos_val=cos_sim(possible_tables[i],list(input_schema))
        if cos_val>0.75:
            matching_tables.append(i)
    return matching_tables

In [22]:
def get_matches():
    matching_tables=check_possible_matches()
    op_str1='output_folder/'
    op_str2='.csv'
    f=open("output_folder/results.txt",'w')
    x=len(matching_tables)
    print(matching_tables)
    if x==0:
        print("NO MATCHES FOUND",file=f)
    elif x==1:
        print(matching_tables[0],file=f)
    else:
        ctr=0
        for i in range(x-1):
            for j in range(i+1,x):
                a=matching_tables[i]+' : '+matching_tables[j]
                b=matching_tables[j]+' : '+matching_tables[i]
                if (a in cat_sim):
                    if cat_sim[a]>=.50 and col_sim[a]>=0.55:
                        res=merge(matching_tables[i],matching_tables[j])
                        ctr+=1
                        op_string=op_str1+str(ctr)+op_str2
                        res.to_csv(op_string,sep=',', index=False)
                        print(matching_tables[i]+' and '+matching_tables[j],file=f)
                elif (b in cat_sim):
                    if cat_sim[b]>=.75 and col_sim[b]>=0.50:
                        res=merge(matching_tables[i],matching_tables[j])
                        ctr+=1
                        op_string=op_str1+str(ctr)+op_str2
                        res.to_csv(op_string,sep=',', index=False)
                        print(matching_tables[i]+' and '+matching_tables[j],file=f)
                else:
                    pass
        for i in matching_tables:
            print(i,file=f)
    f.close()

In [23]:
# Run the whole matching pipeline; results are written to output_folder/results.txt.
get_matches()

category and schema
['202_117.csv', '203_268.csv']
202_117.csv and 203_268.csv gives : 


Unnamed: 0,Date,Ship,Nationality,Tonnage,Fate,Name,Tonnage (GRT)
0,19 August 1941,SS Aguila,United Kingdom,3255.0,Sunk,,
1,27 November 1941,HMAS Parramatta,Royal Australian Navy,1060.0,Sunk,,
2,23 December 1941,SS Shuntien,United Kingdom,3059.0,Sunk,,
3,26 December 1941,SS Warszawa,Poland,2487.0,Sunk,,
4,10 June 1942,MV Athene,Norway,4681.0,Sunk,,
5,10 June 1942,SS Brambleleaf,United Kingdom,5917.0,Damaged,,
6,7 October 1941,,United Kingdom,,Damaged,Svend Foyn,14795.0
7,16 February 1942,,Venezuela,,Sunk,Monagas,2650.0
8,16 February 1942,,United Kingdom,,Sunk,San Nicholas,2391.0
9,16 February 1942,,United Kingdom,,Sunk,Tia Juana,2395.0
