In [1]:
import json
import pandas as pd
import numpy as np
import math
from nltk.corpus import wordnet

In [2]:
# `ip` is the schema input that later cells parse line by line; `op1` receives
# the per-pair score report. Both handles are left open here for later cells.
ip = open("final_schema.txt", mode='r')
op1 = open("column_and_category_match_scores.txt", mode='w')

In [3]:
# Parse the schema file: one JSON document per line, kept in file order.
all_schemas = []
# `with ip` closes the handle opened in the earlier cell as soon as parsing is
# done (it was previously never closed). Re-running this cell therefore
# requires re-running the cell that opens `ip` first.
with ip:
    for line in ip:
        if not line.strip():
            # Tolerate blank lines, which json.loads would reject.
            continue
        all_schemas.append(json.loads(line))

In [4]:
def get_synonyms(word):
    """Collect noun-sense related terms for ``word`` from WordNet.

    The result contains every lemma name of each noun synset of ``word``,
    the first lemma name of each direct hypernym of those synsets, and
    ``word`` itself.

    Returns:
        A list without duplicates; its order is whatever set iteration yields.
    """
    noun_senses = list(wordnet.synsets(word, pos=wordnet.NOUN))
    related = set()
    # Lemma names go in first, hypernym names second.
    for sense in noun_senses:
        related.update(lemma.name() for lemma in sense.lemmas())
    for sense in noun_senses:
        related.update(parent.lemma_names()[0] for parent in sense.hypernyms())
    related.add(word)
    return list(related)

def generate_list_ontology(list1):
    """Map each word in ``list1`` to the related terms from ``get_synonyms``.

    Args:
        list1: iterable of words.

    Returns:
        dict keyed by word; each value is that word's ``get_synonyms`` result.
    """
    return {term: get_synonyms(term) for term in list1}

def generate_schema_ontology(input_schema):
    """Map each column name of ``input_schema`` to its ``get_synonyms`` result.

    Args:
        input_schema: iterable of column names; iterating a dict yields its keys.

    Returns:
        dict keyed by column name; each value is that name's related terms.
    """
    pairs = ((column, get_synonyms(column)) for column in input_schema)
    return dict(pairs)

In [5]:
def cos_sim(list1, list2):
    """Cosine similarity of two term collections viewed as binary vectors.

    Each collection is reduced to its set of distinct terms, so the score is
    ``|A & B| / (sqrt(|A|) * sqrt(|B|))``. Magnitudes are computed from the
    distinct terms rather than the raw lengths, so repeated terms no longer
    lower the score. For inputs without duplicates the result is unchanged.

    Args:
        list1: iterable of hashable terms.
        list2: iterable of hashable terms.

    Returns:
        A float in [0.0, 1.0]; 0.0 when either collection is empty.
    """
    terms1 = set(list1)
    terms2 = set(list2)
    if not terms1 or not terms2:
        return 0.0
    shared = len(terms1 & terms2)
    # Keep the product-of-square-roots form so scores for duplicate-free
    # inputs stay bit-for-bit identical to earlier results.
    return shared / (math.sqrt(len(terms1)) * math.sqrt(len(terms2)))

In [6]:
def _column_similarity(schema_a, schema_b, onto_sch):
    """Cosine similarity between the column names of two schemas.

    A column ``col`` of ``schema_b`` matches a column ``col_a`` of
    ``schema_a`` when their data types are equal and ``col`` is either
    ``col_a`` itself or one of its related terms in ``onto_sch``. A match
    records ``col_a`` on b's side; otherwise ``col`` itself is recorded.

    NOTE(review): the fallback is applied per (col, col_a) pair, not once per
    unmatched ``col``. A column of b is therefore also recorded under its own
    name whenever any column of a fails to match it. This is kept unchanged to
    preserve existing scores; confirm whether it is intended.
    """
    names_a = set()
    names_b = set()
    for col, dat_type in schema_b.items():
        for col_a in schema_a:
            names_a.add(col_a)
            same_type = schema_a[col_a] == dat_type
            if same_type and (col == col_a or col in onto_sch[col_a]):
                names_b.add(col_a)
            else:
                names_b.add(col)
    return cos_sim(list(names_a), list(names_b))


def _category_similarity(categories_a, categories_b, onto_cat):
    """Cosine similarity between the category lists of two tables.

    A category of b that equals a category of a, or appears among that
    category's related terms in ``onto_cat``, is recorded under a's name.
    A category of b with no match at all is recorded under its own name.
    """
    names_a = set()
    names_b = set()
    for cat_b in categories_b:
        # Reset per category of b. Previously the flag leaked in from the
        # column loop and from earlier categories, so unmatched categories
        # were wrongly dropped after the first match.
        matched = False
        for cat_a in categories_a:
            names_a.add(cat_a)
            if cat_b == cat_a or cat_b in onto_cat[cat_a]:
                names_b.add(cat_a)
                matched = True
        if not matched:
            names_b.add(cat_b)
    return cos_sim(list(names_a), list(names_b))


# Score every unordered pair of tables once (i < j). Each record is read via
# its "schema" (column -> data type), "categories" and "filename" entries.
length = len(all_schemas)
column_cos_sim = {}
category_cos_sim = {}
for i in range(length - 1):
    a = all_schemas[i]
    # The ontologies depend only on table a, so build them once per outer pass.
    onto_sch = generate_schema_ontology(a["schema"])
    onto_cat = generate_list_ontology(a["categories"])
    for j in range(i + 1, length):
        b = all_schemas[j]
        col_sim = _column_similarity(a["schema"], b["schema"], onto_sch)
        cat_sim = _category_similarity(a["categories"], b["categories"], onto_cat)

        # Keep only pairs that are similar on both columns and categories.
        if cat_sim != 0.0 and col_sim != 0.0:
            txt1 = a["filename"] + ' : ' + b["filename"]
            column_cos_sim[txt1] = col_sim
            category_cos_sim[txt1] = cat_sim

# Persist both score tables as a single JSON document.
json_op = {}
json_op["column_similarity"] = column_cos_sim
json_op["category_similarity"] = category_cos_sim
with open("cos_similarity.txt", 'w') as op_file:
    json.dump(json_op, op_file)

In [7]:
# One report line per retained table pair: label, column score, category score.
for pair_label, column_score in column_cos_sim.items():
    fields = (pair_label, str(column_score), str(category_cos_sim[pair_label]))
    op1.write('\t\t'.join(fields) + '\n')

In [8]:
# Flush and release the score report handle opened near the top of the notebook.
op1.close()