In [136]:
import json
import math
import pandas as pd
import numpy as np
from nltk.corpus import wordnet
from itertools import product

In [137]:
all_schemas={}
with open("final_schema.txt") as f:
    for line in f.readlines():
        json_obj=json.loads(line)
        all_schemas[json_obj["filename"]]={"tablename":json_obj["tablename"],"schema":json_obj["schema"],"categories":json_obj["categories"]}

In [139]:
def syn_hyp_generation(word):
    word=word.split()
    i,j=word[0],word[1]
    l1=get_synonyms(i)
    l2=get_synonyms(j)
    l3=list(product(l1,l2))
    print(l1,"\n",l2,"\n",l3)
    print(len(l1),len(l2),len(l3))
        
    
def get_synonyms(word):
    meanings=set()
    for synset in wordnet.synsets(word,pos=wordnet.NOUN):
        for lemma in synset.lemmas():
            meanings.add(lemma.name())
    for synset in wordnet.synsets(word,pos=wordnet.NOUN):
        for hypernym in synset.hypernyms():
            meanings.add(hypernym.lemma_names()[0])
    meanings.add(word)
    return list(meanings)
    
    

In [140]:
syn_hyp_generation("employee id")

['employee', 'worker'] 
 ['badge', 'Gem_State', 'Idaho', 'instinct', 'ID', 'positive_identification', 'I.D.', 'id'] 
 [('employee', 'badge'), ('employee', 'Gem_State'), ('employee', 'Idaho'), ('employee', 'instinct'), ('employee', 'ID'), ('employee', 'positive_identification'), ('employee', 'I.D.'), ('employee', 'id'), ('worker', 'badge'), ('worker', 'Gem_State'), ('worker', 'Idaho'), ('worker', 'instinct'), ('worker', 'ID'), ('worker', 'positive_identification'), ('worker', 'I.D.'), ('worker', 'id')]
2 8 16


In [141]:
from nltk.corpus import wordnet as wn
from itertools import product

wordx, wordy = "pole","position"
sem1, sem2 = wn.synsets(wordx,pos=wordnet.NOUN), wn.synsets(wordy,pos=wordnet.NOUN)

maxscore = 0
score=0
for i,j in list(product(*[sem1,sem2])):
    score = i.wup_similarity(j) # Wu-Palmer Similarity
    score = 0.0 if type(score) == None else score
    
    maxscore = score if maxscore < score else maxscore
    print(i,j,score)

Synset('pole.n.01') Synset('position.n.01') 0.4
Synset('pole.n.01') Synset('military_position.n.01') 0.4
Synset('pole.n.01') Synset('position.n.03') 0.125
Synset('pole.n.01') Synset('position.n.04') 0.13333333333333333
Synset('pole.n.01') Synset('status.n.01') 0.14285714285714285
Synset('pole.n.01') Synset('position.n.06') 0.11764705882352941
Synset('pole.n.01') Synset('position.n.07') 0.15384615384615385
Synset('pole.n.01') Synset('position.n.08') 0.4
Synset('pole.n.01') Synset('position.n.09') 0.11764705882352941
Synset('pole.n.01') Synset('placement.n.03') 0.125
Synset('pole.n.01') Synset('situation.n.02') 0.13333333333333333
Synset('pole.n.01') Synset('position.n.12') 0.13333333333333333
Synset('pole.n.01') Synset('side.n.10') 0.13333333333333333
Synset('pole.n.01') Synset('place.n.15') 0.13333333333333333
Synset('pole.n.01') Synset('stead.n.01') 0.10526315789473684
Synset('pole.n.01') Synset('position.n.16') 0.125
Synset('pole.n.02') Synset('position.n.01') 0.4
Synset('pole.n.02')

In [142]:
maxscore,sem1,sem2

(0.7692307692307693,
 [Synset('pole.n.01'),
  Synset('pole.n.02'),
  Synset('pole.n.03'),
  Synset('perch.n.02'),
  Synset('perch.n.03'),
  Synset('pole.n.06'),
  Synset('pole.n.07'),
  Synset('terminal.n.02'),
  Synset('pole.n.09'),
  Synset('pole.n.10')],
 [Synset('position.n.01'),
  Synset('military_position.n.01'),
  Synset('position.n.03'),
  Synset('position.n.04'),
  Synset('status.n.01'),
  Synset('position.n.06'),
  Synset('position.n.07'),
  Synset('position.n.08'),
  Synset('position.n.09'),
  Synset('placement.n.03'),
  Synset('situation.n.02'),
  Synset('position.n.12'),
  Synset('side.n.10'),
  Synset('place.n.15'),
  Synset('stead.n.01'),
  Synset('position.n.16')])

In [143]:
d={}
for i in all_schemas:
    for j in all_schemas[i]["schema"]:
        x=j.lower()
        if x in d:
            d[x]+=1
        else:
            d[x]=1

In [144]:
d1=sorted(d.items(), key=lambda x: x[1], reverse=True)

In [145]:
d1

[('year', 395),
 ('date', 367),
 ('notes', 274),
 ('name', 261),
 ('rank', 199),
 ('opponent', 161),
 ('position', 146),
 ('location', 142),
 ('venue', 134),
 ('title', 125),
 ('num', 120),
 ('team', 118),
 ('result', 112),
 ('total', 110),
 ('attendance', 108),
 ('competition', 108),
 ('time', 95),
 ('nation', 86),
 ('country', 85),
 ('points', 85),
 ('player', 82),
 ('gold', 78),
 ('silver', 78),
 ('bronze', 78),
 ('score', 77),
 ('season', 70),
 ('nationality', 67),
 ('no ', 64),
 ('event', 61),
 ('type', 58),
 ('place', 56),
 ('winner', 55),
 ('round', 54),
 ('pos', 54),
 ('party', 53),
 ('driver', 52),
 ('no', 51),
 ('role', 49),
 ('laps', 47),
 ('city', 45),
 ('unnamed: 0', 45),
 ('week', 40),
 ('album', 39),
 ('division', 39),
 ('time/retired', 38),
 ('grid', 38),
 ('tournament', 38),
 ('site', 37),
 ('club', 35),
 ('record', 35),
 ('number', 33),
 ('film', 31),
 ('athlete', 31),
 ('constructor', 28),
 ('surface', 28),
 ('pos ', 27),
 ('age', 25),
 ('county', 25),
 ('length', 25

In [147]:
def generate_column_knowledge_graph():
    column_k_graph={}
    for fname in all_schemas:
        for col,dtype in all_schemas[fname]["schema"].items():
            if col not in column_k_graph:
                column_k_graph[col]={"direct":[fname],"similar":{},"d_type":dtype}
            else:
                column_k_graph[col]["direct"].append(fname)
    return column_k_graph

col_k_graph=generate_column_knowledge_graph()

In [150]:
len(col_k_graph["date of appointment"]["direct"])

4

In [152]:
count=0
for i,j in d1:
    if j==1:
        count+=1
        print(i)
print(count)

num of accidents
year signed
num albums released under bad boy
target city/market
disappeared
found
peak chart positions ire
film 1
2012/13 total cost (million)
comparison with 2011/12 (million)
city served / location
club performance season norway
club performance club norway
club performance league norway
league apps league
league goals league
cup apps norwegian cup
cup goals norwegian cup
total apps total
total goals total
breeder
work/artist
area size (km2)
density per km2
preceded by alfred scott
member of parliament for ashton under lyne 1910 1916
succeeded by albert stanley
new municipality
old municipalities
encoding
implied probability
current officer
duration (months)
duration (years)
afrikaans
ipa
dutch
english
german
communities served
examples
cancers
gene functions
peak chart positions bel (fl)
peak chart positions bel (wa)
peak chart positions fra
certifications (sales thresholds)
county(ies)
population (2000 census)
incorporation date
crossing
chart positions spa
headli

jornada or other
distribution
x86
x86 64
ia64
ppc
ppc64
sparc32
sparc64
arm
hppa
mips
sh
s390
s390x
alpha
m68k
hot black singles
club play singles
ndeg
us modern rock
us mainstream rock
joining year
country:
gold:
silver:
bronze:
total:
transmission
years participated
1st (m)
2nd (m)
matthew
luke
john
percentage won overall
sigma points
qual 
moto
laetare medalist
laetare medalist 1
definition
organisation
ownership
population density (/mi2)
land area (mi2)
water area (mi2)
water area (km2)
catalog number
winning club
punt return attempts
punt return yards
punts returned for touchdown
punts fair caught
longest punt return
kickoff attempts
kickoff return yards
kickoffs returned for touchdown
kickoffs fair caught
longest kickoff return
joined mac
brown
nixon
wyckoff
windspeed
compression
torque
turbo
stock boost
turbine
compressor
political rights
civil liberties
fin
st
bonus
gdp (usd bln, 2012)
gdp (usd bln, 2017)
gdp (usd ppp bln, 2012)
gdp (usd ppp bln, 2017)
per capita (usd, 2012)
pe

priests
monastics
monasteries
parishes
time (m:s)
time (m:s) 1
parking
transferred from
cp
fp
placings
minutes
2 points
2 points 1
3 points
3 points 1
free throws 1
rebounds 1
rebounds 2
turnovers
blocked shots
personal fouls
fouls drawn
district office
wheel arr 
original owner
current location
date to
term as a deputy judge
manufactuer
storage
information
forstall 2003 est 
official population
mlb season
team season
currency
target date
term expires dec 31
decimal32
decimal64
decimal128
decimal(32k)
drafted by
boat builder
administrative centre
performer 1
operators
named trains
year begun
year discontinued
year inducted
name in english
name in turkish
depth
location (districts and/or provinces)
span
total caps
barrel length
field goals
amanda
bernie
javine h
julia
michelle
conservative
councillors
labour
councillors
independent
councillors
liberal
councillors
report 1
report 2
week of
quarterfinalists
top scorer (league)
ncbi accession num
ncbi name
sequence identity
sequenc

responsible ministers
crown entities
monitoring departments
category / type
empowering legislation
last win
last losing final
by race
white
black
aian*
asian
nhpi*
first held
earliest attendance figure known
1998 attendance
2000 attendance
2002 attendance
2004 attendance
2006 attendance
2008 attendance
2010 attendance
2012 attendance
household rating
viewers (millions)
celebrity guests
timeslot
18 49
weekly rank
first switch
second switch
ghost island
2961
