In [136]:
import json
import math
import pandas as pd
import numpy as np
from nltk.corpus import wordnet
from itertools import product

In [137]:
all_schemas={}
with open("final_schema.txt") as f:
    for line in f.readlines():
        json_obj=json.loads(line)
        all_schemas[json_obj["filename"]]={"tablename":json_obj["tablename"],"schema":json_obj["schema"],"categories":json_obj["categories"]}

In [139]:
def syn_hyp_generation(word):
    word=word.split()
    i,j=word[0],word[1]
    l1=get_synonyms(i)
    l2=get_synonyms(j)
    l3=list(product(l1,l2))
    print(l1,"\n",l2,"\n",l3)
    print(len(l1),len(l2),len(l3))
        
    
def get_synonyms(word):
    meanings=set()
    for synset in wordnet.synsets(word,pos=wordnet.NOUN):
        for lemma in synset.lemmas():
            meanings.add(lemma.name())
    for synset in wordnet.synsets(word,pos=wordnet.NOUN):
        for hypernym in synset.hypernyms():
            meanings.add(hypernym.lemma_names()[0])
    meanings.add(word)
    return list(meanings)
    
    

In [140]:
syn_hyp_generation("employee id")

['employee', 'worker'] 
 ['badge', 'Gem_State', 'Idaho', 'instinct', 'ID', 'positive_identification', 'I.D.', 'id'] 
 [('employee', 'badge'), ('employee', 'Gem_State'), ('employee', 'Idaho'), ('employee', 'instinct'), ('employee', 'ID'), ('employee', 'positive_identification'), ('employee', 'I.D.'), ('employee', 'id'), ('worker', 'badge'), ('worker', 'Gem_State'), ('worker', 'Idaho'), ('worker', 'instinct'), ('worker', 'ID'), ('worker', 'positive_identification'), ('worker', 'I.D.'), ('worker', 'id')]
2 8 16


In [141]:
from nltk.corpus import wordnet as wn
from itertools import product

wordx, wordy = "pole","position"
sem1, sem2 = wn.synsets(wordx,pos=wordnet.NOUN), wn.synsets(wordy,pos=wordnet.NOUN)

maxscore = 0
score=0
for i,j in list(product(*[sem1,sem2])):
    score = i.wup_similarity(j) # Wu-Palmer Similarity
    score = 0.0 if type(score) == None else score
    
    maxscore = score if maxscore < score else maxscore
    print(i,j,score)

Synset('pole.n.01') Synset('position.n.01') 0.4
Synset('pole.n.01') Synset('military_position.n.01') 0.4
Synset('pole.n.01') Synset('position.n.03') 0.125
Synset('pole.n.01') Synset('position.n.04') 0.13333333333333333
Synset('pole.n.01') Synset('status.n.01') 0.14285714285714285
Synset('pole.n.01') Synset('position.n.06') 0.11764705882352941
Synset('pole.n.01') Synset('position.n.07') 0.15384615384615385
Synset('pole.n.01') Synset('position.n.08') 0.4
Synset('pole.n.01') Synset('position.n.09') 0.11764705882352941
Synset('pole.n.01') Synset('placement.n.03') 0.125
Synset('pole.n.01') Synset('situation.n.02') 0.13333333333333333
Synset('pole.n.01') Synset('position.n.12') 0.13333333333333333
Synset('pole.n.01') Synset('side.n.10') 0.13333333333333333
Synset('pole.n.01') Synset('place.n.15') 0.13333333333333333
Synset('pole.n.01') Synset('stead.n.01') 0.10526315789473684
Synset('pole.n.01') Synset('position.n.16') 0.125
Synset('pole.n.02') Synset('position.n.01') 0.4
Synset('pole.n.02')

In [142]:
maxscore,sem1,sem2

(0.7692307692307693,
 [Synset('pole.n.01'),
  Synset('pole.n.02'),
  Synset('pole.n.03'),
  Synset('perch.n.02'),
  Synset('perch.n.03'),
  Synset('pole.n.06'),
  Synset('pole.n.07'),
  Synset('terminal.n.02'),
  Synset('pole.n.09'),
  Synset('pole.n.10')],
 [Synset('position.n.01'),
  Synset('military_position.n.01'),
  Synset('position.n.03'),
  Synset('position.n.04'),
  Synset('status.n.01'),
  Synset('position.n.06'),
  Synset('position.n.07'),
  Synset('position.n.08'),
  Synset('position.n.09'),
  Synset('placement.n.03'),
  Synset('situation.n.02'),
  Synset('position.n.12'),
  Synset('side.n.10'),
  Synset('place.n.15'),
  Synset('stead.n.01'),
  Synset('position.n.16')])

In [143]:
d={}
for i in all_schemas:
    for j in all_schemas[i]["schema"]:
        x=j.lower()
        if x in d:
            d[x]+=1
        else:
            d[x]=1

In [144]:
d1=sorted(d.items(), key=lambda x: x[1], reverse=True)

In [145]:
d1

[('year', 395),
 ('date', 367),
 ('notes', 274),
 ('name', 261),
 ('rank', 199),
 ('opponent', 161),
 ('position', 146),
 ('location', 142),
 ('venue', 134),
 ('title', 125),
 ('num', 120),
 ('team', 118),
 ('result', 112),
 ('total', 110),
 ('attendance', 108),
 ('competition', 108),
 ('time', 95),
 ('nation', 86),
 ('country', 85),
 ('points', 85),
 ('player', 82),
 ('gold', 78),
 ('silver', 78),
 ('bronze', 78),
 ('score', 77),
 ('season', 70),
 ('nationality', 67),
 ('no ', 64),
 ('event', 61),
 ('type', 58),
 ('place', 56),
 ('winner', 55),
 ('round', 54),
 ('pos', 54),
 ('party', 53),
 ('driver', 52),
 ('no', 51),
 ('role', 49),
 ('laps', 47),
 ('city', 45),
 ('unnamed: 0', 45),
 ('week', 40),
 ('album', 39),
 ('division', 39),
 ('time/retired', 38),
 ('grid', 38),
 ('tournament', 38),
 ('site', 37),
 ('club', 35),
 ('record', 35),
 ('number', 33),
 ('film', 31),
 ('athlete', 31),
 ('constructor', 28),
 ('surface', 28),
 ('pos ', 27),
 ('age', 25),
 ('county', 25),
 ('length', 25

In [147]:
def generate_column_knowledge_graph():
    column_k_graph={}
    for fname in all_schemas:
        for col,dtype in all_schemas[fname]["schema"].items():
            if col not in column_k_graph:
                column_k_graph[col]={"direct":[fname],"similar":{},"d_type":dtype}
            else:
                column_k_graph[col]["direct"].append(fname)
    return column_k_graph

col_k_graph=generate_column_knowledge_graph()

In [150]:
len(col_k_graph["date of appointment"]["direct"])

4

In [178]:
no_of_tables_with_col={}
for i,j in d1:
    if j in no_of_tables_with_col:
        no_of_tables_with_col[j]+=1
    else:
        no_of_tables_with_col[j]=1

In [181]:
y=sorted(no_of_tables_with_col.items())
print(y)

[(1, 2961), (2, 392), (3, 192), (4, 98), (5, 62), (6, 40), (7, 31), (8, 22), (9, 20), (10, 15), (11, 7), (12, 11), (13, 10), (14, 18), (15, 13), (16, 5), (17, 4), (18, 7), (19, 3), (20, 3), (21, 5), (22, 3), (23, 3), (24, 3), (25, 3), (27, 1), (28, 2), (31, 2), (33, 1), (35, 2), (37, 1), (38, 3), (39, 2), (40, 1), (45, 2), (47, 1), (49, 1), (51, 1), (52, 1), (53, 1), (54, 2), (55, 1), (56, 1), (58, 1), (61, 1), (64, 1), (67, 1), (70, 1), (77, 1), (78, 3), (82, 1), (85, 2), (86, 1), (95, 1), (108, 2), (110, 1), (112, 1), (118, 1), (120, 1), (125, 1), (134, 1), (142, 1), (146, 1), (161, 1), (199, 1), (261, 1), (274, 1), (367, 1), (395, 1)]


In [185]:
print("With Direct Column Name Match :\n")
for i,j in y:
    print(j," columns are present ",i," times.")

With Direct Column Name Match :

2961  columns are present  1  times.
392  columns are present  2  times.
192  columns are present  3  times.
98  columns are present  4  times.
62  columns are present  5  times.
40  columns are present  6  times.
31  columns are present  7  times.
22  columns are present  8  times.
20  columns are present  9  times.
15  columns are present  10  times.
7  columns are present  11  times.
11  columns are present  12  times.
10  columns are present  13  times.
18  columns are present  14  times.
13  columns are present  15  times.
5  columns are present  16  times.
4  columns are present  17  times.
7  columns are present  18  times.
3  columns are present  19  times.
3  columns are present  20  times.
5  columns are present  21  times.
3  columns are present  22  times.
3  columns are present  23  times.
3  columns are present  24  times.
3  columns are present  25  times.
1  columns are present  27  times.
2  columns are present  28  times.
2  columns ar

In [189]:
for i,j in col_k_graph.items():
    print(i)
    print(j)

year
{'direct': ['200_0.csv', '200_1.csv', '200_10.csv', '200_12.csv', '200_22.csv', '200_3.csv', '200_30.csv', '200_32.csv', '200_36.csv', '200_42.csv', '200_46.csv', '200_48.csv', '201_0.csv', '201_15.csv', '201_21.csv', '201_22.csv', '201_23.csv', '201_28.csv', '201_3.csv', '201_30.csv', '201_33.csv', '201_34.csv', '201_36.csv', '201_37.csv', '201_4.csv', '201_40.csv', '201_42.csv', '201_44.csv', '201_45.csv', '201_48.csv', '201_49.csv', '201_6.csv', '201_7.csv', '201_9.csv', '202_11.csv', '202_110.csv', '202_114.csv', '202_13.csv', '202_135.csv', '202_137.csv', '202_15.csv', '202_167.csv', '202_178.csv', '202_184.csv', '202_195.csv', '202_197.csv', '202_198.csv', '202_200.csv', '202_205.csv', '202_211.csv', '202_218.csv', '202_219.csv', '202_228.csv', '202_241.csv', '202_242.csv', '202_248.csv', '202_257.csv', '202_278.csv', '202_279.csv', '202_28.csv', '202_283.csv', '202_29.csv', '202_290.csv', '202_294.csv', '202_31.csv', '202_32.csv', '202_38.csv', '202_43.csv', '202_57.csv', '

title, position
{'direct': ['202_289.csv'], 'similar': {}, 'd_type': 'object'}
grand cross with collar
{'direct': ['202_289.csv'], 'similar': {}, 'd_type': 'object'}
year of appointment
{'direct': ['202_289.csv'], 'similar': {}, 'd_type': 'object'}
entrant
{'direct': ['202_294.csv', '203_1.csv', '203_406.csv', '203_670.csv', '203_854.csv', '204_378.csv', '204_511.csv', '204_826.csv', '204_955.csv', '204_976.csv'], 'similar': {}, 'd_type': 'object'}
chassis
{'direct': ['202_294.csv', '203_1.csv', '203_218.csv', '203_366.csv', '203_406.csv', '203_493.csv', '203_99.csv', '204_205.csv', '204_378.csv', '204_511.csv', '204_531.csv', '204_808.csv', '204_826.csv', '204_864.csv', '204_939.csv'], 'similar': {}, 'd_type': 'object'}
engine
{'direct': ['202_294.csv', '203_1.csv', '203_218.csv', '203_264.csv', '203_366.csv', '203_406.csv', '203_493.csv', '203_525.csv', '203_99.csv', '204_205.csv', '204_366.csv', '204_378.csv', '204_511.csv', '204_531.csv', '204_808.csv', '204_826.csv', '204_864.csv'

{'direct': ['203_207.csv', '204_124.csv'], 'similar': {}, 'd_type': 'object'}
coordinates & grid ref
{'direct': ['203_207.csv'], 'similar': {}, 'd_type': 'object'}
year in which notified
{'direct': ['203_207.csv'], 'similar': {}, 'd_type': 'int64'}
other designations
{'direct': ['203_207.csv'], 'similar': {}, 'd_type': 'object'}
map & citation
{'direct': ['203_207.csv'], 'similar': {}, 'd_type': 'object'}
position in 1993 94
{'direct': ['203_208.csv'], 'similar': {}, 'd_type': 'object'}
margin
{'direct': ['203_210.csv', '203_255.csv', '203_584.csv', '204_683.csv'], 'similar': {}, 'd_type': 'object'}
teams
{'direct': ['203_210.csv', '203_246.csv', '203_424.csv', '204_962.csv'], 'similar': {}, 'd_type': 'object'}
land area (km2)
{'direct': ['203_212.csv', '203_510.csv', '204_143.csv', '204_43.csv'], 'similar': {}, 'd_type': 'float64'}
density (persons/km2)
{'direct': ['203_212.csv'], 'similar': {}, 'd_type': 'int64'}
works number
{'direct': ['203_213.csv', '203_223.csv', '203_863.csv', '

{'direct': ['203_415.csv'], 'similar': {}, 'd_type': 'object'}
manufacturer's championship
{'direct': ['203_415.csv'], 'similar': {}, 'd_type': 'object'}
grades
{'direct': ['203_416.csv'], 'similar': {}, 'd_type': 'object'}
establishment
{'direct': ['203_416.csv'], 'similar': {}, 'd_type': 'int64'}
charter
{'direct': ['203_416.csv'], 'similar': {}, 'd_type': 'object'}
rnd 
{'direct': ['203_417.csv', '204_389.csv'], 'similar': {}, 'd_type': 'object'}
conf 
{'direct': ['203_417.csv', '204_389.csv'], 'similar': {}, 'd_type': 'object'}
starting price
{'direct': ['203_42.csv', '204_561.csv', '204_856.csv'], 'similar': {}, 'd_type': 'object'}
stadium
{'direct': ['203_420.csv', '203_434.csv', '203_472.csv', '204_195.csv', '204_392.csv', '204_440.csv', '204_567.csv', '204_67.csv', '204_750.csv', '204_770.csv', '204_992.csv'], 'similar': {}, 'd_type': 'object'}
until
{'direct': ['203_421.csv', '204_832.csv'], 'similar': {}, 'd_type': 'object'}
relationship with predecessor
{'direct': ['203_421.

licensee
{'direct': ['203_657.csv', '203_749.csv'], 'similar': {}, 'd_type': 'object'}
locomotive
{'direct': ['203_658.csv', '204_816.csv'], 'similar': {}, 'd_type': 'int64'}
injuries (us $000)
{'direct': ['203_66.csv'], 'similar': {}, 'd_type': 'object'}
deaths (age <15)
{'direct': ['203_66.csv'], 'similar': {}, 'd_type': 'float64'}
cpsc toy safety funding (us$ millions)
{'direct': ['203_66.csv'], 'similar': {}, 'd_type': 'object'}
toy sales (us $ billions)
{'direct': ['203_66.csv'], 'similar': {}, 'd_type': 'float64'}
obamapercentage
{'direct': ['203_660.csv'], 'similar': {}, 'd_type': 'object'}
obamanum
{'direct': ['203_660.csv'], 'similar': {}, 'd_type': 'int64'}
mccainpercentage
{'direct': ['203_660.csv'], 'similar': {}, 'd_type': 'object'}
mccainnum
{'direct': ['203_660.csv'], 'similar': {}, 'd_type': 'int64'}
peak chart positions us mod
{'direct': ['203_661.csv', '203_751.csv'], 'similar': {}, 'd_type': 'float64'}
poll source
{'direct': ['203_662.csv', '204_0.csv', '204_481.csv'

reintroduced
{'direct': ['204_111.csv'], 'similar': {}, 'd_type': 'object'}
beanie baby resembled
{'direct': ['204_111.csv'], 'similar': {}, 'd_type': 'object'}
discipline
{'direct': ['204_112.csv', '204_345.csv', '204_607.csv', '204_921.csv'], 'similar': {}, 'd_type': 'object'}
state/ province
{'direct': ['204_113.csv'], 'similar': {}, 'd_type': 'object'}
csa/cma 2009 population
{'direct': ['204_113.csv'], 'similar': {}, 'd_type': 'int64'}
projected 2025 population
{'direct': ['204_113.csv'], 'similar': {}, 'd_type': 'float64'}
projected increase 2009 2025
{'direct': ['204_113.csv'], 'similar': {}, 'd_type': 'object'}
track
{'direct': ['204_115.csv', '204_508.csv', '204_630.csv'], 'similar': {}, 'd_type': 'object'}
round result
{'direct': ['204_115.csv'], 'similar': {}, 'd_type': 'object'}
date published
{'direct': ['204_116.csv'], 'similar': {}, 'd_type': 'object'}
n anastasiades
{'direct': ['204_116.csv'], 'similar': {}, 'd_type': 'object'}
g lillikas
{'direct': ['204_116.csv'], 'si

{'direct': ['204_359.csv'], 'similar': {}, 'd_type': 'int64'}
airdate
{'direct': ['204_362.csv', '204_456.csv'], 'similar': {}, 'd_type': 'object'}
game 1
{'direct': ['204_362.csv'], 'similar': {}, 'd_type': 'object'}
game 2
{'direct': ['204_362.csv'], 'similar': {}, 'd_type': 'object'}
game 3
{'direct': ['204_362.csv'], 'similar': {}, 'd_type': 'object'}
chart positions usa billboard hot dance club play
{'direct': ['204_365.csv'], 'similar': {}, 'd_type': 'float64'}
chart positions polish charts
{'direct': ['204_365.csv'], 'similar': {}, 'd_type': 'float64'}
officer
{'direct': ['204_368.csv'], 'similar': {}, 'd_type': 'object'}
current officers
{'direct': ['204_368.csv'], 'similar': {}, 'd_type': 'object'}
superseded by
{'direct': ['204_368.csv'], 'similar': {}, 'd_type': 'object'}
royal household
{'direct': ['204_368.csv'], 'similar': {}, 'd_type': 'object'}
golfer
{'direct': ['204_369.csv'], 'similar': {}, 'd_type': 'object'}
match play
{'direct': ['204_369.csv'], 'similar': {}, 'd_

{'direct': ['204_575.csv', '204_877.csv', '204_925.csv', '204_978.csv'], 'similar': {}, 'd_type': 'int64'}
football league trophy
{'direct': ['204_575.csv'], 'similar': {}, 'd_type': 'int64'}
best
{'direct': ['204_576.csv'], 'similar': {}, 'd_type': 'object'}
world rank
{'direct': ['204_576.csv'], 'similar': {}, 'd_type': 'object'}
first operational
{'direct': ['204_577.csv'], 'similar': {}, 'd_type': 'object'}
numeral system
{'direct': ['204_577.csv'], 'similar': {}, 'd_type': 'object'}
computing mechanism
{'direct': ['204_577.csv'], 'similar': {}, 'd_type': 'object'}
programming
{'direct': ['204_577.csv', '204_793.csv'], 'similar': {}, 'd_type': 'object'}
turing complete
{'direct': ['204_577.csv'], 'similar': {}, 'd_type': 'object'}
population (as of 2011)
{'direct': ['204_579.csv'], 'similar': {}, 'd_type': 'int64'}
position 2
{'direct': ['204_58.csv'], 'similar': {}, 'd_type': 'object'}
home town, county
{'direct': ['204_580.csv'], 'similar': {}, 'd_type': 'object'}
term of service

trucks
{'direct': ['204_89.csv'], 'similar': {}, 'd_type': 'object'}
chart year
{'direct': ['204_895.csv'], 'similar': {}, 'd_type': 'int64'}
billboard hot 100
{'direct': ['204_895.csv'], 'similar': {}, 'd_type': 'object'}
billboard hot r&b/hip hop
{'direct': ['204_895.csv'], 'similar': {}, 'd_type': 'float64'}
conf (overall)
{'direct': ['204_898.csv'], 'similar': {}, 'd_type': 'object'}
tiebreaker
{'direct': ['204_898.csv'], 'similar': {}, 'd_type': 'object'}
tornadoes
{'direct': ['204_899.csv'], 'similar': {}, 'd_type': 'object'}
works numbers
{'direct': ['204_901.csv'], 'similar': {}, 'd_type': 'object'}
cn numbers
{'direct': ['204_901.csv'], 'similar': {}, 'd_type': 'object'}
gt numbers
{'direct': ['204_901.csv'], 'similar': {}, 'd_type': 'object'}
broadcaster
{'direct': ['204_903.csv'], 'similar': {}, 'd_type': 'object'}
lift name
{'direct': ['204_907.csv'], 'similar': {}, 'd_type': 'object'}
top elevation (ft)
{'direct': ['204_907.csv'], 'similar': {}, 'd_type': 'int64'}
bottom e

In [190]:
pd.read_csv("204_63.csv")

Unnamed: 0,round,circuit,date,pole position,winning driver,winning team,trophy winner
0,1,"ACI Vallelunga Circuit, Campagnano di Roma",7 April,Miloš Pavlović,Miloš Pavlović,Target Racing,Nino Famà
1,2,Misano World Circuit,28 April,Miloš Pavlović,Miloš Pavlović,Target Racing,Giovanni Faraonio
2,3,"Autodromo di Pergusa, Enna",19 May,Miloš Pavlović,Philip Cloostermans,Azeta Racing,Carmine Tancredi
3,4,Autodromo Nazionale Monza,30 June,Philip Cloostermans,Philip Cloostermans,Azeta Racing,Carmine Tancredi
4,5,"Autodromo Riccardo Paletti, Varano",21 July,Miloš Pavlović,Miloš Pavlović,Target Racing,Carmine Tancredi
5,6,"Autodromo Enzo e Dino Ferrari, Imola",1 September,Vitantonio Liuzzi,Vitantonio Liuzzi,Bertram Schäfer Racing,Silvio Alberti
6,7,"Autodromo del Levante, Binetto",8 September,Christiano Citron,Miloš Pavlović,Target Racing,Carmine Tancredi
7,8,"Mugello Circuit, Scarperia",6 October,Miloš Pavlović,Miloš Pavlović,Target Racing,Dino Lusuardi
8,9,"Autodromo dell'Umbria, Magione",20 October,Andreas Zuber,Christiano Citron,Target Racing,Alberto Morelli


In [213]:
x=pd.read_csv("203_148.csv")

In [224]:
import json
import urllib
api_key = "AIzaSyCJdqNKHF6A9V6CxKCROT1tbp2DoHZgMKk"
service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
params = {
    'limit': 100,
    'indent': True,
    'key': api_key,
}
s={}
for row in x["name"]:
    query = row
    params['query']= query
    url = service_url + '?' + urllib.parse.urlencode(params)
    response = json.loads(urllib.request.urlopen(url).read())
    
    for element in response['itemListElement']:
        for i in element["result"]["@type"]:
            a=i.lower()
            if a in s:
                s[a]+=1
            else:
                s[a]=1
        if "description" in element["result"]:
            a=element["result"]["description"].lower()
            if a in s:
                s[a]+=1
            else:
                s[a]=1

In [225]:
s

{'thing': 542,
 'organization': 24,
 'sportsteam': 18,
 'cycling team': 2,
 'soccer club': 6,
 'book': 7,
 'book by antoon sanders': 1,
 'basketball club': 1,
 'corporation': 8,
 'record label': 1,
 'soccer team': 2,
 'ship': 13,
 'place': 60,
 'professional sports team': 1,
 'person': 226,
 "dulce of aragon's son": 1,
 'philosopher': 1,
 'argentine goalkeeper': 1,
 'soccer forward': 1,
 'event': 35,
 'soccer player': 8,
 'road cycling team': 5,
 'argentine soccer player': 13,
 'plants': 1,
 'city': 38,
 'town in argentina': 1,
 'argentinean soccer player': 1,
 'sportsorganization': 2,
 'league': 2,
 'soccer goalkeeper': 1,
 'soccer defender': 1,
 'sports league season': 4,
 'belgian bicycler': 1,
 'tournament': 1,
 'football league season': 1,
 'paraguayan soccer player': 1,
 'aerospace company': 1,
 'country': 2,
 'administrativearea': 4,
 'country in central africa': 1,
 'company': 2,
 'bank': 1,
 'musiccomposition': 1,
 'opera by claudio monteverdi': 1,
 'city in chile': 1,
 'disas

In [227]:
x=set(s)

In [228]:
x

{'1916 film',
 '1917 film',
 '2005 film',
 '2006 film',
 '2012 film',
 '2015 film',
 '2017 film',
 'actress',
 'administrativearea',
 'administrator of the small business administration',
 'aerospace company',
 'aircraft carrier',
 'albanian fighter',
 'album by don cornelio y la zona',
 'album by minami kuribayashi',
 'album by neztic &amp; vate',
 'album by quilapayún',
 'album by tervahäät',
 'american activist',
 'american actress',
 'american art historian',
 'american astrologer',
 'american author',
 'american biographer',
 'american businesswoman',
 'american character actress',
 'american dancer',
 'american film actress',
 'american film director',
 'american journalist',
 'american mathematician',
 'american model',
 'american music composer',
 'american musician',
 'american navy man',
 'american painter',
 'american philanthropist',
 'american photographer',
 'american political activist',
 'american political scientist',
 'american pornographic actress',
 'american profes

In [232]:
a=np.nan

In [231]:
str(a)

'nan'