In [1]:
import json

## Load the file

In [3]:
# Read the merged dump. The encoding is given explicitly so the result does
# not depend on the platform's default text encoding.
with open("merged_wiki_df.json", 'r', encoding="utf-8") as fd:
    df=json.load(fd)

In [4]:
# Inspect the top-level keys of the loaded JSON object.
df.keys()

dict_keys(['error_list', 'attr_freq', 'tot_success', 'tot_failures', 'data'])

In [5]:
# Keep only the payload stored under the 'data' key. The isinstance guard
# makes this cell safe to re-run: once df has already been replaced by the
# payload it is left untouched instead of raising on df['data'].
if isinstance(df, dict):
    df = df['data']

#### Use Sergey Brin as an example for exploration

In [6]:
# List every record whose name contains "Brin", with its position in df,
# so the wanted entry can be picked out by eye.
curr_dict = None
for position, person in enumerate(df):
    person_name = person['ire_person_name']
    if "Brin" not in person_name:
        continue
    print(f"{position}:{person_name}")

6:Per Brinch Hansen
562:Duncan Brinsmead
642:Willard C. Brinton
655:Selmer Bringsjord
1641:Sjaak Brinkkemper
2801:Sergey Brin


In [None]:
# Look the record up by name rather than by a hard-coded position, so this
# cell keeps working if the dataset is regenerated or reordered.
curr_dict = next(p for p in df if p['ire_person_name'] == "Sergey Brin")

In [8]:
# Show the Wikidata attributes stored for the selected record.
curr_dict['wikidata']

{'employer (P108)': 'Alphabet Inc. (Q20800404)',
 'native language (P103)': 'Russian (Q7737)',
 'Commons category (P373)': 'Sergey Brin',
 'VIAF ID (P214)': '120111435',
 'ISNI (P213)': '0000 0000 7879 1368',
 'Library of Congress authority ID (P244)': 'no2005073928',
 'GND ID (P227)': '132711648',
 'spouse (P26)': ['Anne Wojcicki (Q2069573)', 'Nicole Shanahan (Q105100778)'],
 'IMDb ID (P345)': 'nm1962236',
 'image (P18)': 'Sergey Brin Ted 2010.jpg',
 'signature (P109)': 'Sergey Brin google signature.svg',
 'father (P22)': 'Michael Brin (Q15455589)',
 'mother (P25)': 'Eugenia Brin (Q13377792)',
 'date of birth (P569)': '+1973-08-21T00:00:00Z',
 'occupation (P106)': ['computer scientist (Q82594)',
  'inventor (Q205375)',
  'business executive (Q2961975)'],
 'country of citizenship (P27)': ['Soviet Union (Q15180)',
  'United States of America (Q30)'],
 'place of birth (P19)': 'Moscow (Q649)',
 'educated at (P69)': ['University of Maryland (Q503415)',
  'Stanford University (Q41506)',
  '

## Prepare lists of attributes sorted alphabetically and by frequency

In [10]:
# Tally of how many records carry each Wikidata property (filled in below).
property_vals = {}

In [11]:
# Number of records in df.
len(df)

3254

In [12]:
# Count, for every Wikidata property, how many records carry it.
# The tally is rebuilt from scratch here so that re-running this cell does
# not double the counts.
property_vals = {}
for curr_p in df:
    for curr_prop in curr_p['wikidata']:
        # dict.get replaces the former bare `except:`, which would also have
        # swallowed unrelated errors (e.g. KeyboardInterrupt).
        property_vals[curr_prop] = property_vals.get(curr_prop, 0) + 1

In [13]:
# Container for the two sorted views that get written to disk.
data_df = {}

In [14]:
# (property, count) pairs in ascending alphabetical order of property name.
sort_alpha = sorted(
    property_vals.items(),
    key=lambda pair: pair[0],
)

In [15]:
# (property, count) pairs, most frequent property first.
sort_freq = sorted(
    property_vals.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

In [16]:
# Store both orderings under the keys used in the output file.
data_df.update(sorted_freq=sort_freq, sort_alpha=sort_alpha)

In [17]:
# Persist both orderings so they can be inspected outside the notebook.
serialized = json.dumps(data_df, indent=4)
with open("wikidata_attributes_details.json", 'w') as out_file:
    out_file.write(serialized)

In [18]:
def populate_sample_vals(prop_name, out_path="check_wikidata_attr.json", records=None):
    """Dump every value recorded for one Wikidata property to a JSON file.

    Parameters
    ----------
    prop_name : str
        Key looked up in each record's 'wikidata' dict,
        e.g. "participant in (P1344)".
    out_path : str, optional
        File to (over)write. Defaults to the path that used to be hard-coded.
    records : iterable of dict, optional
        Records to scan; each must have a 'wikidata' mapping. When omitted,
        the notebook-level ``df`` is used, as before.

    Returns
    -------
    None
        The values are written to ``out_path`` as a JSON array, in record
        order. Values that are not JSON-serialisable are written via ``str``.
    """
    if records is None:
        records = df
    arr = [
        rec['wikidata'][prop_name]
        for rec in records
        if prop_name in rec['wikidata']
    ]
    with open(out_path, 'w') as fd:
        json.dump(arr, fd, indent=2, default=str)

In [33]:
# Property whose recorded values should be dumped for manual inspection.
now_name="participant in (P1344)"

# Writes the collected values to the function's output file.
populate_sample_vals(now_name)

### Remove Wikidata item IDs (the "(Q…)" suffix) from attribute values

In [20]:
import re
def rem_wikidata_page_id(q_str):
    """Strip Wikidata item identifiers such as "(Q6581072)" from a string.

    Only parenthesised groups made of "Q" followed by digits are removed.
    The earlier greedy pattern ``\\(Q.*\\)`` deleted everything from the first
    "(Q" to the last ")", which also removed ordinary parentheticals that
    happen to start with "Q".

    Parameters
    ----------
    q_str : str
        Value such as "female (Q6581072)".

    Returns
    -------
    str
        The input without the identifier(s), with surrounding whitespace
        trimmed.
    """
    q_str = re.sub(r'\(Q\d+\)', '', q_str)
    return q_str.strip()

### Remove the time component (hh:mm:ss) from date of birth

In [21]:
def purify_dob(dob):
    """Reduce a timestamp like "+1970-08-17T00:00:00Z" to "1970-08-17".

    Everything from the first "T" onwards is discarded, and a leading "+"
    sign is removed if present. Previously the first character was dropped
    unconditionally, which corrupted values that carry no sign
    (e.g. "1970-08-17T..." became "970-08-17"). A leading "-" is kept so a
    negative year is not silently turned into a positive one.

    Parameters
    ----------
    dob : str
        Timestamp string.

    Returns
    -------
    str
        The date portion without a leading "+" and without the time.
    """
    date_part = dob.partition('T')[0]
    if date_part.startswith('+'):
        date_part = date_part[1:]
    return date_part

### Tests

In [22]:
# Spot check: the trailing "(Q…)" identifier should be stripped.
rem_wikidata_page_id("female (Q6581072)")

'female'

In [23]:
# Spot check: the leading sign and the time of day should be dropped.
purify_dob("+1970-08-17T00:00:00Z")

'1970-08-17'

### Shortlisted attributes mapped to their cleaning functions

In [24]:
# Shortlisted Wikidata attributes, each mapped to the function used to clean
# its value. None means no cleaning function is assigned to that attribute.
wikidata_props_function_mapping={
    "sex or gender (P21)":None,
    "occupation (P106)":None,
    "given name (P735)":None,
    "name in native language (P1559)":None,
    "birth name (P1477)":None,
    "languages spoken, written or signed (P1412)":None,
    "native language (P103)":None,
    "writing language (P6886)":None,

    "educated at (P69)":None,
    "academic degree (P512)":None,
    "religion (P140)":None,
    "ethnic group (P172)":None,
    "date of birth (P569)":purify_dob,
    "date of death (P570)":purify_dob,
    "cause of death (P509)":None,
    "manner of death (P1196)":None,
    "residence (P551)":None,
    "work location (P937)":None,
    "country of citizenship (P27)":None,
    "employer (P108)":None,
    "spouse (P26)":None,
    "image (P18)":None,
    "position held (P39)":None,
    "place of birth (P19)":None,
    "place of death (P20)":None,
    "award received (P166)":None,
    "DBLP author ID (P2456)":None,
    "family name (P734)":None,
    "father (P22)":None,
    "mother (P25)":None,
    "sibling (P3373)":None,  # the comma was missing here (syntax error)
    "child (P40)":None,
    "doctoral student (P185)":None,
    "field of work (P101)":None,
    "member of (P463)":None,
    "Google Scholar author ID (P1960)":None,
    "ACM Digital Library author ID (P864)":None,
    "official website (P856)":None,
    "notable work (P800)":None,
    "Twitter username (P2002)":None,
    "social media followers (P8687)":None,
    "student (P802)":None,
    "student of (P1066)":None,
    "influenced by (P737)":None,
    "participant in (P1344)":None,
    "academic thesis (P1026)":None,
    "GitHub username (P2037)":None,
    "official blog (P1581)":None,
    "YouTube channel ID (P2397)":None,
    "affiliation (P1416)":None,
    "professorship (P803)":None,
    "described at URL (P973)":None,

    "Google Knowledge Graph ID (P2671)":None,
    "WorldCat Identities ID (P7859)":None,
    "Library of Congress authority ID (P244)":None,

    "ISNI (P213)":None,

    "NUKAT ID (P1207)":None,
    "Nationale Thesaurus voor Auteurs ID (P1006)":None,

    "IdRef ID (P269)":None,
    "GND ID (P227)":None,

    "ResearcherID (P1053)":None,
    "ResearchGate profile ID (P2038)":None,
    "zbMATH author ID (P1556)":None,
    "SHARE Catalogue author ID (P3987)":None,
    "Biblioth\u00e8que nationale de France ID (P268)":None,
    "ORCID iD (P496)":None,
    "PLWABN ID (P7293)":None,
    "Erd\u0151s number (P2021)":None,
    "Scopus author ID (P1153)":None,
    "FAST ID (P2163)":None,
    "National Diet Library ID (P349)":None,
    "Academic Tree ID (P2381)":None,
    "described by source (P1343)":None,
    "CiNii author ID (books) (P271)":None,
    "Encyclop\u00e6dia Britannica Online ID (P1417)":None,
    # NOTE(review): this entry was truncated in the notebook; the property
    # id was restored as P6634 - confirm against the attribute list.
    "LinkedIn personal profile ID (P6634)":None,
    "Semantic Scholar author ID (P4012)":None,
    "IEEE Xplore author ID (P6479)":None,
}