In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

# World Bank

In [22]:
# SPARQL query for the World Bank endpoint: every resource typed as
# <http://worldbank.org/Country>, its dc:identifier (exposed as ?label) and,
# when present, its owl:sameAs link.
# Only variables that the WHERE clause can bind are projected; the previously
# selected ?life_expectancy was never bound and always came back empty.
query = """
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT DISTINCT ?country ?sameAsCountry ?label WHERE {

    ?country a <http://worldbank.org/Country> .
    #?country <http://www.w3.org/2000/01/rdf-schema#label> ?label .
    ?country <http://purl.org/dc/elements/1.1/identifier> ?label .
    OPTIONAL{?country <http://www.w3.org/2002/07/owl#sameAs> ?sameAsCountry .}
}
"""

In [23]:
# Run the World Bank query and collect one record per result binding.
sparql = SPARQLWrapper("https://labs.tib.eu/sdm/worldbank_endpoint/sparql")
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

world_bank = []
for r in results['results']['bindings']:
    row = {'country': r['country']['value'], 'label': r['label']['value']}
    # ?sameAsCountry comes from an OPTIONAL pattern, so a binding may lack it.
    if 'sameAsCountry' in r:
        row['sameAsCountry'] = r['sameAsCountry']['value']
    world_bank.append(row)

# Passing `columns` guarantees that `sameAsCountry` exists (filled with NaN)
# even if no binding carries a sameAs link or the result set is empty, so the
# later `world_bank['sameAsCountry']` lookups cannot raise KeyError.
world_bank = pd.DataFrame(world_bank, columns=['country', 'label', 'sameAsCountry'])

# Countries without the property sameAs

In [25]:
# Keep only the rows whose sameAs link is missing.
country_nan_link = world_bank.loc[world_bank['sameAsCountry'].isna()]
# Show the size of that subset, followed by its first rows.
display(country_nan_link.shape, country_nan_link.head())

(50, 3)

Unnamed: 0,country,label,sameAsCountry
859,http://worldbank.org/Country/AFE,AFE,
860,http://worldbank.org/Country/AFW,AFW,
861,http://worldbank.org/Country/ARB,ARB,
862,http://worldbank.org/Country/CEB,CEB,
863,http://worldbank.org/Country/CHI,CHI,


In [5]:
# Save the countries that have no sameAs link. `index=False` is the documented
# way to leave the DataFrame index out of the file (`index=None` only worked
# because None is falsy).
country_nan_link.to_csv('country_without_link_to_Wikipedia.csv', index=False)

In [26]:
# Drop the rows without a sameAs link, then keep only links that point to a
# Wikidata entity.
world_bank = world_bank.dropna().reset_index(drop=True)
# `regex=False` matches the URL literally; by default `str.contains` treats the
# pattern as a regular expression, in which every "." matches any character.
world_bank = world_bank[
    world_bank['sameAsCountry'].str.contains('http://www.wikidata.org/entity/', regex=False)
]
world_bank

Unnamed: 0,country,label,sameAsCountry
3,http://worldbank.org/Country/ABW,ABW,http://www.wikidata.org/entity/Q21203
7,http://worldbank.org/Country/AFG,AFG,http://www.wikidata.org/entity/Q889
11,http://worldbank.org/Country/AGO,AGO,http://www.wikidata.org/entity/Q916
15,http://worldbank.org/Country/ALB,ALB,http://www.wikidata.org/entity/Q222
19,http://worldbank.org/Country/AND,AND,http://www.wikidata.org/entity/Q228
...,...,...,...
842,http://worldbank.org/Country/WSM,WSM,http://www.wikidata.org/entity/Q683
846,http://worldbank.org/Country/YEM,YEM,http://www.wikidata.org/entity/Q805
850,http://worldbank.org/Country/ZAF,ZAF,http://www.wikidata.org/entity/Q258
854,http://worldbank.org/Country/ZMB,ZMB,http://www.wikidata.org/entity/Q953


# Countries with more than one link to Wikidata

In [6]:
# Number of distinct World Bank countries and of distinct sameAs links.
# `dropna=False` counts a missing value as its own entry, like len(unique()).
display(
    world_bank['country'].nunique(dropna=False),
    world_bank['sameAsCountry'].nunique(dropna=False),
)

215

215

In [7]:
# How often each sameAs link occurs; show the first eight entries.
display(world_bank['sameAsCountry'].value_counts().head(8))

http://www.wikidata.org/entity/Q21203    1
http://www.wikidata.org/entity/Q1027     1
http://www.wikidata.org/entity/Q833      1
http://www.wikidata.org/entity/Q1030     1
http://www.wikidata.org/entity/Q33788    1
http://www.wikidata.org/entity/Q1032     1
http://www.wikidata.org/entity/Q1033     1
http://www.wikidata.org/entity/Q811      1
Name: sameAsCountry, dtype: int64

In [8]:
# Rows whose sameAs link is the Wikidata entity Q423.
world_bank[world_bank['sameAsCountry'].eq('http://www.wikidata.org/entity/Q423')]

Unnamed: 0,country,label,sameAsCountry
630,http://worldbank.org/Country/PRK,PRK,http://www.wikidata.org/entity/Q423


# Countries from Wikidata with property 'life_expectancy'

In [19]:
# Wikidata entities that have a P298 value (returned as ?o and stored as
# `label`) and at least one P2250 statement.
# NOTE(review): P298 looks like the ISO alpha-3 country code and P2250 like
# life expectancy, judging by the section heading and the output — confirm.
query = """
SELECT DISTINCT ?sameAsCountry ?o
WHERE 
{
  #?sameAsCountry wdt:P31 wd:Q6256 .
  ?sameAsCountry wdt:P298 ?o .
  ?sameAsCountry p:P2250 ?item .
  ?item ps:P2250 ?year_exp .
  OPTIONAL{?item pq:P518 ?gender .}
}
"""
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# One record per binding. The stray `row['sameAsCountry']=[]` that used to sit
# here was removed: it mutated a variable left over from an earlier cell, had
# no effect on the result, and raised NameError when that cell had not run.
wiki = []
for r in results['results']['bindings']:
    row = {'sameAsCountry': r['sameAsCountry']['value'], 'label': r['o']['value']}
    wiki.append(row)

# Explicit columns keep the frame mergeable on 'sameAsCountry' even when the
# endpoint returns no bindings.
wiki = pd.DataFrame(wiki, columns=['sameAsCountry', 'label'])
display(wiki.shape, wiki.head(2))

(206, 2)

Unnamed: 0,sameAsCountry,label
0,http://www.wikidata.org/entity/Q34,SWE
1,http://www.wikidata.org/entity/Q20,NOR


# Load file from a federated query

In [10]:
# Read the interlinking CSV into a DataFrame.
country_interlinking = pd.read_csv(filepath_or_buffer='interlinking.csv')
# Report its dimensions and preview the first two rows.
display(country_interlinking.shape, country_interlinking.iloc[:2])

(202, 1)

Unnamed: 0,sameAsCountry
0,http://www.wikidata.org/entity/Q781
1,http://www.wikidata.org/entity/Q408


In [20]:
# Outer-join the Wikidata result with the interlinking file; the `_merge`
# indicator column records which side each row came from.
df1 = wiki.merge(country_interlinking, on='sameAsCountry', how='outer', indicator=True)
# Keep the rows present only on the Wikidata side; the old index is preserved
# as an `index` column.
df1 = df1[df1['_merge'] == 'left_only'].reset_index()
df1

Unnamed: 0,index,sameAsCountry,label,_merge
0,152,http://www.wikidata.org/entity/Q219060,PSE,left_only
1,153,http://www.wikidata.org/entity/Q4628,FRO,left_only
2,193,http://www.wikidata.org/entity/Q238,SMR,left_only
3,203,http://www.wikidata.org/entity/Q230,GEO,left_only


In [27]:
# World Bank rows whose label appears among the labels collected in df1.
world_bank[world_bank['label'].isin(df1['label'].tolist())]

Unnamed: 0,country,label,sameAsCountry
259,http://worldbank.org/Country/FRO,FRO,http://www.wikidata.org/entity/Q4575
275,http://worldbank.org/Country/GEO,GEO,http://www.wikidata.org/entity/Q1428
642,http://worldbank.org/Country/PSE,PSE,http://www.wikidata.org/entity/Q407199
694,http://worldbank.org/Country/SMR,SMR,http://www.wikidata.org/entity/Q238


In [21]:
# Same outer join as above, this time keeping the rows that exist only in the
# interlinking file.
df2 = wiki.merge(country_interlinking, on='sameAsCountry', how='outer', indicator=True)
df2 = df2[df2['_merge'] == 'right_only'].reset_index()
# Dimensions first, then a two-row preview.
display(df2.shape, df2.iloc[:2])

(0, 4)

Unnamed: 0,index,sameAsCountry,label,_merge


In [15]:
# Number of distinct Wikidata entities in the Wikidata result.
wiki['sameAsCountry'].nunique(dropna=False)

167

# Federated Query

In [9]:
# Read the saved query result from 'Query-1.csv'.
# Note: the DataFrame shares its name with its own `sameAsCountry` column.
sameAsCountry = pd.read_csv(filepath_or_buffer='Query-1.csv')
sameAsCountry

Unnamed: 0,country,sameAsCountry,life_expectancy
0,http://worldbank.org/Country/AGO,http://www.wikidata.org/entity/Q916,61.54700
1,http://worldbank.org/Country/AFG,http://www.wikidata.org/entity/Q889,63.67300
2,http://worldbank.org/Country/ABW,http://www.wikidata.org/entity/Q21203,75.86700
3,http://worldbank.org/Country/ARE,http://www.wikidata.org/entity/Q878,77.25600
4,http://worldbank.org/Country/ALB,http://www.wikidata.org/entity/Q222,78.34500
...,...,...,...
163,http://worldbank.org/Country/KWT,http://www.wikidata.org/entity/Q817,74.69400
164,http://worldbank.org/Country/LTU,http://www.wikidata.org/entity/Q37,74.32195
165,http://worldbank.org/Country/LVA,http://www.wikidata.org/entity/Q211,74.52927
166,http://worldbank.org/Country/MAR,http://www.wikidata.org/entity/Q1028,75.82100


In [10]:
# Distinct World Bank countries vs. distinct Wikidata links in the loaded
# query result.
display(
    sameAsCountry['country'].nunique(dropna=False),
    sameAsCountry['sameAsCountry'].nunique(dropna=False),
)

167

161

# Countries with 'wrong' links to wikidata

In [11]:
# Outer-join the World Bank links with the loaded query result on the
# (country, sameAsCountry) pair.
df1 = world_bank.merge(
    sameAsCountry,
    on=['country', 'sameAsCountry'],
    how='outer',
    indicator=True,
)
# Keep the pairs found only on the World Bank side.
df1 = df1[df1['_merge'] == 'left_only'].reset_index()
df1

Unnamed: 0,index,country,label,sameAsCountry,life_expectancy,_merge
0,4,http://worldbank.org/Country/AND,Andorra,http://www.wikidata.org/entity/Q228,,left_only
1,6,http://worldbank.org/Country/ARG,Argentina,http://www.wikidata.org/entity/Q154574,,left_only
2,8,http://worldbank.org/Country/ASM,American Samoa,http://www.wikidata.org/entity/Q16641,,left_only
3,10,http://worldbank.org/Country/AUS,Australia,http://www.wikidata.org/entity/Q781244,,left_only
4,11,http://worldbank.org/Country/AUT,Austria,http://www.wikidata.org/entity/Q131964,,left_only
5,14,http://worldbank.org/Country/BEL,Belgium,http://www.wikidata.org/entity/Q239,,left_only
6,26,http://worldbank.org/Country/BRA,Brazil,http://www.wikidata.org/entity/Q598435,,left_only
7,33,http://worldbank.org/Country/CHE,Switzerland,http://www.wikidata.org/entity/Q4225,,left_only
8,35,http://worldbank.org/Country/CHN,China,http://www.wikidata.org/entity/Q136211,,left_only
9,40,http://worldbank.org/Country/COL,Colombia,http://www.wikidata.org/entity/Q62823,,left_only


In [12]:
# Number of distinct countries in df1.
df1['country'].nunique(dropna=False)

48

In [14]:
# Export the country / label / link columns of df1. `index=False` is the
# documented way to omit the DataFrame index (`index=None` only worked because
# None is falsy).
df1[['country', 'label', 'sameAsCountry']].to_csv(
    'Countries_with_wrong_link_to_wikidata.csv', index=False
)


In [29]:
# World Bank countries that do not appear in the loaded query result.
set(world_bank['country']) - set(sameAsCountry['country'])

{'http://worldbank.org/Country/AND',
 'http://worldbank.org/Country/ARG',
 'http://worldbank.org/Country/ASM',
 'http://worldbank.org/Country/AUS',
 'http://worldbank.org/Country/AUT',
 'http://worldbank.org/Country/BEL',
 'http://worldbank.org/Country/BRA',
 'http://worldbank.org/Country/CHE',
 'http://worldbank.org/Country/CHN',
 'http://worldbank.org/Country/COL',
 'http://worldbank.org/Country/DEU',
 'http://worldbank.org/Country/DNK',
 'http://worldbank.org/Country/ESP',
 'http://worldbank.org/Country/FRA',
 'http://worldbank.org/Country/FRO',
 'http://worldbank.org/Country/GBR',
 'http://worldbank.org/Country/GEO',
 'http://worldbank.org/Country/GIB',
 'http://worldbank.org/Country/GRL',
 'http://worldbank.org/Country/IND',
 'http://worldbank.org/Country/ISR',
 'http://worldbank.org/Country/JPN',
 'http://worldbank.org/Country/LBN',
 'http://worldbank.org/Country/LBY',
 'http://worldbank.org/Country/MAF',
 'http://worldbank.org/Country/MCO',
 'http://worldbank.org/Country/MEX',
 