In [1]:
import pandas as pd

In [2]:
# Locations of the World Bank export (indicator NY.GDP.PCAP.CD) and of its
# accompanying country-metadata file, relative to this notebook.
DATA_FILE = (
    "../../data/data_worldbank/"
    "API_NY.GDP.PCAP.CD_DS2_en_csv_v2_2055804.csv"
)
METADATA_COUNTRY_FILE = (
    "../../data/data_worldbank/"
    "Metadata_Country_API_NY.GDP.PCAP.CD_DS2_en_csv_v2_2055804.csv"
)

# Load the indicator table; the first four lines of the file are skipped so that
# parsing starts at the header row.
df = pd.read_csv(filepath_or_buffer=DATA_FILE, skiprows=4)

# Analysis for relevant data subset

In [3]:
# Preview the first three rows of the freshly loaded table.
df.iloc[:3]

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,Unnamed: 65
0,Aruba,ABW,GDP per capita (current US$),NY.GDP.PCAP.CD,,,,,,,...,24713.698045,26189.435509,26647.938101,27980.880695,28281.350482,29007.693003,,,,
1,Afghanistan,AFG,GDP per capita (current US$),NY.GDP.PCAP.CD,59.773194,59.860874,58.458015,78.706388,82.095231,101.108305,...,641.871479,637.165523,613.856689,578.466353,509.218661,519.884773,493.750418,507.103432,,
2,Angola,AGO,GDP per capita (current US$),NY.GDP.PCAP.CD,,,,,,,...,5100.095808,5254.882338,5408.410496,4166.979684,3506.072885,4095.812942,3289.646664,2790.726615,,


In [4]:
# Size of the loaded dataset as (number of rows, number of columns).
df.shape

(264, 66)

In [5]:
# Columns of interest: the identifying keys plus the two most recent year columns.
relevant_cols = ["Country Name", "Country Code", "Indicator Code", "2019", "2020"]

# Number of missing values in each of those columns.
df[relevant_cols].isna().sum()

Country Name        0
Country Code        0
Indicator Code      0
2019               34
2020              264
dtype: int64

In [6]:
# Findings from the missing-value counts:
#   - the key columns (country name, country code, indicator code) have no
#     missing values;
#   - for 2020 no GDP value exists at all;
#   - 2019 is the latest year with GDP values and has only 34 missing values.
#
# Drop the empty 2020 column from the selection. The list is rebuilt with a
# filter instead of calling list.remove so that this cell is safe to re-run:
# a second execution is a no-op rather than a ValueError.
relevant_cols = [col for col in relevant_cols if col != '2020']

# Preprocessing

In [7]:
# Remember the indicator's display name (taken from the first row) before the
# 'Indicator Name' column disappears in the selection below.
INDICATOR_NAME = df['Indicator Name'].iat[0]

# Narrow the table down to the columns chosen in `relevant_cols`.
df = df[relevant_cols]

print(f"indicator name: {INDICATOR_NAME}")
df.head(5)

indicator name: GDP per capita (current US$)


Unnamed: 0,Country Name,Country Code,Indicator Code,2019
0,Aruba,ABW,NY.GDP.PCAP.CD,
1,Afghanistan,AFG,NY.GDP.PCAP.CD,507.103432
2,Angola,AGO,NY.GDP.PCAP.CD,2790.726615
3,Albania,ALB,NY.GDP.PCAP.CD,5353.244856
4,Andorra,AND,NY.GDP.PCAP.CD,40886.391165


In [8]:
# Show the rows that have no value in the 2019 column.
df.loc[df['2019'].isna()]

Unnamed: 0,Country Name,Country Code,Indicator Code,2019
0,Aruba,ABW,NY.GDP.PCAP.CD,
9,American Samoa,ASM,NY.GDP.PCAP.CD,
36,Channel Islands,CHI,NY.GDP.PCAP.CD,
48,Cuba,CUB,NY.GDP.PCAP.CD,
50,Cayman Islands,CYM,NY.GDP.PCAP.CD,
67,Eritrea,ERI,NY.GDP.PCAP.CD,
76,Faroe Islands,FRO,NY.GDP.PCAP.CD,
77,"Micronesia, Fed. Sts.",FSM,NY.GDP.PCAP.CD,
82,Gibraltar,GIB,NY.GDP.PCAP.CD,
89,Greenland,GRL,NY.GDP.PCAP.CD,


In [9]:
# Report the table size before and after discarding incomplete rows.
print(df.shape)
df = df.dropna(axis=0, how="any")  # remove every row that contains at least one NaN
print(df.shape)

(264, 4)
(230, 4)


In [10]:
# Round the 2019 GDP values to two decimals.
# DataFrame.round returns a new frame, so no column is assigned into a frame
# that was itself derived by column selection and dropna (a direct column
# assignment on such a frame can trigger pandas' SettingWithCopyWarning).
df = df.round({"2019": 2})

# RDF Conversion

In [11]:
from rdflib import Graph, Literal, URIRef, Namespace #basic RDF handling
from rdflib.namespace import RDF, RDFS, FOAF, XSD  #most common namespaces
import urllib.parse #for parsing strings to URI's

In [12]:
# Quick look at the cleaned table that is about to be converted to RDF.
df.iloc[:2]

Unnamed: 0,Country Name,Country Code,Indicator Code,2019
1,Afghanistan,AFG,NY.GDP.PCAP.CD,507.1
2,Angola,AGO,NY.GDP.PCAP.CD,2790.73


In [13]:
# Initialise the graph and the namespaces (prefixes) used in the export.
g = Graph()
schema = Namespace('https://schema.org/')
geonames = Namespace('https://www.geonames.org/countries/')
wikidata_predicate = Namespace('https://www.wikidata.org/wiki/Property:')
project = Namespace("https://w3id.org/um/ken4256/project/reification/")

g.bind("schema", schema)
g.bind("geonames", geonames)
g.bind("wikidata_predicate", wikidata_predicate)
g.bind("project", project)

# Predicate that links a country to its 2019 value.
# Indexing a Namespace already yields a URIRef, so no extra URIRef(...) wrapping
# is needed anywhere below.
gdp_property = wikidata_predicate['P2132']

# Create the triples: one data triple set plus one reified statement per row.
for _, row in df.iterrows():
    country = geonames[row['Country Code'] + "/"]
    statement = project['statement_' + row['Country Code']]

    # Typed as xsd:float, following the rdflib Literal documentation:
    # https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html?highlight=float#rdflib.term.Literal
    # The very same literal is reused for the reified rdf:object so that the
    # reification describes exactly the triple asserted in the data section
    # (a fractional value typed as xsd:integer would be an ill-formed literal).
    gdp_value = Literal(row['2019'], datatype=XSD.float)

    # data
    g.add((country, RDF.type, schema['Country']))
    g.add((country, RDFS.label, Literal(row['Country Name'], lang='en')))
    g.add((country, gdp_property, gdp_value))

    # Reification statements
    # NOTE(review): the texts below say "nominal GDP" while the loaded indicator
    # is "GDP per capita (current US$)" — confirm the intended wording.
    g.add((statement, RDF.type, RDF.Statement))
    g.add((statement, RDF.subject, country))
    g.add((statement, RDF.predicate, gdp_property))
    g.add((statement, RDF.object, gdp_value))
    g.add((statement, RDFS.comment, Literal(f"Shows the nominal gross domestic product from the year 2019 for {row['Country Name']}, see https://www.wikidata.org/wiki/Q28501082", lang="en")))
    g.add((statement, RDFS.label, Literal(f"nominal GDP 2019 for {row['Country Name']}", lang="en")))

# Write the graph to disk in Turtle syntax.
g.serialize(destination='output_dataworldbank.ttl', format='turtle')

In [68]:
# Debug helper, left disabled: prints the whole graph serialised as Turtle.
# NOTE(review): the .decode('UTF-8') call implies serialize() returned bytes when
# this was written — confirm against the installed rdflib version before
# re-enabling.
#print(g.serialize(format='turtle').decode('UTF-8'))

@prefix ns1: <https://www.wikidata.org/wiki/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<https://w3id.org/um/ken4256/project/reification/statement_AFG> a rdf:Statement ;
    rdfs:label "nominal GDP 2019 for Afghanistan"@en ;
    rdf:object 507.1 ;
    rdf:predicate ns1:Property:P2132 ;
    rdf:subject <https://www.geonames.org/countries/AFG/> ;
    rdfs:comment "Shows the nominal gross domestic product from the year 2019 for Afghanistan, see https://www.wikidata.org/wiki/Q28501082"@en .

<https://w3id.org/um/ken4256/project/reification/statement_AGO> a rdf:Statement ;
    rdfs:label "nominal GDP 2019 for Angola"@en ;
    rdf:object 2790.73 ;
    rdf:predicate ns1:Property:P2132 ;
    rdf:subject <https://www.geonames.org/countries/AGO/> ;
    rdfs:comment "Shows the nominal gross domestic product from the year 2019 for Angola, see https://www.wikidata.org/wik