In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import numpy as np

In [2]:
query = """
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT DISTINCT ?country ?indicator ?value ?year WHERE {

    ?country a <http://worldbank.org/Country> .
    ?country <http://worldbank.org/hasAnnualIndicatorEntry> ?annualIndicator .
    ?annualIndicator <http://www.w3.org/2002/07/owl#hasValue> ?value .
    ?annualIndicator <http://www.w3.org/2006/time#year> ?year .
    ?annualIndicator <http://worldbank.org/hasIndicator> ?indicator .
    
    FILTER(?indicator in (<http://worldbank.org/Indicator/NY.GDP.MKTP.CD>,
    <http://worldbank.org/Indicator/NE.DAB.TOTL.CD>, <http://worldbank.org/Indicator/NY.GDP.PCAP.CN>,
    <http://worldbank.org/Indicator/SI.POV.GINI>, <http://worldbank.org/Indicator/NV.IND.TOTL.ZS>,
    <http://worldbank.org/Indicator/EN.ATM.CO2E.PC>) AND (?year=2019))

}
"""

In [3]:
def convert_to_category(world_bank, indicator):
    df = world_bank.loc[world_bank.indicator==indicator]
    df["value"] = pd.to_numeric(df["value"], downcast="float")
    df = df.sort_values(by='value')  #, ascending=False
    a = list(df['value'].values)
    n_split = np.array_split(a, 5)
    category = pd.cut(df.value, bins=[min(n_split[0])-1, max(n_split[0]), max(n_split[1]), max(n_split[2]),
                                      max(n_split[3]), max(n_split[4])],
                      labels=['low', 'mediumLow', 'medium', 'mediumHigh', 'high'])
    df.insert(3, 'category_indicator',category)
    df.drop(columns='value', inplace=True)
    return df

In [4]:
sparql = SPARQLWrapper("https://labs.tib.eu/sdm/worldbank_endpoint/sparql")
world_bank = []

sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()


for r in results['results']['bindings']:
    row = {'country': r['country']['value'].replace('http://worldbank.org/Country/', ''),
           'indicator': r['indicator']['value'].replace('http://worldbank.org/Indicator/', ''),
           'value': r['value']['value']}
    
    world_bank.append(row)

world_bank = pd.DataFrame.from_dict(world_bank)
display(world_bank.shape, world_bank.head(2))

(1218, 3)

Unnamed: 0,country,indicator,value
0,MWI,NY.GDP.MKTP.CD,11025400000.0
1,MWI,NV.IND.TOTL.ZS,18.5355


In [5]:
country_ind = pd.DataFrame()
for indicator in ['NY.GDP.MKTP.CD', 'NE.DAB.TOTL.CD', 'NY.GDP.PCAP.CN',
                  'SI.POV.GINI', 'NV.IND.TOTL.ZS', 'EN.ATM.CO2E.PC']:
    country_ind = pd.concat([country_ind, convert_to_category(world_bank, indicator)])

country_ind

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["value"] = pd.to_numeric(df["value"], downcast="float")


Unnamed: 0,country,indicator,category_indicator
266,TUV,NY.GDP.MKTP.CD,low
46,NRU,NY.GDP.MKTP.CD,low
728,KIR,NY.GDP.MKTP.CD,low
833,MHL,NY.GDP.MKTP.CD,low
79,PLW,NY.GDP.MKTP.CD,low
...,...,...,...
1004,BRN,EN.ATM.CO2E.PC,high
981,ARE,EN.ATM.CO2E.PC,high
999,BHR,EN.ATM.CO2E.PC,high
1098,KWT,EN.ATM.CO2E.PC,high


In [6]:
country_ind = country_ind.drop_duplicates()
country_ind.shape

(1218, 3)

In [7]:
country_ind.to_csv('dataset/country_ind.csv', index=None)