In [1]:
!python --version

Python 3.5.2 :: Anaconda 4.2.0 (x86_64)


In [2]:
import pandas as pd

### Import and Explore Data

In [3]:
import sqlite3

Source: https://www.kaggle.com/worldbank/world-development-indicators

In [4]:
connection = sqlite3.connect("/Users/Alexandre/Workstation/databases/world_development_indicators.db")

In [5]:
cursor = connection.cursor()

What tables do we have?

In [7]:
df = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", connection)

In [8]:
df

Unnamed: 0,name
0,Country
1,CountryNotes
2,Footnotes
3,Indicators
4,Series
5,SeriesNotes


In [423]:
pd.read_sql_query("SELECT * FROM Country LIMIT 1", connection).set_index('CountryCode').sort_index().drop('index', 1).transpose()

CountryCode,AFG
ShortName,Afghanistan
TableName,Afghanistan
LongName,Islamic State of Afghanistan
Alpha2Code,AF
CurrencyUnit,Afghan afghani
SpecialNotes,Fiscal year end: March 20; reporting period fo...
Region,South Asia
IncomeGroup,Low income
Wb2Code,AF
NationalAccountsBaseYear,2002/03


In [501]:
regions_df = pd.read_sql_query("SELECT TableName, Region FROM Country", connection)
regions_df.head()

Unnamed: 0,TableName,Region
0,Afghanistan,South Asia
1,Albania,Europe & Central Asia
2,Algeria,Middle East & North Africa
3,American Samoa,East Asia & Pacific
4,Andorra,Europe & Central Asia


In [532]:
def get_region(country):
    try:
        region = regions_df.loc[regions_df['TableName'].replace("'", '\'') == country.replace("'", '\'')]['Region'].values[0]

SyntaxError: invalid syntax (<ipython-input-531-dea126ea8fff>, line 1)

Preview of table <strong>Indicators</strong>

In [12]:
pd.read_sql_query("SELECT * FROM Indicators LIMIT 1", connection).set_index('CountryCode').sort_index().drop('index', 1)

Unnamed: 0_level_0,CountryName,IndicatorName,IndicatorCode,Year,Value
CountryCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ARB,Arab World,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1960,133.560907


In [13]:
indicators_df = pd.read_sql_query("SELECT DISTINCT IndicatorName, IndicatorCode FROM Indicators", connection)

In [14]:
indicators_df = indicators_df.sort_values('IndicatorName').reset_index().drop('index', 1).sort_index()

Let's print them all out.

In [15]:
for index, row in indicators_df.iterrows():
    print(row['IndicatorName'] + '  ' + row['IndicatorCode'])

2005 PPP conversion factor, GDP (LCU per international $)  PA.NUS.PPP.05
2005 PPP conversion factor, private consumption (LCU per international $)  PA.NUS.PRVT.PP.05
ARI treatment (% of children under 5 taken to a health provider)  SH.STA.ARIC.ZS
Access to electricity (% of population)  EG.ELC.ACCS.ZS
Access to electricity, rural (% of rural population)  EG.ELC.ACCS.RU.ZS
Access to electricity, urban (% of urban population)  EG.ELC.ACCS.UR.ZS
Access to non-solid fuel (% of population)  EG.NSF.ACCS.ZS
Access to non-solid fuel, rural (% of rural population)  EG.NSF.ACCS.RU.ZS
Access to non-solid fuel, urban (% of urban population)  EG.NSF.ACCS.UR.ZS
Adequacy of social insurance programs (% of total welfare of beneficiary households)  per_si_allsi.adq_pop_tot
Adequacy of social protection and labor programs (% of total welfare of beneficiary households)  per_allsp.adq_pop_tot
Adequacy of social safety net programs (% of total welfare of beneficiary households)  per_sa_allsa.adq_pop_tot
Ad

1343 indicators! Which ones shall I choose?

In [283]:
interesting_indicators = {
    'exposure_percentage': {
        'code': 'EN.ATM.PM25.MC.ZS',
        'name': 'PM2.5 air pollution, population exposed to levels exceeding WHO guideline value (% of total)'},
    'exposure_mean': {
        'code': 'EN.ATM.PM25.MC.M3',
        'name': 'PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)' },
    'emission_co2': {
        'code': 'EN.ATM.CO2E.PC',  
        'name': 'CO2 emissions (metric tons per capita)'},
    'emission_co2_tot': {
        'code': 'EN.ATM.CO2E.KT',  
        'name': 'CO2 emissions (kt)'},
    'population': {
        'code': 'SP.POP.TOTL',
        'name': 'population total'},
    'gdp_percap': {
        'code': 'NY.GDP.PCAP.CD',
        'name': 'GDP per capita (current US$)'}
}

Let's make a data frame with only the interesting indicators instead of 1343 of them.

In [284]:
query_string = ' OR '.join(
    ["IndicatorCode='" + interesting_indicators[indicator]['code'] + "'" for indicator in interesting_indicators.keys()]
)

In [285]:
query_string

"IndicatorCode='EN.ATM.PM25.MC.ZS' OR IndicatorCode='SP.POP.TOTL' OR IndicatorCode='EN.ATM.PM25.MC.M3' OR IndicatorCode='NY.GDP.PCAP.CD' OR IndicatorCode='EN.ATM.CO2E.PC'"

In [286]:
data = pd.read_sql_query("SELECT * FROM Indicators " +
                         "WHERE " + query_string, 
                          connection).set_index('CountryCode').sort_index() 

### Extract Uniform Data

Define a function to extract required data for a given country and indicator code.

In [355]:
def get_data(code, country):
    country_data = data.loc[data['CountryName'] == country]
    df = country_data.loc[country_data['IndicatorCode'] == interesting_indicators[code]['code']]
    df = df.rename(columns = {'Value': code}).set_index('Year').sort_index()
    return df[code]

In [359]:
def get_data_for_country(country):
    return pd.concat(
        [get_data(indicator, country).to_frame() for indicator in interesting_indicators], axis=1, join='inner'
    )

In [324]:
all_countries = sorted(data.CountryName.unique())
len(all_countries)

247

In [379]:
records_wanted = get_data_for_country('Canada').index

In [371]:
countries_with_good_record = []
for country in all_countries:
    if (records_wanted.identical(get_data_for_country(country).index)):
        countries_with_good_record.append(country)
        
len(countries_with_good_record)

172

In [373]:
for country in all_countries:
    if country not in countries_with_good_record:
        print (country)

Afghanistan
American Samoa
Andorra
Armenia
Aruba
Azerbaijan
Belarus
Bermuda
Bosnia and Herzegovina
Cambodia
Cayman Islands
Channel Islands
Croatia
Curacao
Czech Republic
Eritrea
Estonia
Europe & Central Asia (all income levels)
Europe & Central Asia (developing only)
Faeroe Islands
French Polynesia
Georgia
Germany
Greenland
Guam
Haiti
High income: nonOECD
Hong Kong SAR, China
Hungary
Iraq
Isle of Man
Kazakhstan
Korea, Dem. Rep.
Kosovo
Kyrgyz Republic
Latvia
Lesotho
Liechtenstein
Lithuania
Low income
Macao SAR, China
Macedonia, FYR
Maldives
Marshall Islands
Micronesia, Fed. Sts.
Middle East & North Africa (developing only)
Moldova
Monaco
Montenegro
Myanmar
New Caledonia
Northern Mariana Islands
Palau
Puerto Rico
Russian Federation
San Marino
Sao Tome and Principe
Serbia
Sint Maarten (Dutch part)
Slovak Republic
Slovenia
Somalia
South Sudan
St. Kitts and Nevis
St. Martin (French part)
Syrian Arab Republic
Tajikistan
Timor-Leste
Turkmenistan
Turks and Caicos Islands
Tuvalu
Ukraine
Uzbekis

### Let's Make a Scatter Plot

In [326]:
from plotly.offline import init_notebook_mode, iplot
from IPython.display import display, HTML
from random import randint
init_notebook_mode(connected=True)

In [327]:
def get_random_color():
    r1 = str(randint(0,255))
    r2 = str(randint(0,255))
    r3 = str(randint(0,255))
    return 'rgba(' + r1 + ', ' + r2 + ', ' + r3 + ', .9)'
get_random_color()

'rgba(97, 73, 161, .9)'

https://plot.ly/python/reference/#scatter

In [430]:
x_axis = 'emission_co2'
y_axis = 'exposure_mean'

In [431]:
figure = {
    'data': [], # Array of traces (dictionnaries)
    'frames': [], # Array of frames (dictionnaries)
    'layout': {}, # Dictionnary of layout properties
}

In [461]:
figure['layout'] = {
    'xaxis': {
        'range': [0.02, 2], 
        'autorange': False,
        'type': 'log',
        'title': interesting_indicators[x_axis]['name']
    },
    'yaxis': {
        'range': [0.1, 2],
        'autorange': False,
        'type': 'log',
        'zeroline' : False,
        'title': interesting_indicators[y_axis]['name']

    },
    'hovermode': 'closest',
    'height': 700
}

In [462]:
def make_trace(country):
    trace_data = get_data_for_country(country)
    return {
        'x': trace_data[x_axis].values, 
        'y': trace_data[y_axis].values,
        'name': country,
        'mode': 'lines', 
        'text': ['Year '+ str(trace_data[x_axis].index.values[i]) for i in range(len(trace_data[x_axis].index.values))],
        # Color each line with a random color.
        'line': {'width':2, 'color': get_random_color()},
        # Starting point: show only Canada.
        'visible': 'legendonly' if country != 'Canada' else True
    }

In [463]:
figure['data'] = [
    make_trace(country) for country in countries_with_good_record
]

In [464]:
iplot(figure)

In [513]:
figure['layout'] = {
    'xaxis': {
        'type': 'log',
        'title': interesting_indicators[x_axis]['name']
    },
    'yaxis': {
        'type': 'log',
        'zeroline' : False,
        'title': interesting_indicators[y_axis]['name']

    },
    'hovermode': 'closest',
    'height': 700
}

In [514]:
figure['layout']['updatemenus'] = [
    {
        'buttons': [
            {
                'args': [None, {'frame': {'duration': 500, 'redraw': False},
                         'fromcurrent': True, 'transition': {'duration': 300, 'easing': 'quadratic-in-out'}}],
                'label': 'Play',
                'method': 'animate'
            },
            {
                'args': [[None], {'frame': {'duration': 0, 'redraw': False}, 'mode': 'immediate',
                'transition': {'duration': 0}}],
                'label': 'Pause',
                'method': 'animate'
            }
        ],
        'direction': 'left',
        'pad': {'r': 10, 't': 87},
        'showactive': False,
        'type': 'buttons',
        'x': 0.1,
        'xanchor': 'right',
        'y': 0,
        'yanchor': 'top'
    }
]

In [515]:
years = get_data_for_country(country)[x_axis].index.values

figure['layout']['slider'] = {
    'args': [
        'slider.value', {
            'duration': 400,
            'ease': 'cubic-in-out'
        }
    ],
    'initialValue': years[0],
    'plotlycommand': 'animate',
    'values': years,
    'visible': True
}

# Define my_slide to be inserted into the figure
my_slider = {
    'active': 0,
    'yanchor': 'top',
    'xanchor': 'left',
    'currentvalue': {
        'font': {'size': 20},
        'prefix': 'Year:',
        'visible': True,
        'xanchor': 'right'
    },
    'transition': {'duration': 300, 'easing': 'cubic-in-out'},
    'pad': {'b': 10, 't': 50},
    'len': 0.9,
    'x': 0.1,
    'y': 0,
    'steps': []
}
my_slider['steps'] = [{
    'args': [
        [year],
        {'frame': {'duration': 300, 'redraw': False},
         'mode': 'immediate',
       'transition': {'duration': 300}}
     ],
     'label': year,
     'method': 'animate'
} for year in years]

figure['layout']['sliders'] = [my_slider]


In [533]:
def make_frame(country, index, color):
    trace_data = get_data_for_country(country)
    return {
        'x': [trace_data[x_axis].values[index]], 
        'y': [trace_data[y_axis].values[index]],
        'name': get_region(country),
        'mode': 'markers', 
        # Color each line with a random color.
        'marker': {
            'color': color,
            'sizemode': 'area',
            'sizeref': 500000,
            'size': trace_data['population']
        },
        # Starting point: show only Canada.
        'visible': False if 'income' in country else True
    }

In [534]:
colors = {}
for country in countries_with_good_record:
    colors[country] = get_random_color() 

In [535]:
figure['data'] = [
    make_frame(country, 0, colors[country]) for country in countries_with_good_record
]

Albania
Algeria
Angola
Antigua and Barbuda
Arab World
Argentina
Australia
Austria
Bahamas, The
Bahrain
Bangladesh
Barbados
Belgium
Belize
Benin
Bhutan
Bolivia
Botswana
Brazil
Brunei Darussalam
Bulgaria
Burkina Faso
Burundi
Cabo Verde
Cameroon
Canada
Caribbean small states
Central African Republic
Central Europe and the Baltics
Chad
Chile
China
Colombia
Comoros
Congo, Dem. Rep.
Congo, Rep.
Costa Rica
Cote d'Ivoire


IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
figure['frames'] = [
    {
        'data': [
            make_frame(country, index, colors[country]) for country in countries_with_good_record
        ],
        'name': str(years[index])
    } for index in range(len(years))
]

In [None]:
iplot(figure)