In [37]:
# libraries
import pandas as pd
from bokeh.io import output_notebook, show, curdoc
from bokeh.plotting import figure
from bokeh.models import HoverTool, ColumnDataSource

### Exploratory Data Analysis

In [3]:
# Load the Gapminder CSV (expected next to this notebook) into a DataFrame.
CSV_PATH = "gapminder_tidy.csv"
data = pd.read_csv(CSV_PATH)

In [4]:
# Preview the first five rows to check the column layout.
data.head()

Unnamed: 0,Country,Year,fertility,life,population,child_mortality,gdp,region
0,Afghanistan,1964,7.671,33.639,10474903.0,339.7,1182.0,South Asia
1,Afghanistan,1965,7.671,34.152,10697983.0,334.1,1182.0,South Asia
2,Afghanistan,1966,7.671,34.662,10927724.0,328.7,1168.0,South Asia
3,Afghanistan,1967,7.671,35.17,11163656.0,323.3,1173.0,South Asia
4,Afghanistan,1968,7.671,35.674,11411022.0,318.1,1187.0,South Asia


In [5]:
# Summarise dtypes and non-null counts per column. In the recorded run below,
# fertility, population, child_mortality and gdp all contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10111 entries, 0 to 10110
Data columns (total 8 columns):
Country            10111 non-null object
Year               10111 non-null int64
fertility          10100 non-null float64
life               10111 non-null float64
population         10108 non-null float64
child_mortality    9210 non-null float64
gdp                9000 non-null float64
region             10111 non-null object
dtypes: float64(5), int64(1), object(2)
memory usage: 632.1+ KB


In [6]:
# Index the frame by Year so that a whole year can be selected with data.loc[<year>].
# Guarded so that re-running this cell is a no-op: once "Year" has become the index
# it is no longer a column, and an unconditional set_index("Year") raises KeyError.
if data.index.name != "Year":
    data = data.set_index("Year")

In [7]:
# Exploratory plot: life expectancy against fertility for the single year 1970.
# Select the 1970 rows once and reuse them for every plotted column.
year_1970 = data.loc[1970]

# Column data backing the glyphs; 'country' is only used by the hover tooltip.
source = ColumnDataSource(data=dict(
    x=year_1970.fertility,
    y=year_1970.life,
    country=year_1970.Country,
))

# Hovering over a point shows the country it belongs to.
hover = HoverTool(tooltips='@country')

# Build the figure with labelled axes and the hover tool as its only tool.
p = figure(
    title='1970',
    x_axis_label='Fertility (children per woman)',
    y_axis_label='Life Expectancy (years)',
    plot_height=400,
    plot_width=700,
    tools=[hover],
)

# One circle per row of the source.
p.circle(x='x', y='y', source=source)

# Render inline in the notebook.
output_notebook()
show(p)

In [8]:
# Spot check: fertility and life expectancy for Turkey in 1970.
is_turkey_1970 = (data.index == 1970) & (data.Country == 'Turkey')
data.loc[is_turkey_1970, ["fertility", "life"]]

Unnamed: 0_level_0,fertility,life
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1970,5.563,52.291


## Application
<br>Application code is in <b>gapminder.py</b> at C:\Users\User\Desktop\Data Visualization\Bokeh\case_study_application

In [9]:
# Mask over the 2010 rows: True where both fertility and life expectancy are present.
fertility_2010 = data.loc[2010, "fertility"]
life_2010 = data.loc[2010, "life"]
bool_result = pd.notna(fertility_2010) & pd.notna(life_2010)
bool_result

Year
2010    True
2010    True
2010    True
2010    True
2010    True
        ... 
2010    True
2010    True
2010    True
2010    True
2010    True
Length: 202, dtype: bool

In [10]:
# Countries whose 2010 row passes the bool_result mask (both fertility and life present).
data_yr = data.loc[2010]
complete_rows = data_yr[bool_result]
country_list = complete_rows["Country"].unique().tolist()
len(country_list)

202

In [19]:
# Row(s) holding the smallest recorded population in the whole dataset.
smallest_population = data["population"].min()
data[data["population"] == smallest_population]

Unnamed: 0_level_0,Country,fertility,life,population,child_mortality,gdp,region
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006,Tokelau,,69.0,1170.0,,,East Asia & Pacific


In [21]:
import numpy as np

# Years 1970 through 2010 inclusive (np.arange excludes the stop value, hence 2011).
np.arange(1970, 2011)

array([1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980,
       1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,
       1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
       2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010])

In [23]:
# Largest value in the frame's index (the Year index once set_index("Year") has run).
data.index.max()

2013

In [29]:
# Index values (years) of every row recorded for 'Korea, Rep.'.
korea_rows = data.loc[data.Country == 'Korea, Rep.']
korea_rows.index.to_list()

[1964,
 1965,
 1966,
 1967,
 1968,
 1969,
 1970,
 1971,
 1972,
 1973,
 1974,
 1975,
 1976,
 1977,
 1978,
 1979,
 1980,
 1981,
 1982,
 1983,
 1984,
 1985,
 1986,
 1987,
 1988,
 1989,
 1990,
 1991,
 1992,
 1993,
 1994,
 1995,
 1996,
 1997,
 1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013]

In [45]:
# Number of distinct values in the Country column (a missing value, if any, counts once).
data.Country.nunique(dropna=False)

204

In [44]:
# Slider range used by the application: 1970..2010 inclusive.
all_years = np.arange(1970, 2011)

# Synthetic year list for testing gap detection: the full 1964..2013 span
# (what data[data.Country == 'Korea, Rep.'].index.to_list() returns) with
# three years deliberately left out.
omitted_years = {1973, 1998, 2004}
arr = [year for year in range(1964, 2014) if year not in omitted_years]
len(arr)

47

In [35]:
# Turn both year collections into sets so they can be compared with set algebra.
set_arr, set_all_years = set(arr), set(all_years)

In [36]:
# Years in the 1970..2010 range that are absent from arr.
set_all_years.difference(set_arr)

{1973, 1998, 2004}

In [43]:
# Scratch experiment: y deliberately mixes a non-numeric value ('asdad') in with numbers.
# NOTE(review): no output was captured for this cell — confirm how Bokeh handles the
# string entry before relying on this; consider deleting the cell from the final notebook.
x = [1900,1950,2000]
y = [200,'asdad',500]
plot = figure()
plot.circle(x,y)
show(plot)

In [46]:
# First and last value of the frame's index (the year range once Year is the index).
min(data.index), max(data.index)

(1964, 2013)

In [47]:
# Smallest and largest population over all rows.
# The population column contains missing values (see data.info()). Series.min()/max()
# skip them, whereas the builtin min()/max() compare elements pairwise, so a NaN can
# make their result depend on row order. float() keeps the displayed values plain floats.
float(data.population.min()), float(data.population.max())

(1170.0, 1359368470.0)

In [70]:
# Every population value recorded for Tokelau.
data.loc[data.Country == 'Tokelau', 'population']

Year
2006    1170.0
Name: population, dtype: float64

In [63]:
# All distinct country names, in order of first appearance in the frame.
distinct_countries = data["Country"].unique()
country_list = distinct_countries.tolist()
country_list

['Afghanistan',
 'Albania',
 'Algeria',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Central African Rep.',
 'Chad',
 'Channel Islands',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo, Dem. Rep.',
 'Congo, Rep.',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Rep.',
 'Denmark',
 'Djibouti',
 'Dominican Rep.',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'French Guiana',
 'French Polynesia',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Greenland',
 'Grenada',
 'Guadeloupe',
 'Guam',
 'Guatemala',
 'Guinea',
 'G

In [74]:
# Print every country with fewer than 50 non-null population values.
# 50 is the number of years in the recorded index range (1964..2013), so a country
# below that is missing at least one year of population data.
# The counts are computed once with a groupby instead of re-filtering the whole frame
# for every country; .get(..., 0) keeps the original result (count 0) for a name that
# does not occur in the frame.
population_counts = data.groupby("Country")["population"].count()
for country in country_list:
    if population_counts.get(country, 0) < 50:
        print(country)

Tokelau
Åland


In [66]:
# Number of non-null population values recorded for Germany.
data.loc[data.Country == 'Germany', 'population'].count()

50

In [78]:
# Drop Tokelau from the selectable countries.
# Guarded so the cell can be re-run: list.remove() raises ValueError once the
# entry is already gone. The list is still modified in place.
if 'Tokelau' in country_list:
    country_list.remove('Tokelau')

In [89]:
# Rebuild the full list of distinct country names from the frame, then drop Tokelau.
# The list is recreated on every run, so remove() always finds the entry as long as
# Tokelau is present in the data.
all_countries = data.Country.unique().tolist()
all_countries.remove('Tokelau')
print(all_countries)

['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Central African Rep.', 'Chad', 'Channel Islands', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo, Dem. Rep.', 'Congo, Rep.', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czech Rep.', 'Denmark', 'Djibouti', 'Dominican Rep.', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France', 'French Guiana', 'French Polynesia', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Greenland', 'Grenada', 'Guadeloupe', 'Guam', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hong Kong, China', 'Hungary

In [86]:
# Display the current contents of all_countries.
all_countries

['Afghanistan',
 'Albania',
 'Algeria',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Central African Rep.',
 'Chad',
 'Channel Islands',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo, Dem. Rep.',
 'Congo, Rep.',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Rep.',
 'Denmark',
 'Djibouti',
 'Dominican Rep.',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'French Guiana',
 'French Polynesia',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Greenland',
 'Grenada',
 'Guadeloupe',
 'Guam',
 'Guatemala',
 'Guinea',
 'G

In [95]:
# Pick several countries at once by position: a NumPy array accepts a list of indices.
ind = [2, 3, 5]
all_countries = np.array(all_countries)
picked = all_countries[ind]
picked.tolist()

['Algeria', 'Angola', 'Argentina']

In [101]:
# Build a glyph name from an index, a metric tag and a glyph-type suffix.
name_parts = [str(2), "plot_pop", "_line"]
name = "".join(name_parts)
name

'2plot_pop_line'

In [109]:
# The difference of the two sets has exactly one element; pop() extracts it.
(set([2,3,5,6]) - set([2,3,5])).pop()

6

In [112]:
# Two independent lists, used below to see what plain assignment does.
a, b = [1, 2, 3], [4, 5, 6]
a

[1, 2, 3]

In [113]:
# b before any reassignment: still its own list.
b

[4, 5, 6]

In [114]:
# Plain assignment binds b to the same list object as a; no copy is made.
b=a

In [115]:
# a is unchanged by the assignment b=a.
a

[1, 2, 3]

In [116]:
# After b=a, b shows the contents of a.
b

[1, 2, 3]

In [120]:
# Start from every distinct country name, then drop the two countries that the
# population-count check printed as having fewer than 50 values.
all_countries = data.Country.unique().tolist()
for sparse_country in ('Tokelau', 'Åland'):
    all_countries.remove(sparse_country)

# As a NumPy array the names can be picked by a list of positions.
all_countries = np.array(all_countries)
initial_positions = [0, 2, 3]
curr_selected_countries = all_countries[initial_positions].tolist()
curr_selected_countries

['Afghanistan', 'Algeria', 'Angola']

In [131]:
# Collect the smallest and largest population of each currently selected country.
min_pop = []
max_pop = []
for country in curr_selected_countries:
    # Filter the frame once per country instead of once per statistic.
    country_population = data.loc[data.Country == country, "population"]
    # Series.min()/max() skip missing values; the builtin min()/max() give
    # order-dependent results when a NaN is present. float() keeps the list
    # elements plain Python floats.
    # Note: for a country with no rows this now appends NaN instead of raising.
    min_pop.append(float(country_population.min()))
    max_pop.append(float(country_population.max()))

In [136]:
# Largest of the per-country maxima collected in max_pop.
max(max_pop)

36983924.0

In [141]:
# Population values for Germany, indexed by the frame's index.
data.loc[data.Country == 'Germany', 'population']

Year
1964    75363329.0
1965    75963700.0
1966    76518660.0
1967    77026811.0
1968    77477619.0
1969    77860668.0
1970    78169287.0
1971    78398250.0
1972    78549958.0
1973    78635758.0
1974    78672415.0
1975    78673559.0
1976    78648816.0
1977    78600404.0
1978    78526576.0
1979    78422458.0
1980    78288577.0
1981    78124452.0
1982    77944929.0
1983    77784306.0
1984    77686865.0
1985    77684875.0
1986    77783439.0
1987    77974722.0
1988    78260173.0
1989    78637382.0
1990    79098094.0
1991    79651903.0
1992    80282985.0
1993    80926118.0
1994    81495194.0
1995    81929441.0
1996    82200333.0
1997    82326019.0
1998    82350467.0
1999    82341545.0
2000    82349027.0
2001    82384256.0
2002    82432026.0
2003    82484111.0
2004    82524343.0
2005    82540739.0
2006    82536138.0
2007    82516297.0
2008    82475271.0
2009    82405365.0
2010    82302465.0
2011    82162512.0
2012    81990837.0
2013    81804228.0
Name: population, dtype: float64

In [150]:
from bokeh.palettes import Spectral10

# Displaying the palette shows a list of ten hex colour strings.
Spectral10

['#5e4fa2',
 '#3288bd',
 '#66c2a5',
 '#abdda4',
 '#e6f598',
 '#fee08b',
 '#fdae61',
 '#f46d43',
 '#d53e4f',
 '#9e0142']

In [184]:
from numpy import random

# Draw one integer from [0, 10).
# A locally seeded generator makes the draw reproducible under Restart & Run All
# and leaves NumPy's global random state untouched.
SEED = 0
rng = random.RandomState(SEED)
x = rng.randint(10)
x

1