# Gapminder data
## The data in this lesson was obtained from the site gapminder.org. The variables included are:

- Aged 15+ Employment Rate (%)
- Life Expectancy (years)
- GDP/capita (US$, inflation adjusted)
- Primary school completion (% of boys)
- Primary school completion (% of girls)

### 1. Importing libraries

In [1]:
# importing numpy and pandas

import numpy as np
import pandas as pd
print('All imports completed!')

All imports completed!


### 2. Reading files

In [2]:
# Load each Gapminder CSV into its own pandas DataFrame.
# The files are read in the order they are listed, one name per file.
(
    employment_above_15,
    life_expectancy,
    gdp_per_capita,
    female_completion_rate,
    male_completion_rate,
) = [
    pd.read_csv(csv_file)
    for csv_file in (
        'employment_above_15.csv',
        'life_expectancy.csv',
        'gdp_per_capita.csv',
        'female_completion_rate.csv',
        'male_completion_rate.csv',
    )
]
print('All data is loaded!')

All data is loaded!


### 3. Check out some values!

In [3]:
# print the first 20 countries in the employment file:

print(employment_above_15['Country'][:20])

0                Afghanistan
1                    Albania
2                    Algeria
3                     Angola
4                  Argentina
5                    Armenia
6                  Australia
7                    Austria
8                 Azerbaijan
9                    Bahamas
10                   Bahrain
11                Bangladesh
12                  Barbados
13                   Belarus
14                   Belgium
15                    Belize
16                     Benin
17                    Bhutan
18                   Bolivia
19    Bosnia and Herzegovina
Name: Country, dtype: object


In [4]:
# print the first 20 employment values for the year 2007 in the employment file:

print(employment_above_15['2007'][:20])

0     55.700001
1     51.400002
2     50.500000
3     75.699997
4     58.400002
5     40.099998
6     61.500000
7     57.099998
8     60.900002
9     66.599998
10    60.400002
11    68.099998
12    66.900002
13    53.400002
14    48.599998
15    56.799999
16    71.599998
17    58.400002
18    70.400002
19    41.200001
Name: 2007, dtype: float64


### 4. Operating with datasets.

#### 4.1. Employment data.

In [5]:
def print_max_country_employment(data, size, year):
    '''
    Print the country with the highest employment among the first
    `size` rows of `data` for the given year.

    data -- DataFrame with a 'Country' column and one column per year
    size -- number of leading rows to consider
    year -- label of the year column to inspect (e.g. '2007')

    Rows are addressed purely by position (.iloc), so the lookup is
    correct whatever the DataFrame's index labels are (for example after
    filtering, sorting or re-indexing).
    '''
    employment = data[year].iloc[:size]
    countries = data['Country'].iloc[:size]
    # argmax yields a 0-based position inside the slice, not an index
    # label, so it must be resolved with .iloc rather than plain [].
    i = employment.values.argmax()
    value = employment.iloc[i]
    country = countries.iloc[i]
    print('Country with maximum employment({}) is {}'.format(value, country))

In [6]:
# for the first 20 countries, print the maximum 2007 employment value and its country.
print_max_country_employment(employment_above_15, 20, '2007')

Country with maximum employment(75.6999969482) is Angola


#### 4.2. Completion data.

In [7]:
def overall_completion_rate(female_completion, male_completion):
    '''
    Return a NumPy array with the overall school completion rate of each
    country, computed as the element-wise average of the female and male
    completion rates.

    Both arguments are array-likes listing the countries in the same
    order; each one is converted to a NumPy array before averaging.
    '''
    girls, boys = (
        np.array(rates) for rates in (female_completion, male_completion)
    )
    return (girls + boys) / 2

In [8]:
print(overall_completion_rate(female_completion_rate['2007'][:20], male_completion_rate['2007'][:20]))

[       nan        nan        nan        nan  96.416025        nan
        nan        nan        nan        nan 102.644275 101.41129
  93.316285        nan 103.455575  98.148215 102.35113         nan
        nan  91.77855 ]


### 5. Standardizing data.

In [10]:
# Employment values for 2007, hard-coded as a NumPy array. The 20 numbers
# are the first 20 rows of employment_above_15['2007'] printed earlier in
# this notebook, in the same order.
employment_for_standardizing = np.array([
    55.70000076,  51.40000153,  50.5       ,  75.69999695,
    58.40000153,  40.09999847,  61.5       ,  57.09999847,
    60.90000153,  66.59999847,  60.40000153,  68.09999847,
    66.90000153,  53.40000153,  48.59999847,  56.79999924,
    71.59999847,  58.40000153,  70.40000153,  41.20000076
])

# Names of the first 20 countries in the employment data, in the same
# order as the values above, so the two arrays can be indexed together.
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina'
])
# NOTE(review): 'United States' is not one of the 20 countries listed above,
# and this name is not referenced in the visible part of the notebook --
# confirm whether it is still needed.
country_name = 'United States'

def standardize_data(values):
    '''
    Return the given values expressed as z-scores.

    The mean of `values` (values.mean()) is subtracted from every element
    and the difference is divided by the standard deviation of `values`
    (values.std()). Each result is therefore the number of standard
    deviations the original value lies from the mean: positive when the
    value is above the mean, negative when it is below.
    '''
    centered = values - values.mean()
    return centered / values.std()
print(standardize_data(employment_for_standardizing))

[-0.31965231 -0.780123   -0.87650077  1.82207181 -0.03051941 -1.99019768
  0.30144772 -0.16973184  0.23719615  0.84758731  0.18365304  1.00821665
  0.87971351 -0.56595055 -1.07996476 -0.20185762  1.38301845 -0.03051941
  1.2545153  -1.87240259]


In [11]:
# Now print all countries whose standardized employment is positive (above the mean):
print(countries[standardize_data(employment_for_standardizing) > 0])

['Angola' 'Australia' 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh'
 'Barbados' 'Benin' 'Bolivia']


### Pandas series

In [12]:
# describe the data:
employment_above_15.describe()

Unnamed: 0,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,57.732022,57.622472,57.396067,57.536517,57.607865,57.531461,57.634832,57.699438,57.703371,57.804495,57.822472,57.747191,57.747753,57.924157,58.108427,58.442135,58.635955
std,11.083858,11.082125,11.154044,11.077853,10.933246,11.108913,11.09066,10.996967,10.974119,10.893329,10.958378,10.901107,10.901185,10.827999,10.807143,10.618519,10.519454
min,29.299999,29.799999,30.4,26.799999,32.599998,29.5,31.0,33.400002,34.400002,33.599998,29.4,26.6,29.4,29.700001,31.4,31.6,32.0
25%,51.025,50.8,50.349999,50.649999,50.149999,50.55,50.400002,50.450001,50.5,50.224998,50.825,50.925001,51.124999,50.750001,50.925001,51.124999,51.225
50%,56.950001,57.299999,56.75,56.700001,56.65,56.15,56.35,56.85,56.9,57.15,57.35,57.0,57.450001,57.6,58.049999,58.400002,58.699999
75%,63.425,63.249999,63.775,64.174997,64.275,64.250002,64.699997,64.674997,64.674997,64.775002,64.775002,64.599998,64.275002,64.424999,64.750002,64.924999,64.975
max,87.5,87.199997,87.0,87.699997,87.300003,86.800003,86.099998,85.900002,85.699997,85.099998,84.300003,84.0,83.599998,83.400002,84.300003,84.099998,83.199997


In [13]:
employment_above_15['1991'].mean()

57.73202246762136