# Welcome to Part 2: data manipulation

### Let's get more involved with the data!

In [1]:
# Load libraries
import pandas as pd

# Read the UN dataset from the csv file into a DataFrame
UN_DATA_PATH = 'data/UN.csv'
un_data = pd.read_csv(UN_DATA_PATH)

Questions we might like to answer:

* Is fertility different in Africa compared to Europe?
* Does the GDP per capita vary more across Latin America or Asia?
* Which countries are we missing data for? Is there a lot of data missing? What should we do?
* Do the column names make sense or do we want to rename them to something more readable?

### Is fertility different in Africa compared to Europe?

In [26]:
# Build a True/False mask marking the African rows, keep only those rows,
# then summarise their fertility values
is_africa = un_data['region'] == 'Africa'
africa_data = un_data.loc[is_africa]
africa_data['fertility'].describe()

count    53.000000
mean      4.236170
std       1.303722
min       1.590000
25%       3.174000
50%       4.423000
75%       5.078000
max       6.925000
Name: fertility, dtype: float64

In [27]:
# Same steps for Europe: select the European rows, then summarise fertility
europe_data = un_data.loc[un_data['region'] == 'Europe']
europe_data['fertility'].describe()

count    39.000000
mean      1.590026
std       0.231152
min       1.134000
25%       1.453500
50%       1.506000
75%       1.748000
max       2.098000
Name: fertility, dtype: float64

### Does the GDP per capita vary more across Latin America or Asia?

In [28]:
# Rows whose region is 'Latin Amer' (the label must match the data exactly)
is_latin_america = un_data['region'] == 'Latin Amer'
latin_data = un_data.loc[is_latin_america]

In [29]:
# Rows whose region is 'Asia'
is_asia = un_data['region'] == 'Asia'
asia_data = un_data.loc[is_asia]

In [30]:
# Compare how spread out the 'ppgdp' values are in each region
latin_gdp_std = latin_data['ppgdp'].std()
asia_gdp_std = asia_data['ppgdp'].std()

print("Standard deviation of GDP in Latin America", latin_gdp_std)
print("Standard deviation of GDP in Asia", asia_gdp_std)

Standard deviation of GDP in Latin America 3775.107734883031
Standard deviation of GDP in Asia 16741.768226195738


### Missing data

In [31]:
# Exercise: work out what this line does before you run it. Take your time, and ask if you are not sure.
# Hint: isna() gives True for every missing cell, and the sum runs across the columns of each row.
# Afterwards, look at un_data.head() to check whether you were right.
un_data['number_nas'] = un_data.isna().sum(axis='columns')

In [32]:
# Largest number of missing values found in any single row
max_nas = un_data['number_nas'].max()

The next lines are more complicated. Take your time to understand what is going on here.

In [33]:
# What does the range function do? Run the cell and look at what gets printed.
# Then try changing range(5) to range(1, 10, 2)
for number in range(5):
    print(number)

0
1
2
3
4


In [34]:
# Why are we using range(max_nas + 1)? (Think about where range stops.)
nas_per_country = un_data['number_nas']
for n_missing in range(max_nas + 1):
    # Comparing gives True/False per row; summing counts the True values
    n_countries = (nas_per_country == n_missing).sum()
    print("There are ", n_countries, " countries with ", n_missing, "NaNs.")

There are  193  countries with  0 NaNs.
There are  6  countries with  1 NaNs.
There are  0  countries with  2 NaNs.
There are  0  countries with  3 NaNs.
There are  0  countries with  4 NaNs.
There are  0  countries with  5 NaNs.
There are  14  countries with  6 NaNs.


In [35]:
# Names of the countries that have exactly one missing value
print(un_data.loc[un_data['number_nas'] == 1, 'country'])

5            Anguilla
20            Bermuda
34     Cayman Islands
53           Dominica
74          Greenland
168        Seychelles
Name: country, dtype: object


In [36]:
# Names of the countries that have six missing values
has_six_nas = un_data['number_nas'] == 6
print(un_data.loc[has_six_nas, 'country'])

3                    American Samoa
37                  Channel Islands
66                    French Guiana
76                       Guadeloupe
77                             Guam
118                      Martinique
121                         Mayotte
140                            Niue
142        Northern Mariana Islands
158                         Reunion
190                         Tokelau
202    United States Virgin Islands
208       Wallis and Futuna Islands
209                  Western Sahara
Name: country, dtype: object


### Rename columns

In [37]:
# List the current column names so we can decide which ones to rename
un_data.columns

Index(['country', 'region', 'group', 'fertility', 'ppgdp', 'lifeExpF',
       'pctUrban', 'infantMortality', 'number_nas'],
      dtype='object')

In [38]:
# Give two columns more descriptive names. rename() only changes the columns listed
# in the mapping and leaves every other column as it is.
# The renamed frame is assigned back to un_data rather than using inplace=True,
# which pandas discourages.
un_data = un_data.rename(columns={'fertility': 'fertilityRate', 'ppgdp': 'GDPperperson'})

In [39]:
# Show the first few rows to check the column names
un_data.head()

Unnamed: 0,country,region,group,fertilityRate,GDPperperson,lifeExpF,pctUrban,infantMortality,number_nas
0,Afghanistan,Asia,other,5.968,499.0,49.49,23.0,124.535,0
1,Albania,Europe,other,1.525,3677.2,80.4,53.0,16.561,0
2,Algeria,Africa,africa,2.142,4473.0,75.0,67.0,21.458,0
3,American Samoa,,,,,,,11.293887,6
4,Angola,Africa,africa,5.135,4321.9,53.17,59.0,96.191,0


In [40]:
# Another way to rename: replace every column name in one go.
# The list is matched to the columns by position, so it needs one name
# per existing column, in the same order as the columns appear.
new_column_names = [
    'country',
    'region',
    'group',
    'fertilityRate',
    'GDPpp',
    'lifeExpectancy',
    'percentageUrban',
    'infantMortality',
    'numberNas',
]
un_data.columns = new_column_names

In [41]:
# Show the first few rows again to confirm the new column names
un_data.head()

Unnamed: 0,country,region,group,fertilityRate,GDPpp,lifeExpectancy,percentageUrban,infantMortality,numberNas
0,Afghanistan,Asia,other,5.968,499.0,49.49,23.0,124.535,0
1,Albania,Europe,other,1.525,3677.2,80.4,53.0,16.561,0
2,Algeria,Africa,africa,2.142,4473.0,75.0,67.0,21.458,0
3,American Samoa,,,,,,,11.293887,6
4,Angola,Africa,africa,5.135,4321.9,53.17,59.0,96.191,0


### Saving a dataframe to csv

In [None]:
# We can now save our modified data frame as a new csv file.
# index=False stops pandas from writing the row numbers as an extra unnamed first
# column, so the file holds only the data columns and reads back in cleanly.
un_data.to_csv('data/UN_modified.csv', index=False)

### What other questions do you want to answer? Have a go now