# Welcome to Part 2: data manipulation

### Let's get more involved with the data!

In [None]:
# Load libraries
import pandas as pd

# Load the UN dataset from the csv file in the local data folder
un_csv_path = 'data/UN.csv'
un_data = pd.read_csv(un_csv_path)

Questions we might like to answer:

* Is fertility different in Africa compared to Europe?
* Does the GDP per capita vary more across Latin America or Asia?
* Which countries are we missing data for? Is there a lot of data missing? What should we do?
* Do the column names make sense, or do we want to rename them to something more readable?

### Is fertility different in Africa compared to Europe?

In [None]:
# Boolean mask: True for every row whose region is 'Africa'
is_africa = un_data['region'].eq('Africa')
africa_data = un_data.loc[is_africa]

# Summary statistics of the fertility column for the African rows
africa_data['fertility'].describe()

In [None]:
# Boolean mask: True for every row whose region is 'Europe'
is_europe = un_data['region'].eq('Europe')
europe_data = un_data.loc[is_europe]

# Summary statistics of the fertility column for the European rows
europe_data['fertility'].describe()

### Does the GDP per capita vary more across Latin America or Asia?

In [None]:
# Keep only the rows whose region equals 'Latin Amer'.
# The comparison is an exact string match, so the label must be spelled
# exactly as it appears in the region column.
latin_data = un_data.loc[un_data['region'].eq('Latin Amer')]

In [None]:
# Keep only the rows whose region equals 'Asia'
asia_data = un_data.loc[un_data['region'].eq('Asia')]

In [None]:
# Compare the spread of the ppgdp column between the two regions.
# Series.std() returns the sample standard deviation and skips missing values.
latin_gdp_std = latin_data['ppgdp'].std()
print("Standard deviation of GDP in Latin America", latin_gdp_std)
asia_gdp_std = asia_data['ppgdp'].std()
print("Standard deviation of GDP in Asia", asia_gdp_std)

### Missing data

In [None]:
# What is actually being done here? Take your time. Ask if you are not sure.
# Take a look at un_data.head() afterwards to see if you are right.
# Hint: isnull() flags each missing cell as True, and summing along the
# columns adds those flags up separately for every row.
missing_mask = un_data.isnull()
un_data['number_nas'] = missing_mask.sum(axis='columns')

In [None]:
# Largest number of missing values found in any single row.
# Series.max() is the vectorised pandas reduction (the builtin max() would
# loop over the values one by one in Python); int() keeps max_nas a plain
# Python integer, ready to be used with range() further down.
max_nas = int(un_data['number_nas'].max())

The next few cells are more complicated. Take your time to understand what is going on here.

In [None]:
# What does the range function do?
# Try changing range(5) to range(1, 10, 2)
# Hint: the three-argument form is range(start, stop, step).
for i in range(5):
    print(i)

In [None]:
# Why are we using range(max_nas + 1)?
# For every possible number of missing values (0 up to max_nas), report how
# many rows of un_data have exactly that many.
for n_missing in range(max_nas + 1):
    # The comparison gives a boolean Series; .sum() counts its True entries
    # using the vectorised pandas reduction instead of a Python-level loop.
    n_countries = (un_data['number_nas'] == n_missing).sum()
    print("There are ", n_countries, " countries with ", n_missing, "NaNs.")

In [None]:
# Country names of the rows that have exactly one missing value
has_one_nan = un_data['number_nas'] == 1
print(un_data.loc[has_one_nan, 'country'])

In [None]:
# Country names of the rows that have exactly six missing values
has_six_nans = un_data['number_nas'] == 6
print(un_data.loc[has_six_nans, 'country'])

### Rename columns

In [None]:
# Show the current column names before renaming anything
un_data.columns

In [None]:
# Rename two columns by name. rename() returns a new DataFrame, which is
# assigned back to un_data rather than modifying the frame with inplace=True.
# Names that are not present are ignored, so the cell can safely be re-run.
un_data = un_data.rename(columns={'fertility': 'fertilityRate', 'ppgdp': 'GDPperperson'})

In [None]:
# Preview the first rows to check the new column names
un_data.head()

In [None]:
# Alternative way to rename: replace every column name in one go.
# The names are matched to the columns by position, so the list needs exactly
# one entry per existing column, in the existing column order.
new_column_names = [
    'country',
    'region',
    'group',
    'fertilityRate',
    'GDPpp',
    'lifeExpectancy',
    'percentageUrban',
    'infantMortality',
    'numberNas',
]
un_data.columns = new_column_names

In [None]:
# Preview the first rows again to check the full set of new column names
un_data.head()

### Saving a dataframe to csv

In [None]:
# We can now save our modified data frame as a new csv file.
# index=True is the to_csv default, spelled out here: the row index is written
# as an extra, unnamed first column (pass index=False to leave it out).
un_data.to_csv('data/UN_modified.csv', index=True)

### What other questions do you want to answer? Have a go now!