# Welcome to Part 2: data manipulation

### Let's get more involved with the data!

In [None]:
# Load libraries
import pandas as pd

# Read the CSV file (path is relative to the notebook's working directory)
# into a pandas dataframe called un_data
un_data = pd.read_csv('data/UN.csv')

Questions we might like to answer

1. Does the GDP per capita vary more across Latin America or Asia?
2. How different is fertility in Africa compared to Europe?
3. Which countries are we missing data for? Is there a lot of data missing? What should we do?
4. Do the column names make sense or do we want to rename them to something more readable?

### 1.  Does the GDP per capita vary more across Latin America or Asia?

In [None]:
# Keep only the rows whose 'region' value is 'Latin Amer',
# i.e. the countries in the Latin America region
latin_data = un_data.loc[un_data['region'] == 'Latin Amer']
latin_data.head()

In [None]:
# Same idea for Asia: keep only the rows whose 'region' value is 'Asia'
asia_data = un_data.loc[un_data['region'] == 'Asia']
asia_data.head()

In [None]:
# Compare how spread out the 'ppgdp' column is within each of the two regions
print("Standard deviation of GDP in Latin America", latin_data['ppgdp'].std())
print("Standard deviation of GDP in Asia", asia_data['ppgdp'].std())

### 2. How different is fertility in Africa compared to Europe?

In [None]:
# Keep only the rows whose 'region' value is 'Africa'
africa_data = un_data.loc[un_data['region'] == 'Africa']
africa_data.head()

In [None]:
# Now select the Europe data in the same way
europe_data = un_data.loc[un_data['region'] == 'Europe']
europe_data.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of fertility for each continent.
# sep="\n" puts the label on its own line. With the default separator a space would be
# printed after the label's newline, pushing the first row of the summary out of alignment.
print("Africa:", africa_data.fertility.describe(), sep="\n")
print("\nEurope:", europe_data.fertility.describe(), sep="\n")

In [None]:
# Rather than just compare fertility, we can compare all variables at once between the two continents
# (by default describe() on a dataframe summarises its numeric columns).
# sep="\n" puts each label on its own line so that the table header stays aligned with its columns;
# the default separator would add a space in front of the header row.
print("Africa:", africa_data.describe(), sep="\n")
print("\nEurope:", europe_data.describe(), sep="\n")

### 3. Which countries are we missing data for? Is there a lot of data missing? What should we do?

In [None]:
# Show the rows of the dataframe where the 'region' column holds an NA value
un_data.loc[un_data['region'].isna()]

In [None]:
# Display the first rows of un_data (head() shows five rows by default)
un_data.head()

In [None]:
# What is actually being done here? Take your time. Ask if you are not sure
# If you need a hint, compare against the output of un_data.head()
un_data['number_nas'] = un_data.isna().sum(axis='columns')

In [None]:
# We have added a column called 'number_nas', which contains the number of NAs that appear in each row
un_data.head()

In [None]:
# We find the maximum number of NAs that appear in any single row and give that value the name max_nas
# (.max() is the pandas method that returns the largest value in a column)
max_nas = un_data['number_nas'].max()
print(max_nas)

The next lines are more complicated. Take your time to understand what is going on here.

In [None]:
# Another for loop: the body runs once for every value produced by range()
# What does the range function do?
# Try changing range(5) to range(1, 10, 2). What does this do?
for number in range(5):
    print(number)

In [None]:
# Now a for loop based on our data
# Make sure you understand what is going on here. Ask if you are not sure
for n_missing in range(max_nas + 1):
    # The comparison gives True/False for every row; .sum() counts how many are True.
    # print() already puts a space between its arguments, so the text pieces carry no extra spaces.
    print("There are", (un_data['number_nas'] == n_missing).sum(), "countries with", n_missing, "NaNs.")

# Why are we using range(max_nas + 1)?

In [None]:
# List the 'country' value of every row that contains exactly one NA
print(un_data.loc[un_data['number_nas'] == 1, 'country'])

In [None]:
# And let's list the 'country' value of every row that contains six NAs
print(un_data.loc[un_data['number_nas'] == 6, 'country'])

In [None]:
# Based on these results we probably want to remove all the rows that contain six NAs from our table
# The name un_data is re-pointed at the filtered dataframe. Note the dimensions before and afterwards.
# We have removed 14 rows, as expected
print(un_data.shape)
un_data = un_data.loc[un_data['number_nas'] != 6]  # The '!=' operator means 'not equal to'
print(un_data.shape)

### 4. Do the column names make sense or do we want to rename them to something more readable?

In [None]:
# Let's look at the column names
un_data.columns

# We can see that some of them are not very well named, so maybe we want to rename them

In [None]:
# We can rename columns of the dataframe by mapping each old name to its new name;
# columns that are not mentioned in the mapping keep their current name.
# rename() returns a new dataframe, so we assign the result back to un_data
# (this is preferred over inplace=True, which modifies the dataframe behind the scenes)
un_data = un_data.rename(columns={'fertility': 'fertilityRate', 'ppgdp': 'GDPperperson'})

In [None]:
# Display the first rows again: you can see that the column names have now changed
un_data.head()

In [None]:
# To rename many columns at once we can assign a list holding ALL the column names.
# The names are applied by position, so the list must follow the existing column order.
un_data.columns = [
    'country',
    'region',
    'group',
    'fertilityRate',
    'GDPpp',
    'lifeExpectancy',
    'percentageUrban',
    'infantMortality',
    'numberNas',
]

In [None]:
# Display the first rows to check the column names
un_data.head()

### Saving a dataframe to csv

In [None]:
# We can now save our modified data frame as a new csv file
# NOTE(review): by default to_csv also writes the row index as an extra first column;
# pass index=False if that column is not wanted - confirm what later notebooks expect
un_data.to_csv('data/UN_modified.csv')

### Take a few minutes to try running some lines of code to explore the data further. Don't hesitate to ask your instructor for any further help.