# Welcome to the tutorial!

## This is Part 1.

### Let's start by exploring some data

In [13]:
# This is a comment in Python. Anything written after a '#' will not be executed
# Follow the script, read the comments and play around

In [14]:
# At the start of the script we import modules that we will need later
# pandas is a very useful python data analysis library
# The 'as' allows us to refer to it by a shorter name elsewhere in the code

import pandas as pd

### Reading in the data

In [15]:
# Now we read in the data file
# pd.read_csv loads the CSV file at the given path; we keep the result in un_data
un_data = pd.read_csv('data/UN.csv')

### Exploring the structure of the dataframe

In [16]:
# type() tells us what kind of object un_data is
type(un_data)

pandas.core.frame.DataFrame

In [17]:
# head() shows the first few rows of the dataframe (five by default)
un_data.head()

Unnamed: 0,country,region,group,fertility,ppgdp,lifeExpF,pctUrban,infantMortality
0,Afghanistan,Asia,other,5.968,499.0,49.49,23.0,124.535
1,Albania,Europe,other,1.525,3677.2,80.4,53.0,16.561
2,Algeria,Africa,africa,2.142,4473.0,75.0,67.0,21.458
3,American Samoa,,,,,,,11.293887
4,Angola,Africa,africa,5.135,4321.9,53.17,59.0,96.191


In [18]:
# describe() gives summary statistics for the numeric columns.
# 'count' only includes non-missing values, which is why it can differ between columns
un_data.describe()

Unnamed: 0,fertility,ppgdp,lifeExpF,pctUrban,infantMortality
count,199.0,199.0,199.0,199.0,207.0
mean,2.761383,13011.951759,72.293193,57.929648,29.439531
std,1.339589,18412.443368,10.123784,23.429565,28.748433
min,1.134,114.8,48.11,11.0,1.916
25%,1.7535,1282.95,65.66,39.0,7.019
50%,2.262,4684.5,75.89,59.0,19.007
75%,3.5445,15520.5,79.585,75.0,44.4775
max,6.925,105095.4,87.12,100.0,124.535


In [19]:
# shape is (number of rows, number of columns); it is an attribute, so no brackets
un_data.shape

(213, 8)

In [22]:
# The column names of the dataframe, converted to a plain Python list
list(un_data.columns)

['country',
 'region',
 'group',
 'fertility',
 'ppgdp',
 'lifeExpF',
 'pctUrban',
 'infantMortality']

### Subsetting the dataframe

In [26]:
# Subset parts of the pandas dataframe
# Square brackets with a column name select that single column
gdp = un_data['ppgdp']

In [31]:
# Attribute (dot) access is another way to select a single column
pctUrban = un_data.pctUrban
# A single column is a pandas Series; end the cell with its type so that it is displayed
type(pctUrban)

pandas.core.series.Series

In [32]:
# iloc selects by position: ':' means every row, and 3 picks out one column
fertility = un_data.iloc[:, 3]  # This gets the 4th column because Python indexing starts at 0

In [33]:
# Slices work for rows and columns alike: 0:4 means positions 0, 1, 2 and 3
# (the end of a slice is not included), so this is the first 4 rows and first 4 columns
first_bit = un_data.iloc[0:4, 0:4]
print(first_bit)

          country  region   group  fertility
0     Afghanistan    Asia   other      5.968
1         Albania  Europe   other      1.525
2         Algeria  Africa  africa      2.142
3  American Samoa     NaN     NaN        NaN


In [34]:
# A single integer inside iloc selects one whole row; position 1 is the second row
print("The second row:")
print(un_data.iloc[1])

The second row:
country            Albania
region              Europe
group                other
fertility            1.525
ppgdp               3677.2
lifeExpF              80.4
pctUrban                53
infantMortality     16.561
Name: 1, dtype: object


In [35]:
# iloc[row, column] with two integers returns the single value at that position
print("\nThe 3rd row and 4th column:")
print(un_data.iloc[2, 3])


The 3rd row and 4th column:
2.142


In [36]:
# A list of column names inside the square brackets selects several columns at once
# (hence the double brackets); the columns come back in the order they are listed
reduced = un_data[['country', 'ppgdp', 'fertility']]
print(reduced.head())

          country   ppgdp  fertility
0     Afghanistan   499.0      5.968
1         Albania  3677.2      1.525
2         Algeria  4473.0      2.142
3  American Samoa     NaN        NaN
4          Angola  4321.9      5.135


In [56]:
# Build a True/False mask with one entry per row: True where the country is Spain
is_spain = un_data.country == 'Spain'
# Indexing the dataframe with the mask keeps only the rows where it is True
print(un_data[is_spain])
# The mask itself, shown as the cell's output
is_spain

    country  region  group  fertility    ppgdp  lifeExpF  pctUrban  \
176   Spain  Europe  other      1.504  30542.8     84.76      78.0   

     infantMortality  
176            3.573  


0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
183    False
184    False
185    False
186    False
187    False
188    False
189    False
190    False
191    False
192    False
193    False
194    False
195    False
196    False
197    False
198    False
199    False
200    False
201    False
202    False
203    False
204    False
205    False
206    False
207    False
208    False
209    False
210    False
211    False
212    False
Name: country, Length: 213, dtype: bool

### Exploring the data further

In [38]:
# isna() marks each missing value as True; sum() then counts them (True counts as 1)
gdp.isna().sum()

14

In [44]:
# unique() returns each distinct value in the column once.
# Note that the missing value (nan) is included in the result
unique_regions = un_data.region.unique()
print(unique_regions)

['Asia' 'Europe' 'Africa' nan 'Caribbean' 'Latin Amer' 'Oceania'
 'North America' 'NorthAtlantic']


In [43]:
# Drop the missing values before taking the unique regions.
# Computing this straight from un_data (instead of filtering unique_regions in place)
# means the cell gives the same result however many times, or in whatever order, it is run
unique_regions = un_data.region.dropna().unique()
print(unique_regions)

['Asia' 'Europe' 'Africa' 'Caribbean' 'Latin Amer' 'Oceania'
 'North America' 'NorthAtlantic']


In [45]:
# Loop over the regions, printing one per line.
# The indented line is the body of the loop: it runs once for each region
for region in unique_regions:
    print(region)

Asia
Europe
Africa
Caribbean
Latin Amer
Oceania
North America
NorthAtlantic


In [52]:
# Is this different from the loop in the cell above? Can you see from the error why this is?
# (The print line is deliberately left unindented here: run the cell and read the error message.)
for region in unique_regions:
print(region)

Asia
Europe
Africa
nan
Caribbean
Latin Amer
Oceania
North America
NorthAtlantic
