In [2]:
import pandas as pd
import matplotlib
%matplotlib inline

In [3]:
# Dataframe has index, as well as country and unemp_rate
unemployment_df = pd.read_csv('./data/unemployment_2016.csv')

In [4]:
type(unemployment_df)

pandas.core.frame.DataFrame

In [5]:
countries = unemployment_df['country']

In [6]:
type(countries)

pandas.core.series.Series

In [7]:
unemployment_rates = unemployment_df['unemp_rate']

In [8]:
avg_unemployment = unemployment_rates.mean()

In [9]:
unemployment_sorted_asc = unemployment_df.sort_values('unemp_rate', ascending=True)
# The default is ascending, how do we make it descending?

In [10]:
lowest_unemployment = unemployment_sorted_asc.head()

### What seven countries have the highest unemployment in Europe?

In [11]:
unemployment_sorted_desc = unemployment_df.sort_values('unemp_rate', ascending=False)
highest_unemployment = unemployment_sorted_desc.head(7)

### What is the average unemployment of the seven countries with the highest unemployment in europe

In [12]:
highest_unemployment_avg = highest_unemployment['unemp_rate'].mean()

### Extra - explore these methods:
.min(), .max(), .filter()

In [109]:
unemployment_rates.min()
unemployment_rates.max()

23.6

# Combining DataFrames
#### In this section, we'll look at .merge(), .melt()

In [111]:
gdp_df = pd.read_csv('./data/gdp_2016.csv')

In [112]:
# Create new data frame
# We will be using this from here on out
eur_data = pd.merge(unemployment_df, gdp_df)

In [17]:
misc_data = pd.read_excel('./data/misc_data.xlsx')

In [113]:
misc_data_file = pd.ExcelFile('./data/misc_data.xlsx')
sheet_names = misc_data_file.sheet_names

In [114]:
income_df = pd.read_excel('./data/misc_data.xlsx', "Income")

In [115]:
# Merge new income data with existing eu_data
# This is not an inplace merge
eur_data = eur_data.merge(income_df)

In [116]:
# Let's get some more data
pop_df = pd.read_excel('./data/misc_data.xlsx', 'Population', skiprows=3)

In [117]:
pop_df_col_names = pop_df.columns
# total_pop = pop_df['']

In [118]:
total_pop = pop_df['total_pop']

In [119]:
eur_data = eur_data.merge(total_pop)

MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

In [120]:
total_pop_with_countries = pop_df[['country', 'total_pop']]

In [121]:
eur_data = pd.merge(eur_data, total_pop_with_countries)

In [123]:
# Uh oh. These are not sorted. Sort them now and rewrite to csv
eur_data_sorted = eur_data.sort_values('country')


In [None]:
# But wait, what's going on with the indexes?
# What is an index anyway?

In [31]:
# So what is an index anyway?
# It's a lot like a python dictionary
countries = {
    15: {'country': 'Italy', 
         'unemp_rate': 11.7, 
         'gdp': 1689824.0, 
         'median_income': 16237, 
         'total_pop': 59433744
        }
}

In [125]:
eur_index = eur_data_sorted.index
type(eur_index)

pandas.core.indexes.numeric.Int64Index

In [None]:
# Index is immutable
eur_index[0] = 12

In [32]:
# For our purposes, let's reset our indexes
eur_data_sorted.reset_index(drop=True, inplace=True)

In [126]:
eur_data_sorted.to_csv('./data/out/eur_data_sorted.csv')

In [127]:
### Use your googling skills! Write this file to json, excel, and html
eur_data_sorted.to_excel('./data/out/eur_data_sorted.xlsx')

In [128]:
# How would you view this json file?
# What similarities does it have with csv and excel? What differences?
eur_data_sorted.to_json('./data/out/eur_data_sorted.json')

In [129]:
# How would you view this json file?
# What similarities does it have with csv and excel? What differences?
eur_data_sorted.to_html('./data/out/eur_data_sorted.htm')

## Working with DataFrames
#### You've created them, now let's use them!

In [100]:
eur_data_final = pd.read_csv('./data/final/eur_data_final.csv', index_col=0)

In [130]:
eur_data_final.head()

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
0,Austria,6.0,356237.6,23071,8401940
1,Belgium,7.8,424660.3,21335,11000638
2,Bulgaria,7.6,48128.6,6742,7364570
3,Croatia,13.1,46639.5,8985,4284889
4,Cyprus,13.0,18490.2,16173,840407


In [131]:
eur_data_final.tail()

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
27,Spain,19.6,1118743.0,15347,46815910
28,Sweden,7.0,463147.5,20955,9482855
29,Switzerland,5.0,605753.7,27692,7954662
30,Turkey,10.9,780224.9,6501,7954662
31,United Kingdom,4.8,2403382.6,17296,63182180


In [132]:
eur_data_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32 entries, 0 to 31
Data columns (total 5 columns):
country          32 non-null object
unemp_rate       32 non-null float64
gdp              32 non-null float64
median_income    32 non-null int64
total_pop        32 non-null int64
dtypes: float64(2), int64(2), object(1)
memory usage: 2.8+ KB


In [133]:
# This is a tuple. Why do you think it's a tuple?
eur_data_final.shape

(32, 5)

In [134]:
eur_data_final.describe()

Unnamed: 0,unemp_rate,gdp,median_income,total_pop
count,32.0,32.0,32.0,32.0
mean,8.3375,521833.3,15972.34375,16376120.0
std,4.393378,778184.9,6640.636617,21988190.0
min,3.0,10344.1,4724.0,315556.0
25%,5.7,47756.32,10190.5,3974524.0
50%,7.3,201276.8,16205.0,7954662.0
75%,9.8,498799.0,21161.25,12414430.0
max,23.6,3159750.0,28663.0,80219700.0


#### Data in pandas is vectorized. What does that mean?
It means you can apply operate on entire series with one command. Notice: this does not happen in place

In [136]:
# Before we were using the mean method on a series, now it's the whole dataframe
eur_data_final.mean()

unemp_rate       8.337500e+00
gdp              5.218333e+05
median_income    1.597234e+04
total_pop        1.637612e+07
dtype: float64

#### What is boolean indexing?
First of all, what is the boolean data type? A data type that represents one of two possible values. True False, On Off, etc.

In [139]:
# What's going on here? 
# We are generating a vector of boolean values, based on the truthiness of the expression 
# as it compares to each value in the column
austria_bool = eur_data_final.country == 'Austria'

In [140]:
# We can then pass in that vector
eur_data_final[austria_bool]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
0,Austria,6.0,356237.6,23071,8401940


In [53]:
# We can also do it this way
greece_bool = eur_data_final['country'] == 'Greece'
eur_data_final[greece_bool]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
11,Greece,23.6,176487.9,9048,10816286


In [156]:
# You can also check by partial match
l_names = eur_data_final.country.str.contains('l')

In [157]:
eur_data_final[l_names]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
1,Belgium,7.8,424660.3,21335,11000638
2,Bulgaria,7.6,48128.6,6742,7364570
8,Finland,8.8,216073.0,19997,5375276
13,Iceland,3.0,18646.1,22193,315556
14,Ireland,8.4,273238.2,18286,4574888
15,Italy,11.7,1689824.0,16237,59433744
19,Malta,4.7,10344.1,17264,417432
20,Netherlands,6.0,708337.0,21189,16655799
22,Poland,6.2,426547.5,10865,38044565
23,Portugal,11.2,186480.5,10805,10562178


In [158]:
# Something is missing!
l_names_insensitive = eur_data_final.country.str.contains('l', case=False)

In [159]:
eur_data_final[l_names_insensitive]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
1,Belgium,7.8,424660.3,21335,11000638
2,Bulgaria,7.6,48128.6,6742,7364570
8,Finland,8.8,216073.0,19997,5375276
13,Iceland,3.0,18646.1,22193,315556
14,Ireland,8.4,273238.2,18286,4574888
15,Italy,11.7,1689824.0,16237,59433744
16,Latvia,9.6,25037.7,9257,2070371
17,Lithuania,7.9,38849.4,9364,3043429
18,Luxembourg,6.3,53303.0,28663,512353
19,Malta,4.7,10344.1,17264,417432


In [174]:
# What if we want to combine multiple queries?
low_unemployment = eur_data_final.unemp_rate < 7

In [175]:
eur_data_final[l_names_insensitive and low_unemployment]

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [171]:
# In set theory, this is intersection.
eur_data_final[l_names_insensitive & low_unemployment]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
13,Iceland,3.0,18646.1,22193,315556
19,Malta,4.7,10344.1,17264,417432
29,Switzerland,5.0,605753.7,27692,7954662


In [172]:
# How about or? In set theory, this is union.
eur_data_final[l_names | low_unemployment]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
1,Belgium,7.8,424660.3,21335,11000638
2,Bulgaria,7.6,48128.6,6742,7364570
5,Czechia,4.0,176370.1,12478,10436560
8,Finland,8.8,216073.0,19997,5375276
10,Germany,4.1,3159750.0,21152,80219695
12,Hungary,5.1,113903.8,8267,9937628
13,Iceland,3.0,18646.1,22193,315556
14,Ireland,8.4,273238.2,18286,4574888
15,Italy,11.7,1689824.0,16237,59433744
19,Malta,4.7,10344.1,17264,417432


In [176]:
eur_data_final[~low_unemployment]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
1,Belgium,7.8,424660.3,21335,11000638
2,Bulgaria,7.6,48128.6,6742,7364570
3,Croatia,13.1,46639.5,8985,4284889
4,Cyprus,13.0,18490.2,16173,840407
8,Finland,8.8,216073.0,19997,5375276
9,France,10.1,2228568.0,20621,64933400
11,Greece,23.6,176487.9,9048,10816286
14,Ireland,8.4,273238.2,18286,4574888
15,Italy,11.7,1689824.0,16237,59433744
16,Latvia,9.6,25037.7,9257,2070371


### What countries have an unemployment rate greater than 9% and a GDP less than 280000?

In [178]:
# Start by making your queries
high_unemployment = eur_data_final.unemp_rate > 10
gdp_query = eur_data_final.gdp > 280000

In [179]:
# Then combine them. Is it an and, or, or not?
eur_data_final[high_unemployment & gdp_query]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
9,France,10.1,2228568.0,20621,64933400
15,Italy,11.7,1689824.0,16237,59433744
27,Spain,19.6,1118743.0,15347,46815910
30,Turkey,10.9,780224.9,6501,7954662


### What countries have a median_income between 10000 and 20000?

In [182]:
# Consider combining queries that are similar
# Also, use convenient notation!
income_query = (eur_data_final.median_income > 10_000) & (eur_data_final.median_income < 20_000)

In [183]:
eur_data_final[income_query]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
4,Cyprus,13.0,18490.2,16173,840407
5,Czechia,4.0,176370.1,12478,10436560
7,Estonia,6.8,21682.6,11867,1294455
8,Finland,8.8,216073.0,19997,5375276
14,Ireland,8.4,273238.2,18286,4574888
15,Italy,11.7,1689824.0,16237,59433744
19,Malta,4.7,10344.1,17264,417432
22,Poland,6.2,426547.5,10865,38044565
23,Portugal,11.2,186480.5,10805,10562178
25,Slovakia,9.7,81226.1,10466,5397036


In [185]:
# Alternative way
income_query_alt = eur_data_final.median_income.between(10_000, 20_000, inclusive=True)

In [186]:
eur_data_final[income_query_alt]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
4,Cyprus,13.0,18490.2,16173,840407
5,Czechia,4.0,176370.1,12478,10436560
7,Estonia,6.8,21682.6,11867,1294455
8,Finland,8.8,216073.0,19997,5375276
14,Ireland,8.4,273238.2,18286,4574888
15,Italy,11.7,1689824.0,16237,59433744
19,Malta,4.7,10344.1,17264,417432
22,Poland,6.2,426547.5,10865,38044565
23,Portugal,11.2,186480.5,10805,10562178
25,Slovakia,9.7,81226.1,10466,5397036


In [246]:
### Now let's look at selection in dataframes, using the existing indexes.

In [187]:
# So what is an index anyway?
# It's a lot like a python dictionary
countries = {
    15: {'country': 'Italy', 
         'unemp_rate': 11.7, 
         'gdp': 1689824.0, 
         'median_income': 16237, 
         'total_pop': 59433744
        }
}

In [188]:
# With vanilla python, how do we get the word 'Italy' from a dictionary?
countries[15]['country']

'Italy'

In [189]:
# Now how do we get the word 'Italy' from a pandas dataframe?
# Let's use loc!
eur_data_final.loc[15, 'country']

'Italy'

In [190]:
# So what is loc? 
# It is short for locate and is a very dynamic method. One of the reasons that I find it hard to understand.
# In the case above, it is used like a grid. We go by row index, then by column name.
eur_data_final.loc[12, 'gdp']

113903.8

In [191]:
# You can also return the whole row.
eur_data_final.loc[12]

country          Hungary
unemp_rate           5.1
gdp               113904
median_income       8267
total_pop        9937628
Name: 12, dtype: object

In [194]:
# Or you can select parts of a row
eur_data_final.loc[8, ['gdp', 'total_pop']]

gdp           216073
total_pop    5375276
Name: 8, dtype: object

In [195]:
eur_data_final.loc[[9,10], ['country','gdp']]

Unnamed: 0,country,gdp
9,France,2228568.0
10,Germany,3159750.0


In [203]:
# But it can changed with special operations
eur_data_country_index = eur_data_final.set_index('country')
eur_data_country_index
# Notice how it's on a lower line and there are no more numbers on the left!

Unnamed: 0_level_0,unemp_rate,gdp,median_income,total_pop
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Austria,6.0,356237.6,23071,8401940
Belgium,7.8,424660.3,21335,11000638
Bulgaria,7.6,48128.6,6742,7364570
Croatia,13.1,46639.5,8985,4284889
Cyprus,13.0,18490.2,16173,840407
Czechia,4.0,176370.1,12478,10436560
Denmark,6.2,282089.9,21355,5560628
Estonia,6.8,21682.6,11867,1294455
Finland,8.8,216073.0,19997,5375276
France,10.1,2228568.0,20621,64933400


In [205]:
eur_data_country_index.loc['Slovenia']

unemp_rate             8.0
gdp                40357.2
median_income      15250.0
total_pop        2050189.0
Name: Slovenia, dtype: float64

### What countries have a higher unemployment rate than Slovenia and have a lowercase 't' in their name?

In [206]:
slovenia_unemployment = eur_data_country_index.loc['Slovenia', 'unemp_rate']
gt_slov = eur_data_final.unemp_rate > slovenia_unemployment
t_names = eur_data_final.country.str.contains('t')
eur_data_final[gt_slov & t_names]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
3,Croatia,13.1,46639.5,8985,4284889
15,Italy,11.7,1689824.0,16237,59433744
16,Latvia,9.6,25037.7,9257,2070371
23,Portugal,11.2,186480.5,10805,10562178
