In [96]:
import pandas as pd
import matplotlib
%matplotlib inline

In [97]:
# Dataframe has index, as well as country and unemp_rate
unemployment_df = pd.read_csv('./data/unemployment_2016.csv')

In [98]:
type(unemployment_df)

pandas.core.frame.DataFrame

In [225]:
countries = unemployment_df['country']

In [226]:
type(countries)

pandas.core.series.Series

In [227]:
unemployment_rates = unemployment_df['unemp_rate']

In [228]:
avg_unemployment = unemployment_rates.mean()

In [229]:
# Order the rows from the lowest to the highest unemployment rate.
# Ascending is the default direction - how would we make it descending?
unemployment_sorted_asc = unemployment_df.sort_values(by='unemp_rate')

In [230]:
lowest_unemployment = unemployment_sorted_asc.head()

### What seven countries have the highest unemployment in Europe?

In [231]:
# Highest rates first, then keep the first seven rows of that ordering.
unemployment_sorted_desc = unemployment_df.sort_values(by='unemp_rate', ascending=False)
highest_unemployment = unemployment_sorted_desc.iloc[:7]

### What is the average unemployment rate of the seven countries with the highest unemployment in Europe?

In [232]:
# Mean of the unemp_rate column over the rows selected above.
highest_unemployment_avg = highest_unemployment.unemp_rate.mean()

### Extra - explore these methods:
.min(), .max(), .describe(), .unique(), .filter()

# Combining DataFrames
#### In this section, we'll look at .merge(), .melt()

In [13]:
gdp_df = pd.read_csv('./data/gdp_2016.csv')

In [14]:
# Join the unemployment and GDP frames on the column(s) they share.
# The combined frame, eu_data, is what we build on from here.
eu_data = unemployment_df.merge(gdp_df)

In [233]:
misc_data = pd.read_excel('./data/misc_data.xlsx')

In [234]:
# Open the workbook only long enough to list its sheets; the context manager
# closes the underlying file handle when the block ends.
with pd.ExcelFile('./data/misc_data.xlsx') as misc_data_file:
    sheet_names = misc_data_file.sheet_names

In [19]:
# Name the sheet by keyword: newer pandas deprecates passing read_excel
# arguments other than the path positionally.
income_df = pd.read_excel('./data/misc_data.xlsx', sheet_name='Income')

In [235]:
# Add the income data to the existing eu_data.
# merge returns a new frame rather than changing eu_data, so assign it back.
eu_data = pd.merge(eu_data, income_df)

In [236]:
# Let's get some more data
# The sheet is named by keyword (newer pandas deprecates passing it
# positionally); skiprows=3 skips the first three rows of the sheet.
pop_df = pd.read_excel('./data/misc_data.xlsx', sheet_name='Population', skiprows=3)

In [237]:
# Collect the column labels of pop_df so we can see which label holds the
# population figures before selecting that column.
pop_df_col_names = pop_df.columns

In [239]:
total_pop = pop_df['total_pop']

In [240]:
# A bare Series of populations carries no 'country' column to join on, so the
# merge needs the key column alongside the values.
eu_data = eu_data.merge(pop_df[['country', 'total_pop']])

In [26]:
# Keep the country column next to the population values so the result can be
# joined to other frames.
total_pop_with_countries = pop_df.loc[:, ['country', 'total_pop']]
total_pop_with_countries

Unnamed: 0,country,total_pop
0,Belgium,11000638
1,Bulgaria,7364570
2,Estonia,1294455
3,Ireland,4574888
4,Greece,10816286
5,Spain,46815910
6,France,64933400
7,Croatia,4284889
8,Italy,59433744
9,Czechia,10436560


In [42]:
# Join the population figures onto eu_data using the columns both frames share.
eu_data = eu_data.merge(total_pop_with_countries)
eu_data

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
0,Austria,6.0,356237.6,23071,8401940
1,Belgium,7.8,424660.3,21335,11000638
2,Bulgaria,7.6,48128.6,6742,7364570
3,Switzerland,5.0,605753.7,27692,7954662
4,Cyprus,13.0,18490.2,16173,840407
5,Czechia,4.0,176370.1,12478,10436560
6,Germany,4.1,3159750.0,21152,80219695
7,Denmark,6.2,282089.9,21355,5560628
8,Estonia,6.8,21682.6,11867,1294455
9,Greece,23.6,176487.9,9048,10816286


In [45]:
# Careful - it will overwrite!
eu_data.to_csv('./data/out/eu_data.csv')
# Look at your files. SO cool!

In [49]:
# Uh oh - the rows are not in order.
# Put them in alphabetical order by country before writing the csv again.
eu_data_sorted = eu_data.sort_values(by='country', ascending=True)
eu_data_sorted

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
0,Austria,6.0,356237.6,23071,8401940
1,Belgium,7.8,424660.3,21335,11000638
2,Bulgaria,7.6,48128.6,6742,7364570
13,Croatia,13.1,46639.5,8985,4284889
4,Cyprus,13.0,18490.2,16173,840407
5,Czechia,4.0,176370.1,12478,10436560
7,Denmark,6.2,282089.9,21355,5560628
8,Estonia,6.8,21682.6,11867,1294455
11,Finland,8.8,216073.0,19997,5375276
12,France,10.1,2228568.0,20621,64933400


In [115]:
# Renumber the rows 0..n-1 after sorting and discard the old labels.
# Rebinding the name (instead of inplace=True) leaves the previously displayed
# frame untouched.
eu_data_sorted = eu_data_sorted.reset_index(drop=True)

In [116]:
eu_data_sorted.to_csv('./data/out/eu_data_sorted.csv')

In [117]:
### Use your googling skills! Write this file to json, excel, and html
eu_data_sorted.to_excel('./data/out/eu_data_sorted.xlsx')

In [118]:
# How would you view this json file?
# What similarities does it have with csv and excel? What differences?
eu_data_sorted.to_json('./data/out/eu_data_sorted.json')

In [119]:
# How would you view this html file?
# What similarities does it have with csv and excel? What differences?
# Note: the file is written with a '.htm' extension.
eu_data_sorted.to_html('./data/out/eu_data_sorted.htm')

## Working with DataFrames
#### You've created them, now let's use them!

In [130]:
eu_data_final = pd.read_csv('./data/final/eu_data_final.csv', index_col=0)

In [166]:
eu_data_final.head()

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
0,Austria,6.0,356237.6,23071,8401940
1,Belgium,7.8,424660.3,21335,11000638
2,Bulgaria,7.6,48128.6,6742,7364570
3,Croatia,13.1,46639.5,8985,4284889
4,Cyprus,13.0,18490.2,16173,840407


In [167]:
eu_data_final.tail()

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
27,Spain,19.6,1118743.0,15347,46815910
28,Sweden,7.0,463147.5,20955,9482855
29,Switzerland,5.0,605753.7,27692,7954662
30,Turkey,10.9,780224.9,6501,7954662
31,United Kingdom,4.8,2403382.6,17296,63182180


In [168]:
eu_data_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32 entries, 0 to 31
Data columns (total 5 columns):
country          32 non-null object
unemp_rate       32 non-null float64
gdp              32 non-null float64
median_income    32 non-null int64
total_pop        32 non-null int64
dtypes: float64(2), int64(2), object(1)
memory usage: 1.5+ KB


In [172]:
eu_data_final.shape

(32, 5)

In [173]:
eu_data_final.describe()

Unnamed: 0,unemp_rate,gdp,median_income,total_pop
count,32.0,32.0,32.0,32.0
mean,8.3375,521833.3,15972.34375,16376120.0
std,4.393378,778184.9,6640.636617,21988190.0
min,3.0,10344.1,4724.0,315556.0
25%,5.7,47756.32,10190.5,3974524.0
50%,7.3,201276.8,16205.0,7954662.0
75%,9.8,498799.0,21161.25,12414430.0
max,23.6,3159750.0,28663.0,80219700.0


#### Data in pandas is vectorized. What does that mean?
It means you can apply an operation to an entire Series with one command. Note: this does not happen in place.

In [174]:
# Restrict the reduction to numeric columns: 'country' holds strings, and
# recent pandas raises on it instead of silently skipping the column.
eu_data_final.mean(numeric_only=True)

unemp_rate       8.337500e+00
gdp              5.218333e+05
median_income    1.597234e+04
total_pop        1.637612e+07
dtype: float64

#### What is boolean indexing?
First of all, what is the boolean data type? It is a data type that represents one of two possible values: True/False, on/off, etc.

In [178]:
# One True/False per row: True only where the country is exactly 'Austria'.
austria_bool = eu_data_final['country'].eq('Austria')
austria_bool

0      True
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
Name: country, dtype: bool

In [132]:
eu_data_final[austria_bool]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
0,Austria,6.0,356237.6,23071,8401940


In [184]:
# Build the mask from the same frame it will filter (eu_data_final) so each
# True/False lines up with the row it describes. Matching is case-sensitive,
# so only a lowercase 'l' counts here.
l_names = eu_data_final.country.str.contains('l')
l_names

0     False
1      True
2      True
3     False
4     False
5     False
6     False
7     False
8      True
9     False
10    False
11    False
12    False
13     True
14     True
15     True
16    False
17    False
18    False
19     True
20     True
21    False
22     True
23     True
24    False
25     True
26     True
27    False
28    False
29     True
30    False
31    False
Name: country, dtype: bool

In [185]:
eu_data_final[l_names]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
1,Belgium,7.8,424660.3,21335,11000638
2,Bulgaria,7.6,48128.6,6742,7364570
8,Finland,8.8,216073.0,19997,5375276
13,Iceland,3.0,18646.1,22193,315556
14,Ireland,8.4,273238.2,18286,4574888
15,Italy,11.7,1689824.0,16237,59433744
19,Malta,4.7,10344.1,17264,417432
20,Netherlands,6.0,708337.0,21189,16655799
22,Poland,6.2,426547.5,10865,38044565
23,Portugal,11.2,186480.5,10805,10562178


In [186]:
# Build the mask from the same frame it will filter (eu_data_final) so each
# True/False lines up with the row it describes. case=False matches both
# 'l' and 'L'.
l_names_insensitive = eu_data_final.country.str.contains('l', case=False)
l_names_insensitive

0     False
1      True
2      True
3     False
4     False
5     False
6     False
7     False
8      True
9     False
10    False
11    False
12    False
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20     True
21    False
22     True
23     True
24    False
25     True
26     True
27    False
28    False
29     True
30    False
31    False
Name: country, dtype: bool

In [187]:
eu_data_final[l_names_insensitive]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
1,Belgium,7.8,424660.3,21335,11000638
2,Bulgaria,7.6,48128.6,6742,7364570
8,Finland,8.8,216073.0,19997,5375276
13,Iceland,3.0,18646.1,22193,315556
14,Ireland,8.4,273238.2,18286,4574888
15,Italy,11.7,1689824.0,16237,59433744
16,Latvia,9.6,25037.7,9257,2070371
17,Lithuania,7.9,38849.4,9364,3043429
18,Luxembourg,6.3,53303.0,28663,512353
19,Malta,4.7,10344.1,17264,417432


In [188]:
# True for rows whose unemployment rate is strictly below 7.
low_unemployment = eu_data_final['unemp_rate'].lt(7)

In [190]:
# Python's `and` asks each Series for a single truth value, which pandas
# refuses to guess. The error is the point of this cell, so catch and show it
# instead of halting a Restart & Run All. Use `&` for an element-wise AND.
try:
    eu_data_final[l_names_insensitive and low_unemployment]
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [191]:
eu_data_final[l_names_insensitive & low_unemployment]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
13,Iceland,3.0,18646.1,22193,315556
18,Luxembourg,6.3,53303.0,28663,512353
19,Malta,4.7,10344.1,17264,417432
20,Netherlands,6.0,708337.0,21189,16655799
22,Poland,6.2,426547.5,10865,38044565
29,Switzerland,5.0,605753.7,27692,7954662


In [138]:
eu_data_final[l_names | low_unemployment]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
0,Austria,6.0,356237.6,23071,8401940
1,Belgium,7.8,424660.3,21335,11000638
2,Bulgaria,7.6,48128.6,6742,7364570
5,Czechia,4.0,176370.1,12478,10436560
6,Denmark,6.2,282089.9,21355,5560628
7,Estonia,6.8,21682.6,11867,1294455
8,Finland,8.8,216073.0,19997,5375276
10,Germany,4.1,3159750.0,21152,80219695
12,Hungary,5.1,113903.8,8267,9937628
13,Iceland,3.0,18646.1,22193,315556


In [192]:
eu_data_final[~low_unemployment]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
1,Belgium,7.8,424660.3,21335,11000638
2,Bulgaria,7.6,48128.6,6742,7364570
3,Croatia,13.1,46639.5,8985,4284889
4,Cyprus,13.0,18490.2,16173,840407
8,Finland,8.8,216073.0,19997,5375276
9,France,10.1,2228568.0,20621,64933400
11,Greece,23.6,176487.9,9048,10816286
14,Ireland,8.4,273238.2,18286,4574888
15,Italy,11.7,1689824.0,16237,59433744
16,Latvia,9.6,25037.7,9257,2070371


### What countries have an unemployment rate greater than 10% and a GDP greater than 280000?

In [143]:
# Two independent row masks: unemployment rate above 10, and GDP above 280000.
high_unemployment = eu_data_final['unemp_rate'].gt(10)
x_gdp = eu_data_final['gdp'].gt(280000)

In [144]:
eu_data_final[high_unemployment & x_gdp]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
9,France,10.1,2228568.0,20621,64933400
15,Italy,11.7,1689824.0,16237,59433744
27,Spain,19.6,1118743.0,15347,46815910
30,Turkey,10.9,780224.9,6501,7954662


### What countries have a median_income between 10000 and 20000?

In [165]:
# Use convenient notation!
# Underscores in numeric literals are just digit separators (10_000 == 10000).
# Both bounds are exclusive here.
income_query = eu_data_final['median_income'].gt(10_000) & eu_data_final['median_income'].lt(20_000)

In [163]:
eu_data_final[income_query]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
4,Cyprus,13.0,18490.2,16173,840407
5,Czechia,4.0,176370.1,12478,10436560
7,Estonia,6.8,21682.6,11867,1294455
8,Finland,8.8,216073.0,19997,5375276
14,Ireland,8.4,273238.2,18286,4574888
15,Italy,11.7,1689824.0,16237,59433744
19,Malta,4.7,10344.1,17264,417432
22,Poland,6.2,426547.5,10865,38044565
23,Portugal,11.2,186480.5,10805,10562178
25,Slovakia,9.7,81226.1,10466,5397036


In [158]:
# between() includes both endpoints by default. The boolean form
# `inclusive=True` is rejected by newer pandas releases, so rely on the default.
income_query_alt = eu_data_final.median_income.between(10_000, 20_000)

In [157]:
eu_data_final[income_query_alt]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
2,Bulgaria,7.6,48128.6,6742,7364570
3,Croatia,13.1,46639.5,8985,4284889
4,Cyprus,13.0,18490.2,16173,840407
5,Czechia,4.0,176370.1,12478,10436560
7,Estonia,6.8,21682.6,11867,1294455
8,Finland,8.8,216073.0,19997,5375276
11,Greece,23.6,176487.9,9048,10816286
12,Hungary,5.1,113903.8,8267,9937628
14,Ireland,8.4,273238.2,18286,4574888
15,Italy,11.7,1689824.0,16237,59433744


In [294]:
# What if we want multiple rows?
eu_data_final[eu_data_final['country'].isin(['Slovenia', 'United Kingdom'])]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
26,Slovenia,8.0,40357.2,15250,2050189
31,United Kingdom,4.8,2403382.6,17296,63182180


### What countries have a higher unemployment rate than Slovenia and have a 't' or a 'T' in their name?

In [224]:
# Look up Slovenia's unemployment rate as a single scalar value.
slovenia_unemployment_rate = eu_data_final.loc[
    eu_data_final.country == 'Slovenia', 'unemp_rate'
].iloc[0]

# Mask 1: rows with a strictly higher rate than Slovenia's.
unemployment_query = eu_data_final.unemp_rate > slovenia_unemployment_rate
# Mask 2: country names containing 't' or 'T'.
t_names_query = eu_data_final.country.str.contains('t', case=False)

# Combine element-wise with & and use the result to select rows.
final_query = eu_data_final[unemployment_query & t_names_query]
final_query

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
3,Croatia,13.1,46639.5,8985,4284889
15,Italy,11.7,1689824.0,16237,59433744
16,Latvia,9.6,25037.7,9257,2070371
23,Portugal,11.2,186480.5,10805,10562178
30,Turkey,10.9,780224.9,6501,7954662


In [246]:
### Now let's look at selection in dataframes, using the existing indexes

In [290]:
# Now we know that we can do this
slovenia = eu_data_final[eu_data_final.country == 'Slovenia']

# But what if we want specific values from the Slovenia row?
# use .loc!
slovenia_unemp_pop = eu_data_final.loc[eu_data_final.country == 'Slovenia', ['unemp_rate', 'total_pop']]

# Also great for finding specific values
slovenia_gdp = eu_data_final.loc[26, 'unemp_rate']

26    8.0
28    7.0
Name: unemp_rate, dtype: float64

In [288]:
# So what is an index anyway?
# It's a lot like a python dictionary
# The values within the row are a lot like a list
countries = {
    15: {'country': 'Italy', 
         'unemp_rate': 11.7, 
         'gdp': 1689824.0, 
         'median_income': 16237, 
         'total_pop': 59433744
        }
}
countries[15]['country']

'Italy'

In [289]:
eu_data_final.loc[15, 'country']

'Italy'

In [270]:
eu_data_final[eu_data_final['country'].isin(['Slovenia', 'United Kingdom'])]

Unnamed: 0,country,unemp_rate,gdp,median_income,total_pop
26,Slovenia,8.0,40357.2,15250,2050189
31,United Kingdom,4.8,2403382.6,17296,63182180


In [291]:
# Rows where country is 'Slovenia', restricted to the two listed columns.
mult_unemp_pop = eu_data_final.loc[eu_data_final['country'].eq('Slovenia'), ['unemp_rate', 'total_pop']]
mult_unemp_pop

Unnamed: 0,unemp_rate,total_pop
26,8.0,2050189


In [None]:
### What countries have a higher unemployment rate than Slovenia and have a 't' or a 'T' in their name?
