In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

In [3]:
url = "https://raw.githubusercontent.com/edlich/eternalrepo/master/DS-WAHLFACH/countries.csv"
countries = pd.read_csv(url)

In [4]:
display(countries)

Unnamed: 0,Name,People,Area,BIP,Currency
0,Germany,82521653,357385,3466,EUR
1,Japan,126045000,377835,4938,YEN
2,Canada,36503097,9984670,1529,CAD
3,Italy,60501718,301338,1850,EUR
4,Brazilia,208360000,8515770,1798,REAL


In [5]:
people_sum = countries['People'].sum()
people_cum_sum = countries['People'].cumsum()
print("The sum of all people in all 5 countries is: " + str(people_sum))
print("The cumulative sums moving from the top are:")
display(people_cum_sum)

The sum of all people in all 5 countries is: 513931468
The cumulative sums moving from the top are:


0     82521653
1    208566653
2    245069750
3    305571468
4    513931468
Name: People, dtype: int64

In [6]:
bip_arith_mean = countries['BIP'].mean()
bip_median = countries['BIP'].median()
print("The arithmetic mean of the BIP of all countries is: " + str(bip_arith_mean))
print("The median of the BIP of all countries is: " + str(bip_median))

The arithmetic mean of the BIP of all countries is: 2716.2
The median of the BIP of all countries is: 1850.0


In [7]:
countries_means = countries.mean(axis=0)
print("And here is the mean of each column:")
display(countries_means)

And here is the mean of each column:


People    102786293.6
Area        3907399.6
BIP            2716.2
dtype: float64

In [8]:
print("Basic statistic information for a column, e.g. Area, can be shown all at once:")
area_description = countries['Area'].describe()
display(area_description)

Basic statistic information for a column, e.g. Area, can be shown all at once:


count    5.000000e+00
mean     3.907400e+06
std      4.904957e+06
min      3.013380e+05
25%      3.573850e+05
50%      3.778350e+05
75%      8.515770e+06
max      9.984670e+06
Name: Area, dtype: float64

In [9]:
countries_corr = countries.corr()
countries_cov = countries.cov()
print("The correlation and covariance matrices look like this:")
print("Covariance:")
display(countries_cov)
print("Correlation:")
display(countries_corr)

The correlation and covariance matrices look like this:
Covariance:


Unnamed: 0,People,Area,BIP
People,4569720000000000.0,56507320000000.0,13715570000.0
Area,56507320000000.0,24058600000000.0,-4706640000.0
BIP,13715570000.0,-4706640000.0,2125358.0


Correlation:


Unnamed: 0,People,Area,BIP
People,1.0,0.170422,0.139172
Area,0.170422,1.0,-0.658203
BIP,0.139172,-0.658203,1.0


In [10]:
print("The last 4 rows of the data frame:")
display(countries.tail(4))

The last 4 rows of the data frame:


Unnamed: 0,Name,People,Area,BIP,Currency
1,Japan,126045000,377835,4938,YEN
2,Canada,36503097,9984670,1529,CAD
3,Italy,60501718,301338,1850,EUR
4,Brazilia,208360000,8515770,1798,REAL


In [11]:
print("All the row of countries who have the EURO")
display(countries.loc[countries['Currency'] == "EUR"])

All the row of countries who have the EURO


Unnamed: 0,Name,People,Area,BIP,Currency
0,Germany,82521653,357385,3466,EUR
3,Italy,60501718,301338,1850,EUR


In [12]:
print("A new data frame with only name and Currency")
new_data_frame = countries[['Name', 'Currency']].copy()
display(new_data_frame)

A new data frame with only name and Currency


Unnamed: 0,Name,Currency
0,Germany,EUR
1,Japan,YEN
2,Canada,CAD
3,Italy,EUR
4,Brazilia,REAL


In [13]:
print("All the row of countries that have more than 2000 BIP")
display(countries.loc[countries['BIP'] >= 2000])

All the row of countries that have more than 2000 BIP


Unnamed: 0,Name,People,Area,BIP,Currency
0,Germany,82521653,357385,3466,EUR
1,Japan,126045000,377835,4938,YEN


In [14]:
print("All the row of countries with inhabitants between 50 and 150 Mio")
display(countries[countries['People'].between(50000000, 150000000)])

All the row of countries with inhabitants between 50 and 150 Mio


Unnamed: 0,Name,People,Area,BIP,Currency
0,Germany,82521653,357385,3466,EUR
1,Japan,126045000,377835,4938,YEN
3,Italy,60501718,301338,1850,EUR


In [18]:
print("Changing 'BIP' to 'Bip'")
countries=countries.rename(index=str, columns={'BIP': 'Bip'})
display(countries)

Changing 'BIP' to 'Bip'


Unnamed: 0,Name,People,Area,Bip,Currency
0,Germany,82521653,357385,3466,EUR
1,Japan,126045000,377835,4938,YEN
2,Canada,36503097,9984670,1529,CAD
3,Italy,60501718,301338,1850,EUR
4,Brazilia,208360000,8515770,1798,REAL


In [19]:
bip_sum=countries['Bip'].sum()
print("The Bip sum: " + str(bip_sum))

The Bip sum: 13581


In [26]:
people_average=countries['People'].mean()
print("The average people of all countries: " + str(people_average))

The average people of all countries: 102786293.6


In [27]:
print("Sorted by name alphabetically")
display(countries.sort_values(by=['Name']))

Unnamed: 0,Name,People,Area,Bip,Currency
4,Brazilia,208360000,8515770,1798,REAL
2,Canada,36503097,9984670,1529,CAD
0,Germany,82521653,357385,3466,EUR
3,Italy,60501718,301338,1850,EUR
1,Japan,126045000,377835,4938,YEN


In [32]:
countries.loc[countries['Area'] > 1000000, 'Area'] = 'BIG'
display(countries)

TypeError: '>' not supported between instances of 'str' and 'int'