# Pandas Recap

----------------------------

### Imports

#### Import pandas and matplotlib

In [1]:
import pandas as pd
from matplotlib import pyplot as plt

#### Create a DataFrame from a two-dimensional list

In [114]:
data = [[82_000_000, 1.9, "Europe"],
      [ 5_500_000, 1.8, "Europe"]]

# the column names can be 'population', 'fertility', 'continent'


df = pd.DataFrame(data, columns = ['population', 'fertility', 'continent'], 
                                   index = ['Germany', 'Denmark'])
df
# or, you can make a different list and create a dataframe out of that

Unnamed: 0,population,fertility,continent
Germany,82000000,1.9,Europe
Denmark,5500000,1.8,Europe


#### Create a DataFrame from a dictionary

In [119]:
data2 ={'spices' : ['basil', 'peper', 'mustardseed', 'paprika'],
       'values' : [1.2,2.4,9.3,3],
       'in stock' :[True, True, False, False]}

df2= pd.DataFrame(data2)
df2

Unnamed: 0,spices,values,in stock
0,basil,1.2,True
1,peper,2.4,True
2,mustardseed,9.3,False
3,paprika,3.0,False


---------------------------

### Reading from a CSV file


- Load data file large_countries_2015 and set the parameter index_col=0

In [2]:
import pandas as pd
from matplotlib import pyplot as plt

In [3]:
df = pd.read_csv('large_countries_2015.csv', index_col=0)
df

Unnamed: 0_level_0,population,fertility,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bangladesh,160995600.0,2.12,Asia
Brazil,207847500.0,1.78,South America
China,1376049000.0,1.57,Asia
India,1311051000.0,2.43,Asia
Indonesia,257563800.0,2.28,Asia
Japan,126573500.0,1.45,Asia
Mexico,127017200.0,2.13,North America
Nigeria,182202000.0,5.89,Africa
Pakistan,188924900.0,3.04,Asia
Philippines,100699400.0,2.98,Asia


-------------------------------

+ Use the info() and  methods to inspect the data types and non-null values

In [18]:
df.info

<bound method DataFrame.info of                  population  fertility      continent
country                                              
Bangladesh     1.609956e+08       2.12           Asia
Brazil         2.078475e+08       1.78  South America
China          1.376049e+09       1.57           Asia
India          1.311051e+09       2.43           Asia
Indonesia      2.575638e+08       2.28           Asia
Japan          1.265735e+08       1.45           Asia
Mexico         1.270172e+08       2.13  North America
Nigeria        1.822020e+08       5.89         Africa
Pakistan       1.889249e+08       3.04           Asia
Philippines    1.006994e+08       2.98           Asia
Russia         1.434569e+08       1.61         Europe
United States  3.217736e+08       1.97  North America>

+ How do I see which columns do I have

In [19]:
df.columns

Index(['population', 'fertility', 'continent'], dtype='object')

+ Inspect row labels/index

In [20]:
df.index

Index(['Bangladesh', 'Brazil', 'China', 'India', 'Indonesia', 'Japan',
       'Mexico', 'Nigeria', 'Pakistan', 'Philippines', 'Russia',
       'United States'],
      dtype='object', name='country')

- Find out the total number of rows and columns in the DataFrame.

In [21]:
df.shape

(12, 3)

+ Display the last 2 rows of the DataFrame.

In [22]:
df.tail(2)

Unnamed: 0_level_0,population,fertility,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Russia,143456918.0,1.61,Europe
United States,321773631.0,1.97,North America


+ Display first two rows of the DataFrame

In [23]:
df.head(2)

Unnamed: 0_level_0,population,fertility,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bangladesh,160995642.0,2.12,Asia
Brazil,207847528.0,1.78,South America


* Check all the informations for country Brazil

In [24]:
df.loc["Brazil"]

population      207847528.0
fertility              1.78
continent     South America
Name: Brazil, dtype: object

* Select a population column

In [25]:
df['population']

country
Bangladesh       1.609956e+08
Brazil           2.078475e+08
China            1.376049e+09
India            1.311051e+09
Indonesia        2.575638e+08
Japan            1.265735e+08
Mexico           1.270172e+08
Nigeria          1.822020e+08
Pakistan         1.889249e+08
Philippines      1.006994e+08
Russia           1.434569e+08
United States    3.217736e+08
Name: population, dtype: float64

+ Call the columns fertility, population and continent

In [26]:
df[['population', 'fertility', 'continent']] # here we need double sqazre bracats becauase we have multiple tags




Unnamed: 0_level_0,population,fertility,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bangladesh,160995600.0,2.12,Asia
Brazil,207847500.0,1.78,South America
China,1376049000.0,1.57,Asia
India,1311051000.0,2.43,Asia
Indonesia,257563800.0,2.28,Asia
Japan,126573500.0,1.45,Asia
Mexico,127017200.0,2.13,North America
Nigeria,182202000.0,5.89,Africa
Pakistan,188924900.0,3.04,Asia
Philippines,100699400.0,2.98,Asia


+ Pull up the informations for China, Japan and Russia (use .loc function as we are looking for labels here)


In [27]:
df.loc[["China", "Japan", "Russia"]]

Unnamed: 0_level_0,population,fertility,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
China,1376049000.0,1.57,Asia
Japan,126573500.0,1.45,Asia
Russia,143456900.0,1.61,Europe


+ Check the value of fertility for Indonesia

In [28]:
df.loc["Indonesia", "fertility"] # the first iteam in the list is calling the rown, the second is calling the column 

# df.loc(row, column)


2.28

+ Use iloc to select first 3 rows

In [29]:
df.iloc[0:3] # .iloc does not include the first value

Unnamed: 0_level_0,population,fertility,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bangladesh,160995600.0,2.12,Asia
Brazil,207847500.0,1.78,South America
China,1376049000.0,1.57,Asia


+ Select first 5 rows, fertility and continent columns.

In [30]:
df.iloc[0:5, 1:] # as we start at column #1, we delete column #0 which is population



Unnamed: 0_level_0,fertility,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Bangladesh,2.12,Asia
Brazil,1.78,South America
China,1.57,Asia
India,2.43,Asia
Indonesia,2.28,Asia


+ Select countries that have fertility greater than 1.3

In [37]:
mask = df.loc[df['fertility'] > 1.9] # don't forget to define a boolean mask
mask

Unnamed: 0_level_0,population,fertility,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bangladesh,160995600.0,2.12,Asia
India,1311051000.0,2.43,Asia
Indonesia,257563800.0,2.28,Asia
Mexico,127017200.0,2.13,North America
Nigeria,182202000.0,5.89,Africa
Pakistan,188924900.0,3.04,Asia
Philippines,100699400.0,2.98,Asia
United States,321773600.0,1.97,North America


+ Select the countries that have fertility values between 1.9 and 2.12

In [42]:
mask1 = df['fertility'].between(1.9, 2.12)


In [43]:
df[mask1]

Unnamed: 0_level_0,population,fertility,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bangladesh,160995642.0,2.12,Asia
United States,321773631.0,1.97,North America


+ Sort the DataFrame by the "fertility" column in descending order using the sort_values() method.

In [48]:
df.sort_values(by = "fertility", ascending = False)

Unnamed: 0_level_0,population,fertility,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Nigeria,182202000.0,5.89,Africa
Pakistan,188924900.0,3.04,Asia
Philippines,100699400.0,2.98,Asia
India,1311051000.0,2.43,Asia
Indonesia,257563800.0,2.28,Asia
Mexico,127017200.0,2.13,North America
Bangladesh,160995600.0,2.12,Asia
United States,321773600.0,1.97,North America
Brazil,207847500.0,1.78,South America
Russia,143456900.0,1.61,Europe


+ Get the unique values for the continent column

+ Check which countries have the population greater than 200 000 000?

In [51]:
df.loc[df['population'] > 200_000_000]

Unnamed: 0_level_0,population,fertility,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Brazil,207847500.0,1.78,South America
China,1376049000.0,1.57,Asia
India,1311051000.0,2.43,Asia
Indonesia,257563800.0,2.28,Asia
United States,321773600.0,1.97,North America


+ Create a bar chart that shows the count of each "population" category in the DataFrame.

In [57]:
# if you want to quickly check your data, you can use this plot function

yearly_births.plot()

NameError: name 'yearly_births' is not defined

---------------------------------------------------------------