# DATAFRAMES with Pandas

In [1]:
import pandas as pd
import numpy as np


We'll continue our analysis of the G7 countries, now looking at DataFrames. As mentioned, a DataFrame looks a lot like a table.
Creating DataFrames manually can be tedious. 99% of the time you'll be pulling the data from a database, a CSV file or the web. Still, you can create a DataFrame by specifying the columns and values:

In [2]:
# Build the G7 table column by column: each dict key becomes a column and each
# list holds that column's values in row order. `columns=` pins the column order.
df = pd.DataFrame(
    {
        'Population': [35.467, 63.951, 80.94, 60.665, 127.061, 64.511, 318.523],
        'GDP': [1785387, 2833687, 3874437, 2167744, 4602367, 2950039, 17348075],
        'Surface Area': [9984670, 640679, 357114, 301336, 377930, 242495, 9525067],
        'HDI': [0.913, 0.888, 0.916, 0.873, 0.891, 0.907, 0.915],
        'Continent': ['America', 'Europe', 'Europe', 'Europe', 'Asia', 'Europe', 'America'],
    },
    columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'],
)

In [3]:
# Display the frame as built; the rows carry the default integer index 0-6.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


Each DataFrame column is a pandas Series, so we can think of a DataFrame as a collection of Series that share the same index.

In [4]:
# Replace the default 0-6 row labels with the country names (one per row, in order).
df.index = ['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom', 'United States']
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [5]:
df.info() # concise summary: index range, column names, non-null counts, dtypes and memory usage

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to United States
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 336.0+ bytes


In [6]:
# size is the total number of cells (rows * columns); shape is the (rows, columns) tuple.
print(df.size, '\n', df.shape, sep=' ')

35 
 (7, 5)


In [7]:
df.describe() # summary statistics (count, mean, std, min, quartiles, max) for the numeric columns only

Unnamed: 0,Population,GDP,Surface Area,HDI
count,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429
std,97.24997,5494020.0,4576187.0,0.016592
min,35.467,1785387.0,242495.0,0.873
25%,62.308,2500716.0,329225.0,0.8895
50%,64.511,2950039.0,377930.0,0.907
75%,104.0005,4238402.0,5082873.0,0.914
max,318.523,17348080.0,9984670.0,0.916


In [8]:
# Data type of every column, returned as a Series indexed by column name.
df.dtypes

Population      float64
GDP               int64
Surface Area      int64
HDI             float64
Continent        object
dtype: object

In [9]:
# Count how many columns share each dtype.
df.dtypes.value_counts()

int64      2
float64    2
object     1
dtype: int64

## INDEXING, SLICING AND SELECTION:
Remember that each column is represented as a Series.

In [10]:
df.loc['Canada'] # label-based row selection; the row comes back as a Series indexed by column name

Population       35.467
GDP             1785387
Surface Area    9984670
HDI               0.913
Continent       America
Name: Canada, dtype: object

In [11]:
df.iloc[-2] # position-based selection; -2 is the second-to-last row (United Kingdom)

Population       64.511
GDP             2950039
Surface Area     242495
HDI               0.907
Continent        Europe
Name: United Kingdom, dtype: object

In [12]:
df['Population'] # select a single column by name; the result is a Series

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: Population, dtype: float64

In [13]:
# Same single-column selection for an integer column: the Series keeps the int64 dtype.
df['GDP']

Canada             1785387
France             2833687
Germany            3874437
Italy              2167744
Japan              4602367
United Kingdom     2950039
United States     17348075
Name: GDP, dtype: int64

In [14]:
df['Population'].to_frame() # convert the Series into a one-column DataFrame

Unnamed: 0,Population
Canada,35.467
France,63.951
Germany,80.94
Italy,60.665
Japan,127.061
United Kingdom,64.511
United States,318.523


Slicing with plain square brackets works at the row level.

In [15]:
df[1:3] # a bare slice selects rows by position (stop excluded); prefer the explicit .loc / .iloc

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe


In [16]:
# A list of column names returns a DataFrame containing just those columns.
df[['Population','Surface Area']]

Unnamed: 0,Population,Surface Area
Canada,35.467,9984670
France,63.951,640679
Germany,80.94,357114
Italy,60.665,301336
Japan,127.061,377930
United Kingdom,64.511,242495
United States,318.523,9525067


In [17]:
# Label slices with .loc include BOTH endpoints, so 'Japan' is part of the result.
df.loc['France':'Japan','Surface Area']

France     640679
Germany    357114
Italy      301336
Japan      377930
Name: Surface Area, dtype: int64

In [18]:
# A list of positions picks arbitrary rows; negative positions count from the end.
df.iloc[[0,1,-2]]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
United Kingdom,64.511,2950039,242495,0.907,Europe


In [19]:
df.iloc[1:3,[1,2]] # rows at positions 1-2 (iloc excludes the stop position 3), columns at positions 1 and 2

Unnamed: 0,GDP,Surface Area
France,2833687,640679
Germany,3874437,357114


## CONDITIONAL SELECTION

In [20]:
# Element-wise comparison: yields a boolean Series with one True/False per row.
df['Population']>70

Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: Population, dtype: bool

In [21]:
# Passing the boolean Series to .loc keeps only the rows where it is True.
df.loc[df['Population']>70]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United States,318.523,17348075,9525067,0.915,America


In [22]:
df.loc[df['Population']>70,'Population'] # rows where the mask is True, 'Population' column only; the result is a Series.
                                        # Append .to_frame() to get a DataFrame instead.

Germany           80.940
Japan            127.061
United States    318.523
Name: Population, dtype: float64

## DROPPING STUFF

In [23]:
df.drop('Canada') # returns a copy without that row (df itself is unchanged); pass a list to drop several, e.g. df.drop(['Canada','Japan'])

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [24]:
df.drop(columns=['Population','HDI']) # the columns= keyword is more readable than the equivalent numpy-style axis=1

Unnamed: 0,GDP,Surface Area,Continent
Canada,1785387,9984670,America
France,2833687,640679,Europe
Germany,3874437,357114,Europe
Italy,2167744,301336,Europe
Japan,4602367,377930,Asia
United Kingdom,2950039,242495,Europe
United States,17348075,9525067,America


## Operations with Series
Working at the column level.

In [25]:
# Per-column adjustments, keyed by the column name each value applies to.
crisis = pd.Series({'GDP': -1000000, 'HDI': -0.3})

In [26]:
df[['GDP','HDI']]+crisis # the Series index is matched to the column names, so each (negative) value is added to every row of its column

Unnamed: 0,GDP,HDI
Canada,785387.0,0.613
France,1833687.0,0.588
Germany,2874437.0,0.616
Italy,1167744.0,0.573
Japan,3602367.0,0.591
United Kingdom,1950039.0,0.607
United States,16348075.0,0.615


## MODIFYING DATAFRAMES:

In [27]:
# Adding a new column: start from a Series that only covers some of the countries.
langs = pd.Series(
    {'France': 'French', 'Germany': 'German', 'Italy': 'Italian'},
    name="Language",
)

In [28]:
# Assignment aligns on the index: countries missing from `langs` get NaN in the new column.
df['Language']=langs
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,German
Italy,60.665,2167744,301336,0.873,Europe,Italian
Japan,127.061,4602367,377930,0.891,Asia,
United Kingdom,64.511,2950039,242495,0.907,Europe,
United States,318.523,17348075,9525067,0.915,America,


### REPLACING VALUES PER COLUMN:

In [29]:
df['Language']='English' # a scalar is broadcast to every row, overwriting the whole column
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
United Kingdom,64.511,2950039,242495,0.907,Europe,English
United States,318.523,17348075,9525067,0.915,America,English


### RENAMING COLUMNS:

In [30]:
# rename() returns a renamed copy; `df` itself only changes if the result is
# assigned back (df = df.rename(...)). Mapping keys that match no existing
# label are ignored, and the final `df` shows the original is untouched.
df.rename(
    columns={'HDI': 'Human Development Index', 'Anual Popcorn Consumption': 'APC'},
    index={'United States': 'USA', 'United Kingdom': 'UK', 'Argentina': 'AR'},
)
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
United Kingdom,64.511,2950039,242495,0.907,Europe,English
United States,318.523,17348075,9525067,0.915,America,English


In [31]:
df.rename(index=str.upper)  # a callable is applied to every index label; here each one is upper-cased

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
CANADA,35.467,1785387,9984670,0.913,America,English
FRANCE,63.951,2833687,640679,0.888,Europe,English
GERMANY,80.94,3874437,357114,0.916,Europe,English
ITALY,60.665,2167744,301336,0.873,Europe,English
JAPAN,127.061,4602367,377930,0.891,Asia,English
UNITED KINGDOM,64.511,2950039,242495,0.907,Europe,English
UNITED STATES,318.523,17348075,9525067,0.915,America,English


In [32]:
df.rename(index=lambda x: x.lower())  # the same idea with a lambda: lower-case every index label

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
canada,35.467,1785387,9984670,0.913,America,English
france,63.951,2833687,640679,0.888,Europe,English
germany,80.94,3874437,357114,0.916,Europe,English
italy,60.665,2167744,301336,0.873,Europe,English
japan,127.061,4602367,377930,0.891,Asia,English
united kingdom,64.511,2950039,242495,0.907,Europe,English
united states,318.523,17348075,9525067,0.915,America,English


### Adding Values:


In [33]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported way to add rows. The new row is a one-row frame labelled 'China'.
# Columns it does not provide are filled with NaN. The values are written as
# floats so GDP is upcast to float, as in the output append produced.
# A new frame is returned; `df` itself is not modified.
pd.concat([
    df,
    pd.DataFrame({'Population': 3.0, 'GDP': 5.0}, index=['China']),
])


Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387.0,9984670.0,0.913,America,English
France,63.951,2833687.0,640679.0,0.888,Europe,English
Germany,80.94,3874437.0,357114.0,0.916,Europe,English
Italy,60.665,2167744.0,301336.0,0.873,Europe,English
Japan,127.061,4602367.0,377930.0,0.891,Asia,English
United Kingdom,64.511,2950039.0,242495.0,0.907,Europe,English
United States,318.523,17348075.0,9525067.0,0.915,America,English
China,3.0,5.0,,,,


In [34]:
# Assigning to a label that does not exist yet adds the row in place; the
# Series keys are matched to the column names, so their order is irrelevant.
df.loc['China'] = pd.Series({
    'Population': 1400.00, 'Continent': 'Asia', 'GDP': 17000000,
    'HDI': 0.889, 'Language': 'Chinese', 'Surface Area': 100000000,
})
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
United Kingdom,64.511,2950039,242495,0.907,Europe,English
United States,318.523,17348075,9525067,0.915,America,English
China,1400.0,17000000,100000000,0.889,Asia,Chinese


In [35]:
# Moves the country labels into a regular 'index' column and numbers the rows 0..n-1.
# A new frame is returned; `df` keeps its country index.
df.reset_index()

Unnamed: 0,index,Population,GDP,Surface Area,HDI,Continent,Language
0,Canada,35.467,1785387,9984670,0.913,America,English
1,France,63.951,2833687,640679,0.888,Europe,English
2,Germany,80.94,3874437,357114,0.916,Europe,English
3,Italy,60.665,2167744,301336,0.873,Europe,English
4,Japan,127.061,4602367,377930,0.891,Asia,English
5,United Kingdom,64.511,2950039,242495,0.907,Europe,English
6,United States,318.523,17348075,9525067,0.915,America,English
7,China,1400.0,17000000,100000000,0.889,Asia,Chinese


In [36]:
# Derive a new column from two existing ones; the division is element-wise, row by row.
df['GDP per capita'] = df['GDP'].div(df['Population'])
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP per capita
Canada,35.467,1785387,9984670,0.913,America,English,50339.385908
France,63.951,2833687,640679,0.888,Europe,English,44310.284437
Germany,80.94,3874437,357114,0.916,Europe,English,47868.013343
Italy,60.665,2167744,301336,0.873,Europe,English,35733.025633
Japan,127.061,4602367,377930,0.891,Asia,English,36221.712406
United Kingdom,64.511,2950039,242495,0.907,Europe,English,45729.239975
United States,318.523,17348075,9525067,0.915,America,English,54464.12033
China,1400.0,17000000,100000000,0.889,Asia,Chinese,12142.857143


## STATISTICAL INFO

In [37]:
# First five rows only: a quick peek without printing the whole frame.
df.head()

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP per capita
Canada,35.467,1785387,9984670,0.913,America,English,50339.385908
France,63.951,2833687,640679,0.888,Europe,English,44310.284437
Germany,80.94,3874437,357114,0.916,Europe,English,47868.013343
Italy,60.665,2167744,301336,0.873,Europe,English,35733.025633
Japan,127.061,4602367,377930,0.891,Asia,English,36221.712406


In [38]:
# Keep a handle on the 'Population' column for the statistics below.
population=df['Population']

In [39]:
population  # the extracted column is a Series

Canada              35.467
France              63.951
Germany             80.940
Italy               60.665
Japan              127.061
United Kingdom      64.511
United States      318.523
China             1400.000
Name: Population, dtype: float64

In [40]:
population.min(), population.max() # other reductions such as std(), mean(), etc. work the same way

(35.467, 1400.0)