In [1]:
import numpy as np
import pandas as pd 

# Pandas series

In [2]:
# G7 populations in millions of inhabitants. The positional order matches the
# country labels assigned to the index further down (Canada ... USA).
# NOTE(review): Canada is 35.647 here but 35.467 in the later Series/DataFrame
# cells -- one of the two looks like a transposed-digit typo; confirm.
g7_pop = pd.Series(
    data=[
        35.647,
        63.951,
        80.940,
        60.665,
        127.061,
        64.511,
        318.523,
    ]
)

In [3]:
g7_pop

0     35.647
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    318.523
dtype: float64

In [4]:
#adding a name to the series:

g7_pop.name = 'G7 Population in millions'

In [5]:
g7_pop

0     35.647
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    318.523
Name: G7 Population in millions, dtype: float64

In [6]:
g7_pop.dtype

dtype('float64')

In [7]:
# .values exposes the underlying data as a NumPy array
g7_pop.values

array([ 35.647,  63.951,  80.94 ,  60.665, 127.061,  64.511, 318.523])

A Series looks like a NumPy array, but it behaves more like a dictionary:
every value is associated with a label in the Series' index.

In [8]:
g7_pop[0]

35.647

In [9]:
g7_pop[1]

63.951

Defining the index explicitly:

In [10]:
# Swap the default integer labels for country names; labels are matched to the
# existing values by position.
g7_pop.index = pd.Index(
    ['Canada', 'France', 'Germany', 'Italy', 'Japan', 'UK', 'USA']
)

In [11]:
g7_pop

Canada      35.647
France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
UK          64.511
USA        318.523
Name: G7 Population in millions, dtype: float64

In [12]:
# Build a Series directly from a {label: value} mapping: the keys become the
# index and the values become the data.
pd.Series(
    name='G7 Population in millions',
    data={
        'Canada': 35.467,
        'France': 63.951,
        'Germany': 80.94,
        'Italy': 60.665,
        'Japan': 127.061,
        'United Kingdom': 64.511,
        'United States': 318.523,
    },
)

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

In [13]:
# Same Series built from two parallel lists: the values, and their labels
# passed through `index`.
pd.Series(
    data=[35.467, 63.951, 80.94, 60.665, 127.061, 64.511, 318.523],
    index=[
        'Canada', 'France', 'Germany', 'Italy', 'Japan',
        'United Kingdom', 'United States',
    ],
    name='G7 Population in millions',
)

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

In [14]:
# Keep only the listed countries, in the listed order; the Series name is kept.
g7_pop.reindex(['France', 'Germany', 'Italy', 'UK'])

France     63.951
Germany    80.940
Italy      60.665
UK         64.511
Name: G7 Population in millions, dtype: float64

## Indexing

Indexing works much like it does for lists and dictionaries: you use the index label of the element you're looking for:

In [15]:
g7_pop['Canada']

35.647

To select by numeric position instead, use the `iloc` attribute:

In [16]:
g7_pop.iloc[0]

35.647

In [17]:
g7_pop.iloc[1]

63.951

In [18]:
g7_pop.iloc[-1]

318.523

Selecting multiple elements at once:

In [19]:
g7_pop[['Italy','UK']]

Italy    60.665
UK       64.511
Name: G7 Population in millions, dtype: float64

In [20]:
g7_pop.iloc[[0,-1]]

Canada     35.647
USA       318.523
Name: G7 Population in millions, dtype: float64

In [21]:
# Label-based slice: unlike positional slices, the end label ('Italy') is
# included in the result.
g7_pop.loc['Canada':'Italy']

Canada     35.647
France     63.951
Germany    80.940
Italy      60.665
Name: G7 Population in millions, dtype: float64

# Conditional selection (boolean arrays)

The same boolean array techniques we saw applied to NumPy arrays can be used for pandas Series:

In [22]:
g7_pop

Canada      35.647
France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
UK          64.511
USA        318.523
Name: G7 Population in millions, dtype: float64

In [23]:
g7_pop > 70

Canada     False
France     False
Germany     True
Italy      False
Japan       True
UK         False
USA         True
Name: G7 Population in millions, dtype: bool

In [24]:
g7_pop[g7_pop > 70]

Germany     80.940
Japan      127.061
USA        318.523
Name: G7 Population in millions, dtype: float64

In [25]:
g7_pop.mean()

107.32828571428571

In [26]:
g7_pop[g7_pop > g7_pop.mean()]

Japan    127.061
USA      318.523
Name: G7 Population in millions, dtype: float64

In [27]:
g7_pop.std()

97.22783106825997

In [28]:
# Keep the countries above (mean - std/2). The previous expression OR-ed this
# with `g7_pop > mean + std/2`, but every value above the upper bound is also
# above the lower one, so that second clause never changed the selection.
# NOTE(review): if the intent was "outside the band around the mean", the
# first comparison should be `<` rather than `>` -- confirm the intended filter.
g7_pop[g7_pop > g7_pop.mean() - g7_pop.std() / 2]

France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
UK          64.511
USA        318.523
Name: G7 Population in millions, dtype: float64

# Operations and methods

In [29]:
g7_pop

Canada      35.647
France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
UK          64.511
USA        318.523
Name: G7 Population in millions, dtype: float64

In [30]:
g7_pop * 1000000

Canada      35647000.0
France      63951000.0
Germany     80940000.0
Italy       60665000.0
Japan      127061000.0
UK          64511000.0
USA        318523000.0
Name: G7 Population in millions, dtype: float64

In [31]:
g7_pop.mean()

107.32828571428571

In [32]:
np.log(g7_pop)

Canada     3.573665
France     4.158117
Germany    4.393708
Italy      4.105367
Japan      4.844667
UK         4.166836
USA        5.763695
Name: G7 Population in millions, dtype: float64

## Modifying series

In [33]:
g7_pop['Canada'] = 40.5

In [34]:
g7_pop

Canada      40.500
France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
UK          64.511
USA        318.523
Name: G7 Population in millions, dtype: float64

In [35]:
g7_pop.iloc[-1] = 500

In [36]:
g7_pop

Canada      40.500
France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
UK          64.511
USA        500.000
Name: G7 Population in millions, dtype: float64

In [37]:
# Boolean-mask assignment: every entry below 70 is overwritten with 70, in place.
g7_pop.loc[g7_pop < 70] = 70

In [38]:
g7_pop

Canada      70.000
France      70.000
Germany     80.940
Italy       70.000
Japan      127.061
UK          70.000
USA        500.000
Name: G7 Population in millions, dtype: float64

# Dataframes

In [39]:
# Per-country indicators for the G7. Every list follows the same row order as
# the country labels assigned to the index afterwards (Canada, France, Germany,
# Italy, Japan, United Kingdom, United States).
df = pd.DataFrame(
    data={
        'Population': [35.467, 63.951, 80.94, 60.665, 127.061, 64.511, 318.523],
        'GDP': [1785387, 2833687, 3874437, 2167744, 4602367, 2950039, 17348075],
        'Surface Area': [9984670, 640679, 357114, 301336, 377930, 242495, 9525067],
        'HDI': [0.913, 0.888, 0.916, 0.873, 0.891, 0.907, 0.915],
        'Continent': [
            'America', 'Europe', 'Europe', 'Europe', 'Asia', 'Europe', 'America',
        ],
    },
    # Explicit column order for the resulting frame.
    columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'],
)

In [40]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


DataFrames also have indexes. As you can see in the "table" above, pandas has assigned a numeric, autoincremental index automatically to each "row" in our DataFrame. In our case, we know that each row represents a country, so we'll just reassign the index:

In [41]:
# One label per row, in the same order the rows were declared.
df.index = pd.Index([
    'Canada', 'France', 'Germany', 'Italy', 'Japan',
    'United Kingdom', 'United States',
])

In [42]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [43]:
df.columns

Index(['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'], dtype='object')

In [44]:
df.index

Index(['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom',
       'United States'],
      dtype='object')

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to United States
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 336.0+ bytes


In [46]:
df.size

35

In [47]:
df.shape

(7, 5)

In [48]:
df.describe()

Unnamed: 0,Population,GDP,Surface Area,HDI
count,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429
std,97.24997,5494020.0,4576187.0,0.016592
min,35.467,1785387.0,242495.0,0.873
25%,62.308,2500716.0,329225.0,0.8895
50%,64.511,2950039.0,377930.0,0.907
75%,104.0005,4238402.0,5082873.0,0.914
max,318.523,17348080.0,9984670.0,0.916


In [49]:
df.dtypes

Population      float64
GDP               int64
Surface Area      int64
HDI             float64
Continent        object
dtype: object

In [50]:
df.dtypes.value_counts()

int64      2
float64    2
object     1
dtype: int64

## Indexing, selection and slicing

Individual columns in the DataFrame can be selected with regular indexing. Each column is represented as a Series:

In [51]:
df['Population']

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: Population, dtype: float64

In [52]:
df['Population'].to_frame()

Unnamed: 0,Population
Canada,35.467
France,63.951
Germany,80.94
Italy,60.665
Japan,127.061
United Kingdom,64.511
United States,318.523


In [53]:
df[['Population', 'GDP']]

Unnamed: 0,Population,GDP
Canada,35.467,1785387
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744
Japan,127.061,4602367
United Kingdom,64.511,2950039
United States,318.523,17348075


Row-level selection: prefer `loc` and `iloc` over plain `[:]` slicing:

In [54]:
df[1:3]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe


In [55]:
df.loc['Italy'].to_frame()

Unnamed: 0,Italy
Population,60.665
GDP,2167744
Surface Area,301336
HDI,0.873
Continent,Europe


In [56]:
df.iloc[1].to_frame()

Unnamed: 0,France
Population,63.951
GDP,2833687
Surface Area,640679
HDI,0.888
Continent,Europe


In [57]:
df.loc['France':'Italy']

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe


In [58]:
df.loc['France':'Italy', 'Population']

France     63.951
Germany    80.940
Italy      60.665
Name: Population, dtype: float64

In [59]:
df.loc['France':'Italy', ['Population', 'GDP']]

Unnamed: 0,Population,GDP
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744


#### iloc

In [60]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [61]:
df.iloc[0]

Population       35.467
GDP             1785387
Surface Area    9984670
HDI               0.913
Continent       America
Name: Canada, dtype: object

In [62]:
df.iloc[[0,1,-1]]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
United States,318.523,17348075,9525067,0.915,America


In [63]:
df.iloc[1:3]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe


In [64]:
df.iloc[1:3,3]

France     0.888
Germany    0.916
Name: HDI, dtype: float64

In [65]:
df.iloc[1:3,[0,3]]

Unnamed: 0,Population,HDI
France,63.951,0.888
Germany,80.94,0.916


In [66]:
df.iloc[1:3, 1:3]

Unnamed: 0,GDP,Surface Area
France,2833687,640679
Germany,3874437,357114


## Conditional selection (boolean arrays)

In [67]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


It works the same way as for a Series:

In [68]:
df['Population'] > 70

Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: Population, dtype: bool

In [69]:
df[df['Population']>70]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United States,318.523,17348075,9525067,0.915,America


In [71]:
df.loc[df['Population']>70, 'Population']

Germany           80.940
Japan            127.061
United States    318.523
Name: Population, dtype: float64

In [72]:
df.loc[df['Population']>70, ['Population','GDP']]

Unnamed: 0,Population,GDP
Germany,80.94,3874437
Japan,127.061,4602367
United States,318.523,17348075


## Dropping stuff

In [73]:
df.drop('Canada')

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [74]:
df


Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [75]:
df.drop(['Canada', 'Japan'])

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [76]:
df.drop(columns=['Population', 'HDI'])

Unnamed: 0,GDP,Surface Area,Continent
Canada,1785387,9984670,America
France,2833687,640679,Europe
Germany,3874437,357114,Europe
Italy,2167744,301336,Europe
Japan,4602367,377930,Asia
United Kingdom,2950039,242495,Europe
United States,17348075,9525067,America


In [77]:
df.drop(['Italy','Canada'], axis=0)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [78]:
df.drop(['Population', 'GDP'], axis=1)

Unnamed: 0,Surface Area,HDI,Continent
Canada,9984670,0.913,America
France,640679,0.888,Europe
Germany,357114,0.916,Europe
Italy,301336,0.873,Europe
Japan,377930,0.891,Asia
United Kingdom,242495,0.907,Europe
United States,9525067,0.915,America


## Operations

In [79]:
df[['Population', 'GDP']]

Unnamed: 0,Population,GDP
Canada,35.467,1785387
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744
Japan,127.061,4602367
United Kingdom,64.511,2950039
United States,318.523,17348075


In [80]:
df[['Population', 'GDP']] /100

Unnamed: 0,Population,GDP
Canada,0.35467,17853.87
France,0.63951,28336.87
Germany,0.8094,38744.37
Italy,0.60665,21677.44
Japan,1.27061,46023.67
United Kingdom,0.64511,29500.39
United States,3.18523,173480.75


Mixing operations with Series:

In [81]:
# Per-column adjustment: the labels match DataFrame column names, so adding
# this Series to a frame applies each value to the column of the same name.
crisis = pd.Series({'GDP': -1_000_000, 'HDI': -0.3})

In [82]:
crisis

GDP   -1000000.0
HDI         -0.3
dtype: float64

In [83]:
df[['GDP', 'HDI']]

Unnamed: 0,GDP,HDI
Canada,1785387,0.913
France,2833687,0.888
Germany,3874437,0.916
Italy,2167744,0.873
Japan,4602367,0.891
United Kingdom,2950039,0.907
United States,17348075,0.915


In [84]:
df[['GDP', 'HDI']] + crisis

Unnamed: 0,GDP,HDI
Canada,785387.0,0.613
France,1833687.0,0.588
Germany,2874437.0,0.616
Italy,1167744.0,0.573
Japan,3602367.0,0.591
United Kingdom,1950039.0,0.607
United States,16348075.0,0.615


## Modifying dataframes

In [85]:
# Languages for a subset of the countries only; the labels are country names so
# the values can be aligned with the DataFrame rows.
langs = pd.Series(
    {'France': 'French', 'Germany': 'German', 'Italy': 'Italian'},
    name='Language',
)

In [86]:
langs


France      French
Germany     German
Italy      Italian
Name: Language, dtype: object

In [87]:
df['Language'] = langs

In [88]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,German
Italy,60.665,2167744,301336,0.873,Europe,Italian
Japan,127.061,4602367,377930,0.891,Asia,
United Kingdom,64.511,2950039,242495,0.907,Europe,
United States,318.523,17348075,9525067,0.915,America,


### Replacing values per column

In [89]:
df['Language'] = 'english'

In [90]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,english
France,63.951,2833687,640679,0.888,Europe,english
Germany,80.94,3874437,357114,0.916,Europe,english
Italy,60.665,2167744,301336,0.873,Europe,english
Japan,127.061,4602367,377930,0.891,Asia,english
United Kingdom,64.511,2950039,242495,0.907,Europe,english
United States,318.523,17348075,9525067,0.915,America,english


### Renaming columns

In [91]:
# Rename columns and row labels through old -> new mappings.
# Keys that match no existing label ('Anual Popcorn Consumption', 'Argentina')
# are skipped without raising, as the result below shows. rename returns a new
# frame; df itself keeps its original labels.
df.rename(
    columns={
        'HDI':'Human Development Index',
        'Anual Popcorn Consumption':'APC'
    },
    index={
        'United Kingdom':'UK',
        'United States':'USA',
        'Argentina':'AR'
    }
)

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,english
France,63.951,2833687,640679,0.888,Europe,english
Germany,80.94,3874437,357114,0.916,Europe,english
Italy,60.665,2167744,301336,0.873,Europe,english
Japan,127.061,4602367,377930,0.891,Asia,english
UK,64.511,2950039,242495,0.907,Europe,english
USA,318.523,17348075,9525067,0.915,America,english


In [92]:
df.rename(index=str.upper)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
CANADA,35.467,1785387,9984670,0.913,America,english
FRANCE,63.951,2833687,640679,0.888,Europe,english
GERMANY,80.94,3874437,357114,0.916,Europe,english
ITALY,60.665,2167744,301336,0.873,Europe,english
JAPAN,127.061,4602367,377930,0.891,Asia,english
UNITED KINGDOM,64.511,2950039,242495,0.907,Europe,english
UNITED STATES,318.523,17348075,9525067,0.915,America,english


In [93]:
# A callable is applied to every row label; here each label is lower-cased.
df.rename(index=str.lower)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
canada,35.467,1785387,9984670,0.913,America,english
france,63.951,2833687,640679,0.888,Europe,english
germany,80.94,3874437,357114,0.916,Europe,english
italy,60.665,2167744,301336,0.873,Europe,english
japan,127.061,4602367,377930,0.891,Asia,english
united kingdom,64.511,2950039,242495,0.907,Europe,english
united states,318.523,17348075,9525067,0.915,America,english


### Dropping columns

In [94]:
# Rebind instead of mutating in place, and tolerate a missing column so the
# cell can be re-run without raising KeyError once 'Language' is already gone.
df = df.drop(columns='Language', errors='ignore')

In [95]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


### Adding values

In [101]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported way to add rows. The new record is wrapped in a one-row
# frame labelled 'China'; columns it does not provide come out as NaN.
# Like append, this returns a new frame and leaves df unchanged.
pd.concat([
    df,
    pd.DataFrame(
        [{'Population': 3, 'GDP': 5}],
        index=['China'],
    ),
])

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387.0,9984670.0,0.913,America
France,63.951,2833687.0,640679.0,0.888,Europe
Germany,80.94,3874437.0,357114.0,0.916,Europe
Italy,60.665,2167744.0,301336.0,0.873,Europe
Japan,127.061,4602367.0,377930.0,0.891,Asia
United Kingdom,64.511,2950039.0,242495.0,0.907,Europe
United States,318.523,17348075.0,9525067.0,0.915,America
China,3.0,5.0,,,


In [103]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387.0,9984670.0,0.913,America
France,63.951,2833687.0,640679.0,0.888,Europe
Germany,80.94,3874437.0,357114.0,0.916,Europe
Italy,60.665,2167744.0,301336.0,0.873,Europe
Japan,127.061,4602367.0,377930.0,0.891,Asia
United Kingdom,64.511,2950039.0,242495.0,0.907,Europe
United States,318.523,17348075.0,9525067.0,0.915,America


In [104]:
# Assigning to a label that does not exist yet adds the row to df in place;
# fields missing from the Series come out empty (NaN) in the frame shown below.
# NOTE(review): 'Population' holds values in millions for the other rows, so
# 2000000000 is not on the same scale and dominates the later statistics
# (mean, std, max) -- confirm the intended value.
df.loc['China'] = pd.Series({'Population':2000000000, 'Continent':'Asia'})

In [105]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387.0,9984670.0,0.913,America
France,63.951,2833687.0,640679.0,0.888,Europe
Germany,80.94,3874437.0,357114.0,0.916,Europe
Italy,60.665,2167744.0,301336.0,0.873,Europe
Japan,127.061,4602367.0,377930.0,0.891,Asia
United Kingdom,64.511,2950039.0,242495.0,0.907,Europe
United States,318.523,17348075.0,9525067.0,0.915,America
China,2000000000.0,,,,Asia


### Radical index changes

In [106]:
df.reset_index()

Unnamed: 0,index,Population,GDP,Surface Area,HDI,Continent
0,Canada,35.467,1785387.0,9984670.0,0.913,America
1,France,63.951,2833687.0,640679.0,0.888,Europe
2,Germany,80.94,3874437.0,357114.0,0.916,Europe
3,Italy,60.665,2167744.0,301336.0,0.873,Europe
4,Japan,127.061,4602367.0,377930.0,0.891,Asia
5,United Kingdom,64.511,2950039.0,242495.0,0.907,Europe
6,United States,318.523,17348075.0,9525067.0,0.915,America
7,China,2000000000.0,,,,Asia


In [107]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387.0,9984670.0,0.913,America
France,63.951,2833687.0,640679.0,0.888,Europe
Germany,80.94,3874437.0,357114.0,0.916,Europe
Italy,60.665,2167744.0,301336.0,0.873,Europe
Japan,127.061,4602367.0,377930.0,0.891,Asia
United Kingdom,64.511,2950039.0,242495.0,0.907,Europe
United States,318.523,17348075.0,9525067.0,0.915,America
China,2000000000.0,,,,Asia


In [108]:
df.set_index('Population')

Unnamed: 0_level_0,GDP,Surface Area,HDI,Continent
Population,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
35.467,1785387.0,9984670.0,0.913,America
63.951,2833687.0,640679.0,0.888,Europe
80.94,3874437.0,357114.0,0.916,Europe
60.665,2167744.0,301336.0,0.873,Europe
127.061,4602367.0,377930.0,0.891,Asia
64.511,2950039.0,242495.0,0.907,Europe
318.523,17348075.0,9525067.0,0.915,America
2000000000.0,,,,Asia


In [109]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387.0,9984670.0,0.913,America
France,63.951,2833687.0,640679.0,0.888,Europe
Germany,80.94,3874437.0,357114.0,0.916,Europe
Italy,60.665,2167744.0,301336.0,0.873,Europe
Japan,127.061,4602367.0,377930.0,0.891,Asia
United Kingdom,64.511,2950039.0,242495.0,0.907,Europe
United States,318.523,17348075.0,9525067.0,0.915,America
China,2000000000.0,,,,Asia


### Creating columns from other columns

In [110]:
df[['Population','GDP']]

Unnamed: 0,Population,GDP
Canada,35.467,1785387.0
France,63.951,2833687.0
Germany,80.94,3874437.0
Italy,60.665,2167744.0
Japan,127.061,4602367.0
United Kingdom,64.511,2950039.0
United States,318.523,17348075.0
China,2000000000.0,


In [111]:
df['GDP'] / df['Population']

Canada            50339.385908
France            44310.284437
Germany           47868.013343
Italy             35733.025633
Japan             36221.712406
United Kingdom    45729.239975
United States     54464.120330
China                      NaN
dtype: float64

In [112]:
# Element-wise division aligned on the row labels; rows with a missing GDP
# yield NaN in the new column.
df['GDP per capita'] = df['GDP'].div(df['Population'])

In [113]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,GDP per capita
Canada,35.467,1785387.0,9984670.0,0.913,America,50339.385908
France,63.951,2833687.0,640679.0,0.888,Europe,44310.284437
Germany,80.94,3874437.0,357114.0,0.916,Europe,47868.013343
Italy,60.665,2167744.0,301336.0,0.873,Europe,35733.025633
Japan,127.061,4602367.0,377930.0,0.891,Asia,36221.712406
United Kingdom,64.511,2950039.0,242495.0,0.907,Europe,45729.239975
United States,318.523,17348075.0,9525067.0,0.915,America,54464.12033
China,2000000000.0,,,,Asia,


### Statistical info

In [114]:
df.head()

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,GDP per capita
Canada,35.467,1785387.0,9984670.0,0.913,America,50339.385908
France,63.951,2833687.0,640679.0,0.888,Europe,44310.284437
Germany,80.94,3874437.0,357114.0,0.916,Europe,47868.013343
Italy,60.665,2167744.0,301336.0,0.873,Europe,35733.025633
Japan,127.061,4602367.0,377930.0,0.891,Asia,36221.712406


In [115]:
df.describe()

Unnamed: 0,Population,GDP,Surface Area,HDI,GDP per capita
count,8.0,7.0,7.0,7.0,7.0
mean,250000100.0,5080248.0,3061327.0,0.900429,44952.254576
std,707106700.0,5494020.0,4576187.0,0.016592,6954.983875
min,35.467,1785387.0,242495.0,0.873,35733.025633
25%,63.1295,2500716.0,329225.0,0.8895,40265.998421
50%,72.7255,2950039.0,377930.0,0.907,45729.239975
75%,174.9265,4238402.0,5082873.0,0.914,49103.699626
max,2000000000.0,17348080.0,9984670.0,0.916,54464.12033


In [116]:
population = df['Population']

In [117]:
population

Canada            3.546700e+01
France            6.395100e+01
Germany           8.094000e+01
Italy             6.066500e+01
Japan             1.270610e+02
United Kingdom    6.451100e+01
United States     3.185230e+02
China             2.000000e+09
Name: Population, dtype: float64

In [118]:
population.min()

35.467

In [119]:
population.max()

2000000000.0

In [120]:
population.sum()

2000000751.118

In [121]:
population.std()

707106743.2493652

In [122]:
population.median()

72.7255

In [123]:
population.quantile(0.25)

63.1295