In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

A pandas Series holding the populations of the Group of Seven (G7) countries.

In [60]:
# A Series is an ordered, one-dimensional collection of values; the name given
# here is shown as a label whenever the Series is displayed.
g7_population = pd.Series(
    [35.467, 63.951, 80.940, 60.665, 127.061, 64.511, 318.523],
    name='G7 Population in millions',
)
# The underlying values are stored in a NumPy array, as the type below shows.
type(g7_population.values)

numpy.ndarray

In [61]:
# Replace the Series index with country names; the labels are attached to the
# existing values in the order given.
g7_population.index = [
    'Canada',
    'France',
    'Germany',
    'Italy',
    'Japan',
    'United Kingdom',
    'United States',
]
g7_population

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

In [62]:
# A Series behaves much like an ordered dictionary and can be built directly from one:
# the keys become the index labels and the values become the data.
pd.Series({
    'Canada': 35.467,
    'France': 63.951,
    'Germany': 80.94,
    'Italy': 60.665,
    'Japan': 127.061,
    'United Kingdom': 64.511,
    'United States': 318.523
}, name='G7 Population in millions')

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

In [63]:
# Integer positions still work after the index has been relabeled, but they go through .iloc.
g7_population.iloc[0]

35.467

In [64]:
# Label-based lookup: returns the same value as .iloc[0], selected by its index label.
g7_population['Canada']

35.467

In [65]:
# Negative positions count from the end, so -1 is the last element.
g7_population.iloc[-1]

318.523

In [66]:
# Same element as .iloc[-1], selected by its label instead of its position.
g7_population['United States']

318.523

In [67]:
g7_population['Canada': 'Italy']
# Label slices read like Python ranges, except that the end label ('Italy') is included in the result.

Canada     35.467
France     63.951
Germany    80.940
Italy      60.665
Name: G7 Population in millions, dtype: float64

In [68]:
g7_population > 70
# Comparisons are applied element-wise and produce a boolean Series, just like with NumPy arrays.

Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: G7 Population in millions, dtype: bool

In [69]:
g7_population[g7_population > 70]
# Using the boolean Series as a mask keeps only the entries where the condition is True.

Germany           80.940
Japan            127.061
United States    318.523
Name: G7 Population in millions, dtype: float64

In [70]:
# The threshold can itself be computed from the Series, here its mean.
g7_population[g7_population > g7_population.mean()]

Japan            127.061
United States    318.523
Name: G7 Population in millions, dtype: float64

A Series supports the vectorized operations provided by NumPy, because its values are stored in a NumPy array.

In [71]:
# Element-wise arithmetic: scales every value from millions to individual people.
# The result is a new Series; g7_population itself is not changed.
g7_population * 1_000_000

Canada             35467000.0
France             63951000.0
Germany            80940000.0
Italy              60665000.0
Japan             127061000.0
United Kingdom     64511000.0
United States     318523000.0
Name: G7 Population in millions, dtype: float64

In [72]:
# Assignment through .iloc modifies the Series in place: the last element (United States) becomes 500.
# Later cells that use g7_population see this changed value.
g7_population.iloc[-1] = 500
g7_population

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     500.000
Name: G7 Population in millions, dtype: float64

In [73]:
# A boolean mask can also be an assignment target: every value below 70 is overwritten with 99.99, in place.
g7_population[g7_population < 70] = 99.99
g7_population

Canada             99.990
France             99.990
Germany            80.940
Italy              99.990
Japan             127.061
United Kingdom     99.990
United States     500.000
Name: G7 Population in millions, dtype: float64

## Dataframes:  

You can create a DataFrame by specifying the columns, rows, and values by hand, but this quickly becomes tedious.  
In practice, a DataFrame is far more likely to be created from a CSV file or a database through pandas.

In [74]:
# Build the frame from a dict that maps each column name to its list of values
# (one entry per country, in the same order for every column).
population_df = pd.DataFrame(
    {
        'Population': [35.467, 63.951, 80.94, 60.665, 127.061, 64.511, 318.523],
        'GDP': [1785387, 2833687, 3874437, 2167744, 4602367, 2950039, 17348075],
        'Surface Area': [9984670, 640679, 357114, 301336, 377930, 242495, 9525067],
        'HDI': [0.913, 0.888, 0.916, 0.873, 0.891, 0.907, 0.915],
        'Continent': ['America', 'Europe', 'Europe', 'Europe', 'Asia', 'Europe', 'America'],
    },
    # Passing columns is optional, but it states the column order explicitly.
    columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'],
)

In [75]:
population_df.index = [
    'Canada',
    'France',
    'Germany',
    'Italy',
    'Japan',
    'United Kingdom',
    'United States',
]
# The row index does not have to be numeric: country names are used here so rows can be selected by label.

In [76]:
# Display the full frame, now indexed by country name.
population_df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [77]:
population_df.info()
# Prints a summary of the frame: the index range, each column's non-null count and dtype, and memory usage.
# The non-null counts make it easy to spot columns with missing values.

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to United States
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 336.0+ bytes


In [78]:
population_df.describe()
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column;
# the non-numeric 'Continent' column is left out of the result.

Unnamed: 0,Population,GDP,Surface Area,HDI
count,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429
std,97.24997,5494020.0,4576187.0,0.016592
min,35.467,1785387.0,242495.0,0.873
25%,62.308,2500716.0,329225.0,0.8895
50%,64.511,2950039.0,377930.0,0.907
75%,104.0005,4238402.0,5082873.0,0.914
max,318.523,17348080.0,9984670.0,0.916


In [79]:
# The dtype of each column; the text column 'Continent' is reported as object.
population_df.dtypes

Population      float64
GDP               int64
Surface Area      int64
HDI             float64
Continent        object
dtype: object

In [80]:
population_df.loc['Canada']
# .loc selects a row by its index label and returns it as a Series; use .iloc to select by integer position instead.

Population       35.467
GDP             1785387
Surface Area    9984670
HDI               0.913
Continent       America
Name: Canada, dtype: object

In [81]:
population_df['Population'].to_frame()
# Selecting a single column returns a Series; .to_frame() converts it to a one-column DataFrame so it is displayed as a table.

Unnamed: 0,Population
Canada,35.467
France,63.951
Germany,80.94
Italy,60.665
Japan,127.061
United Kingdom,64.511
United States,318.523


In [82]:
# Passing a list of column names selects several columns at once and returns a DataFrame.
population_df[['Population', 'GDP']]

Unnamed: 0,Population,GDP
Canada,35.467,1785387
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744
Japan,127.061,4602367
United Kingdom,64.511,2950039
United States,318.523,17348075


In [83]:
population_df[1:4]
# Plain [] slicing with integers selects rows by position and, as in Python, excludes the upper limit
# (positions 1-3 here). This is easy to confuse with [] column selection.
# Prefer .loc for row selection by label, especially with unique indexes: its slices include the end label.
# Use .iloc when you need integer positions; like plain Python slicing, .iloc excludes the upper limit.

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe


In [84]:
# Label slice with .loc: both 'France' and 'Italy' are included, giving the same rows as the positional slice [1:4].
population_df.loc['France': 'Italy']


Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe


In [85]:
# .loc accepts a row selector and a column selector: rows 'France' through 'Italy', columns 'Population' and 'GDP' only.
population_df.loc['France': 'Italy', ['Population', 'GDP']]

Unnamed: 0,Population,GDP
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744


In [86]:
population_df.loc[population_df['Population'] > 70]
# A boolean Series works as a row selector for a DataFrame, just as it does for a Series or a NumPy array.

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United States,318.523,17348075,9525067,0.915,America


In [87]:
# The boolean row mask can be combined with a list of columns to keep.
population_df.loc[population_df['Population'] > 70, ['Population', 'GDP']]


Unnamed: 0,Population,GDP
Germany,80.94,3874437
Japan,127.061,4602367
United States,318.523,17348075


In [88]:
population_df.drop('Canada')
# drop excludes rows by label, instead of listing the rows to keep as in the selections above.
# It returns a new DataFrame; population_df itself is not modified unless the result is assigned back.

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [89]:
# With columns=..., drop removes the named columns instead of rows; population_df itself is again left unchanged.
population_df.drop(columns=['Population', 'HDI'])

Unnamed: 0,GDP,Surface Area,Continent
Canada,1785387,9984670,America
France,2833687,640679,Europe
Germany,3874437,357114,Europe
Italy,2167744,301336,Europe
Japan,4602367,377930,Asia
United Kingdom,2950039,242495,Europe
United States,17348075,9525067,America


In [90]:
# Per-column adjustments, keyed by the DataFrame column each one applies to.
# When this Series is combined with a DataFrame, its labels are matched against
# the column names and each value is applied to every row of that column.
population_crisis = pd.Series({'GDP': -1_000_000, 'HDI': -0.3})

In [91]:
# The two columns before the adjustment is applied, for comparison with the next cell.
population_df[['GDP', 'HDI']]

Unnamed: 0,GDP,HDI
Canada,1785387,0.913
France,2833687,0.888
Germany,3874437,0.916
Italy,2167744,0.873
Japan,4602367,0.891
United Kingdom,2950039,0.907
United States,17348075,0.915


In [92]:
# Each value of population_crisis is added to every row of the column with the matching label.
# The result is a new DataFrame (GDP is shown as float); population_df is not modified.
population_df[['GDP', 'HDI']] + population_crisis

Unnamed: 0,GDP,HDI
Canada,785387.0,0.613
France,1833687.0,0.588
Germany,2874437.0,0.616
Italy,1167744.0,0.573
Japan,3602367.0,0.591
United Kingdom,1950039.0,0.607
United States,16348075.0,0.615


In [93]:
# A Series to add to the DataFrame as a new column. It is keyed by country and
# deliberately covers only some of the countries in the frame.
langs = pd.Series(
    {
        'France': 'French',
        'Germany': 'German',
        'Italy': 'Italian',
        'United States': 'English',
    },
    name='Language',
)

In [94]:
# Assigning a Series to a new column modifies population_df in place and aligns the values on the index labels.
population_df['Language'] = langs
population_df
# Countries with no entry in langs (Canada, Japan, United Kingdom) get NaN in the new column.

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,German
Italy,60.665,2167744,301336,0.873,Europe,Italian
Japan,127.061,4602367,377930,0.891,Asia,
United Kingdom,64.511,2950039,242495,0.907,Europe,
United States,318.523,17348075,9525067,0.915,America,English


In [95]:
population_df.rename(
    columns={
        'HDI': 'Human Development Index'
    }, index={
        'United States': 'USA',
        'United Kingdom': 'UK'
    })
# rename maps old labels to new ones for columns and/or the index; keys that match no existing label change nothing.
# It returns a renamed copy: population_df keeps its original labels unless the result is assigned back to it.

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,German
Italy,60.665,2167744,301336,0.873,Europe,Italian
Japan,127.061,4602367,377930,0.891,Asia,
UK,64.511,2950039,242495,0.907,Europe,
USA,318.523,17348075,9525067,0.915,America,English


In [96]:
# Confirms that rename did not modify population_df: 'HDI', 'United Kingdom' and 'United States' are still the labels.
population_df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,German
Italy,60.665,2167744,301336,0.873,Europe,Italian
Japan,127.061,4602367,377930,0.891,Asia,
United Kingdom,64.511,2950039,242495,0.907,Europe,
United States,318.523,17348075,9525067,0.915,America,English


In [97]:
population_df.loc['China'] = pd.Series({'Population': 1_400_000_000, 'Continent': 'Asia', 'GDP': 5})
# Assigning a Series to a new .loc label adds the row to population_df in place; columns missing from the Series become NaN.
# DataFrame.append instead returned a new frame that had to be assigned back; it was removed in pandas 2.0, so use pd.concat for that.
# NOTE(review): the other rows give Population in millions (e.g. 35.467 for Canada), so 1_400_000_000 is off by a
# factor of 10**6 if it is meant as a head count, and GDP=5 looks like a placeholder — TODO confirm the intended values.

In [98]:
# The new 'China' row is present. 'Surface Area' is now displayed as float because the China row has no value (NaN) there.
population_df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670.0,0.913,America,
France,63.951,2833687,640679.0,0.888,Europe,French
Germany,80.94,3874437,357114.0,0.916,Europe,German
Italy,60.665,2167744,301336.0,0.873,Europe,Italian
Japan,127.061,4602367,377930.0,0.891,Asia,
United Kingdom,64.511,2950039,242495.0,0.907,Europe,
United States,318.523,17348075,9525067.0,0.915,America,English
China,1400000000.0,5,,,Asia,


In [100]:
population_df['GDP Per Capita'] = population_df['GDP'] / population_df['Population']
# A new column can be computed element-wise from existing columns; the assignment adds it to population_df in place.
# NOTE(review): China's result (3.571429e-09) is an artifact of the Population and GDP values entered for that row.
population_df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP Per Capita
Canada,35.467,1785387,9984670.0,0.913,America,,50339.39
France,63.951,2833687,640679.0,0.888,Europe,French,44310.28
Germany,80.94,3874437,357114.0,0.916,Europe,German,47868.01
Italy,60.665,2167744,301336.0,0.873,Europe,Italian,35733.03
Japan,127.061,4602367,377930.0,0.891,Asia,,36221.71
United Kingdom,64.511,2950039,242495.0,0.907,Europe,,45729.24
United States,318.523,17348075,9525067.0,0.915,America,English,54464.12
China,1400000000.0,5,,,Asia,,3.571429e-09


## Reading External Data: