In [20]:
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans

In [26]:
# Load the emissions CSV into `df`, report its row count, and preview it
csv_path = 'CO2_emissions_1990_2018.csv'
df = pd.read_csv(csv_path)
print(f"Total samples: {len(df)}")
df.head()

Total samples: 195


Unnamed: 0,Country,Data source,Sector,Gas,Unit,2018,2017,2016,2015,2014,...,1999,1998,1997,1996,1995,1994,1993,1992,1991,1990
0,World,CAIT,Total including LUCF,CO2,MtCO₂e,36441.55,35588.7,35160.6,34521.91,34558.59,...,24957.3,24895.32,25292.92,24214.92,23890.22,23260.29,23124.0,22988.29,23056.4,22849.92
1,China,CAIT,Total including LUCF,CO2,MtCO₂e,9663.36,9367.67,9164.21,9120.27,9184.77,...,2799.84,2882.75,2779.27,2715.5,2735.48,2414.5,2294.12,2068.77,1952.78,1823.96
2,United States,CAIT,Total including LUCF,CO2,MtCO₂e,4749.57,4581.9,4656.84,4563.52,4683.35,...,5191.66,5172.06,5129.29,4864.46,4708.31,4654.52,4581.76,4461.62,4389.5,4426.4
3,European Union (27),CAIT,Total including LUCF,CO2,MtCO₂e,2636.99,2692.12,2669.54,2321.61,2263.78,...,3079.46,3134.03,3142.23,3210.93,3113.68,3060.09,3072.4,3133.06,3247.48,3286.44
4,India,CAIT,Total including LUCF,CO2,MtCO₂e,2400.25,2267.16,2149.01,2085.38,2072.03,...,683.0,618.73,600.38,555.6,519.98,466.79,431.31,409.09,386.17,341.32


In [27]:
# Reshape the wide country-by-year table into a year-indexed frame with one
# column per country.
# NOTE: `df` is reassigned below, so this cell expects the frame exactly as
# loaded; re-run the load cell before re-running this one.

# Drop the columns we don't need and key the table by country
df = df.drop(columns=['Data source', 'Sector', 'Gas', 'Unit']).set_index('Country')

# Re-order the years from 1990-2018, then transpose so the years are the
# index instead of the countries
df = df.iloc[:, ::-1].T

# Convert the year labels to a DatetimeIndex named "Year". An explicit format
# replaces the `infer_datetime_format` argument (deprecated in pandas 2.0) and
# avoids relying on format guessing; each year maps to January 1st of that year.
df.index = pd.to_datetime(df.index, format='%Y')
df.index.name = 'Year'

# Transposing leaves "Country" as the name of the column axis; clear it
df = df.rename_axis(None, axis=1)

# Remove the 'European Union (27)' column
# NOTE(review): presumably dropped because it is an aggregate rather than a
# single country — confirm ('World' is kept).
df = df.drop(columns=['European Union (27)'])
df.head()

Unnamed: 0_level_0,World,China,United States,India,Indonesia,Japan,Russia,Brazil,Germany,Iran,...,Micronesia,Liechtenstein,Cook Islands,Kiribati,Bhutan,Nauru,Tuvalu,Niue,Fiji,Ghana
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-01,22849.92,1823.96,4426.4,341.32,832.39,1016.89,1790.34,1236.73,904.38,158.83,...,0.0,0.22,0.02,0.02,-6.48,0.13,0.0,0.01,-1.81,12.67
1991-01-01,23056.4,1952.78,4389.5,386.17,846.23,1029.99,1766.89,1243.65,881.44,197.26,...,-0.03,0.23,0.02,0.02,-6.44,0.13,0.0,0.01,-1.81,12.44
1992-01-01,22988.29,2068.77,4461.62,409.09,855.3,1041.79,1630.98,1247.13,841.83,212.99,...,0.08,0.23,0.02,0.02,-6.41,0.12,0.01,0.01,-1.78,12.96
1993-01-01,23124.0,2294.12,4581.76,431.31,869.88,1033.99,1447.33,1253.64,836.08,210.86,...,0.08,0.24,0.02,0.03,-6.44,0.11,0.01,0.01,-1.79,13.07
1994-01-01,23260.29,2414.5,4654.52,466.79,881.31,1087.22,1238.41,1261.19,824.87,240.77,...,0.08,0.22,0.02,0.03,-6.42,0.11,0.01,0.01,-1.79,13.34


In [28]:
# Count missing values per column in one pass, then report each column
null_counts = df.isnull().sum()
for column, n_null in null_counts.items():
    print(f"Column {column} has {n_null} null values")

Column World has 0 null values
Column China has 0 null values
Column United States has 0 null values
Column India has 0 null values
Column Indonesia has 0 null values
Column Japan has 0 null values
Column Russia has 0 null values
Column Brazil has 0 null values
Column Germany has 0 null values
Column Iran has 0 null values
Column Canada has 0 null values
Column Democratic Republic of the Congo has 0 null values
Column South Korea has 0 null values
Column Saudi Arabia has 0 null values
Column Mexico has 0 null values
Column South Africa has 0 null values
Column Australia has 0 null values
Column Turkey has 0 null values
Column United Kingdom has 0 null values
Column Malaysia has 0 null values
Column Italy has 0 null values
Column Poland has 0 null values
Column Thailand has 0 null values
Column France has 0 null values
Column Egypt has 0 null values
Column Vietnam has 0 null values
Column Spain has 0 null values
Column Kazakhstan has 0 null values
Column Pakistan has 0 null values
Colum

In [29]:
# Remove the Namibia column because it has a NaN value for 1990
df = df.drop(columns=['Namibia'])
df.head()

Unnamed: 0_level_0,World,China,United States,India,Indonesia,Japan,Russia,Brazil,Germany,Iran,...,Micronesia,Liechtenstein,Cook Islands,Kiribati,Bhutan,Nauru,Tuvalu,Niue,Fiji,Ghana
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-01,22849.92,1823.96,4426.4,341.32,832.39,1016.89,1790.34,1236.73,904.38,158.83,...,0.0,0.22,0.02,0.02,-6.48,0.13,0.0,0.01,-1.81,12.67
1991-01-01,23056.4,1952.78,4389.5,386.17,846.23,1029.99,1766.89,1243.65,881.44,197.26,...,-0.03,0.23,0.02,0.02,-6.44,0.13,0.0,0.01,-1.81,12.44
1992-01-01,22988.29,2068.77,4461.62,409.09,855.3,1041.79,1630.98,1247.13,841.83,212.99,...,0.08,0.23,0.02,0.02,-6.41,0.12,0.01,0.01,-1.78,12.96
1993-01-01,23124.0,2294.12,4581.76,431.31,869.88,1033.99,1447.33,1253.64,836.08,210.86,...,0.08,0.24,0.02,0.03,-6.44,0.11,0.01,0.01,-1.79,13.07
1994-01-01,23260.29,2414.5,4654.52,466.79,881.31,1087.22,1238.41,1261.19,824.87,240.77,...,0.08,0.22,0.02,0.03,-6.42,0.11,0.01,0.01,-1.79,13.34


In [31]:
# Count rows that exactly repeat an earlier row
duplicate_count = df.duplicated().sum()
print(f"Duplicate entries: {duplicate_count}")

Duplicate entries: 0


In [16]:
# NOTE(review): `co2_with_index` is not defined in any cell visible above, and
# this cell's execution count (16) is lower than the import cell's (20), so it
# looks like leftover state from an earlier run. TODO: define it explicitly
# before this cell or remove the cell so Restart & Run All succeeds.
co2_with_index.head()

Unnamed: 0_level_0,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,...,1999,1998,1997,1996,1995,1994,1993,1992,1991,1990
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
China,9663.36,9367.67,9164.21,9120.27,9184.77,9226.51,8823.05,8568.09,8138.34,7382.89,...,2799.84,2882.75,2779.27,2715.5,2735.48,2414.5,2294.12,2068.77,1952.78,1823.96
United States,4749.57,4581.9,4656.84,4563.52,4683.35,4670.34,4531.16,4753.12,4990.96,4757.65,...,5191.66,5172.06,5129.29,4864.46,4708.31,4654.52,4581.76,4461.62,4389.5,4426.4
India,2400.25,2267.16,2149.01,2085.38,2072.03,1901.98,1843.74,1695.97,1670.29,1573.51,...,683.0,618.73,600.38,555.6,519.98,466.79,431.31,409.09,386.17,341.32
Indonesia,1269.55,1081.11,1068.1,1574.76,1539.23,1250.27,1303.87,1285.69,773.92,1036.71,...,849.42,907.96,1484.56,772.38,909.21,881.31,869.88,855.3,846.23,832.39
Japan,1074.08,1119.86,1139.33,1131.74,1170.11,1211.27,1202.66,1162.67,1057.96,1003.19,...,1090.14,1055.5,1098.26,1110.51,1097.1,1087.22,1033.99,1041.79,1029.99,1016.89


In [32]:
# Tally NA values per column and report each one
# (`isna` is the same check as `isnull`, so this re-verifies the frame)
na_counts = df.isna().sum()
for column, n_na in na_counts.items():
    print(f"Column {column} has {n_na} na values")

Column World has 0 na values
Column China has 0 na values
Column United States has 0 na values
Column India has 0 na values
Column Indonesia has 0 na values
Column Japan has 0 na values
Column Russia has 0 na values
Column Brazil has 0 na values
Column Germany has 0 na values
Column Iran has 0 na values
Column Canada has 0 na values
Column Democratic Republic of the Congo has 0 na values
Column South Korea has 0 na values
Column Saudi Arabia has 0 na values
Column Mexico has 0 na values
Column South Africa has 0 na values
Column Australia has 0 na values
Column Turkey has 0 na values
Column United Kingdom has 0 na values
Column Malaysia has 0 na values
Column Italy has 0 na values
Column Poland has 0 na values
Column Thailand has 0 na values
Column France has 0 na values
Column Egypt has 0 na values
Column Vietnam has 0 na values
Column Spain has 0 na values
Column Kazakhstan has 0 na values
Column Pakistan has 0 na values
Column Argentina has 0 na values
Column United Arab Emirates ha