In [1]:
# Importing dependencies
import pandas as pd

In [2]:
# Read the raw per-country annual CO2 emissions CSV and preview its first rows
co2 = '../resources/annual-co2-emissions-per-country.csv'
co2_df = pd.read_csv(filepath_or_buffer=co2)
co2_df.head(n=5)

Unnamed: 0,Entity,Code,Year,Annual CO2 emissions
0,Afghanistan,AFG,1949,14656
1,Afghanistan,AFG,1950,84272
2,Afghanistan,AFG,1951,91600
3,Afghanistan,AFG,1952,91600
4,Afghanistan,AFG,1953,106256


In [3]:
# Count the missing values in each column of the freshly loaded frame
co2_df.isnull().sum(axis=0)

Entity                     0
Code                    3371
Year                       0
Annual CO2 emissions       0
dtype: int64

In [4]:
# The project only covers 1970 through 2020 (both endpoints included), so keep
# just the rows whose Year falls inside that window.
co2_df = co2_df.loc[co2_df['Year'].between(1970, 2020)]

In [5]:
# Re-count missing values per column now that only the 1970-2020 rows remain
co2_df.isnull().sum(axis=0)

Entity                    0
Code                    892
Year                      0
Annual CO2 emissions      0
dtype: int64

In [6]:
# Data frame information for the year-filtered frame: index range, per-column
# non-null counts and dtypes, and memory usage
co2_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11772 entries, 21 to 24669
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Entity                11772 non-null  object
 1   Code                  10880 non-null  object
 2   Year                  11772 non-null  int64 
 3   Annual CO2 emissions  11772 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 459.8+ KB


In [7]:
# Collect the distinct entity names that have no Code, in order of first appearance.
# These turn out to be regions, income groups and other non-country entries,
# which this project does not need.
missing_code = co2_df.loc[co2_df['Code'].isnull()]
list_missing_code = missing_code['Entity'].drop_duplicates().tolist()
list_missing_code

['Africa',
 'Asia',
 'Asia (excl. China & India)',
 'Europe',
 'Europe (excl. EU-27)',
 'Europe (excl. EU-28)',
 'European Union (27)',
 'European Union (28)',
 'High-income countries',
 'International transport',
 'Kuwaiti Oil Fires',
 'Low-income countries',
 'Lower-middle-income countries',
 'North America',
 'North America (excl. USA)',
 'Oceania',
 'Panama Canal Zone',
 'Ryukyu Islands',
 'South America',
 'St. Kitts-Nevis-Anguilla',
 'Upper-middle-income countries']

In [8]:
# Keep only the rows that carry a Code; rows without one are the non-country
# entities listed above.
# NOTE(review): any aggregate that does have a Code would survive this filter --
# verify against the source data.
co2_df_c = co2_df[co2_df['Code'].notna()]

In [9]:
# Confirm no nulls remain after dropping the code-less rows, then preview the result.
# A notebook cell renders only its final expression, so the null counts have to be
# displayed explicitly; as a bare statement above head() they were silently discarded.
display(co2_df_c.isna().sum())
co2_df_c.head()

Unnamed: 0,Entity,Code,Year,Annual CO2 emissions
21,Afghanistan,AFG,1970,1670397
22,Afghanistan,AFG,1971,1893554
23,Afghanistan,AFG,1972,1530347
24,Afghanistan,AFG,1973,1635454
25,Afghanistan,AFG,1974,1913152


In [10]:
# Write the cleaned frame out to a new CSV file.
# NOTE(review): to_csv writes the row index by default, so the file gains an
# unnamed leading column holding the original row labels. Pass index=False if no
# downstream reader relies on that column -- confirm before changing.
co2_df_c.to_csv(path_or_buf='../csv/clean_co2_emissions.csv')