# Prediction of CO2 Emissions in Rwanda

## Project from Kaggle

https://www.kaggle.com/competitions/playground-series-s3e20/overview

The ability to accurately monitor carbon emissions is a critical step in the fight against climate change. Precise carbon readings allow researchers and governments to understand the sources and patterns of carbon mass output. While Europe and North America have extensive systems in place to monitor carbon emissions on the ground, there are few available in Africa.

The objective of this challenge is to create a machine learning models using open-source CO2 emissions data from Sentinel-5P satellite observations to predict future carbon emissions.

These solutions may help enable governments, and other actors to estimate carbon emission levels across Africa, even in places where on-the-ground monitoring is not possible.

### Importing all libraries

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import *
from sklearn.preprocessing import *
from sklearn.model_selection import *
from sklearn.linear_model import *
from sklearn.ensemble import *
from sklearn.naive_bayes import *
from sklearn.metrics import *
from sklearn.discriminant_analysis import *
from sklearn.feature_selection import *
import warnings
warnings.filterwarnings("ignore")
from sklearn.svm import *

### Importing and preprocessing training data 

In [16]:
df_train = pd.read_csv("train.csv")
df_train_na = df_train.isna().sum()
for i in df_train:
    if df_train_na[i] > 0:
        avg = df_train[i].median()
        df_train[i] = df_train[i].fillna(avg)

col = list(df_train.columns.values[1:3])
col += list(df_train.columns.values[5:75])

df_train_pre = df_train.copy()

ss = StandardScaler()
scale = ss.fit(df_train[col])

g = pd.DataFrame(scale.transform(df_train[col]),columns=col)
df_train_pre[col] = g

### Importing and preprocessing testing data 

In [17]:
df_test = pd.read_csv("test.csv")
df_test_na = df_test.isna().sum()
for i in df_test:
    if df_test_na[i] > 0:
        avg = df_test[i].median()
        df_test[i] = df_test[i].fillna(avg)

col = list(df_test.columns.values[1:3])
col += list(df_test.columns.values[5:75])

df_test_pre = df_test.copy()

ss = StandardScaler()
scale = ss.fit(df_test[col])

g = pd.DataFrame(scale.transform(df_test[col]),columns=col)
df_test_pre[col] = g

### Preprocessed training data

In [18]:
df_train_pre.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
0,ID_-0.510_29.290_2019_00,1.988534,-0.728253,2019,0,-0.617429,-1.354297,-0.52414,1.499215,-1.547623,...,-1.353704,0.184638,-1.516525,-0.262503,0.01672,-0.060518,-0.750348,-1.378504,0.643142,3.750994
1,ID_-0.510_29.290_2019_01,1.988534,-0.728253,2019,1,-0.093107,-0.607616,-0.099088,-0.435488,0.436785,...,-1.363005,0.836687,-1.10367,-0.771638,-0.297013,1.359108,-0.137864,-1.548103,-0.154131,4.025176
2,ID_-0.510_29.290_2019_02,1.988534,-0.728253,2019,2,1.915273,-0.488422,1.8989,-0.76089,1.405035,...,-0.965708,0.071939,-0.8515,0.147214,-0.413264,0.864367,-1.566521,-1.47747,-0.395221,4.231381
3,ID_-0.510_29.290_2019_03,1.988534,-0.728253,2019,3,-0.08022,-0.125087,-0.072069,0.043473,-0.063415,...,-0.255427,-0.925955,-0.360982,-0.275948,-0.191248,-0.016413,-2.506101,-1.216246,0.205146,4.305286
4,ID_-0.510_29.290_2019_04,1.988534,-0.728253,2019,4,-0.497103,-0.917262,-0.428799,-0.587935,0.221928,...,-1.131698,0.480018,-0.969997,-0.81427,-0.722791,1.629005,-0.475633,-1.450702,-1.303718,4.347317


### Preprocessed testing data

In [19]:
df_test_pre.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_pressure,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle
0,ID_-0.510_29.290_2022_00,1.988534,-0.728253,2022,0,-0.025672,-0.150017,-0.024796,-0.002374,0.097206,...,-1.926239,1.899041,-1.961927,1.910157,-0.838383,-0.328377,-2.70031,-1.0192,-1.425006,1.307123
1,ID_-0.510_29.290_2022_01,1.988534,-0.728253,2022,1,2.195286,-0.644598,2.095942,-2.015056,1.118246,...,-0.444017,0.453279,-0.439019,0.441641,-0.551341,0.798469,-0.468734,0.369579,-1.582048,0.682613
2,ID_-0.510_29.290_2022_02,1.988534,-0.728253,2022,2,0.735284,-1.234576,0.667438,-0.858553,-0.915789,...,-2.149909,2.270216,-2.186155,2.287173,-0.608153,0.239004,1.762656,0.933333,-1.755028,-0.28083
3,ID_-0.510_29.290_2022_03,1.988534,-0.728253,2022,3,1.671615,-0.605161,1.60244,0.895806,1.04827,...,-0.169868,0.119084,-0.139147,0.102187,-0.52551,1.047326,-0.274918,0.327755,-1.480688,0.364941
4,ID_-0.510_29.290_2022_04,1.988534,-0.728253,2022,4,-1.625046,-1.403085,-1.306322,0.943939,1.117415,...,-0.674331,0.723528,-0.660756,0.716143,-0.419222,0.606925,0.095623,-1.589418,-1.493965,-0.386123


### Calculation of p values and eliminating non-significant features

In [20]:
X = df_train_pre.drop(["emission","ID_LAT_LON_YEAR_WEEK"],axis=1,inplace=False)
y = df_train_pre["emission"]
f,p = f_regression(X,y)
p_values = pd.DataFrame(p,index=X.columns.values)
p_values = p_values.round(2)
p_values = p_values.reset_index()
no_p = list(p_values[p_values[0]>0.05]["index"])
no_p1 = no_p.copy()
p_values[p_values[0]<=0.05].head()

Unnamed: 0,index,0
0,latitude,0.0
1,longitude,0.0
2,year,0.0
3,week_no,0.05
4,SulphurDioxide_SO2_column_number_density,0.0


### Creation of training and testing data

In [21]:
X_train = X.drop(no_p,axis=1,inplace=False)
y_train = y.copy()
no_p1.insert(0,"ID_LAT_LON_YEAR_WEEK")
X_test = df_test_pre.drop(no_p1,axis=1,inplace=False)

### Performance of Regression Analysis using Lasso Regression

In [22]:
la = Lasso()
model = la.fit(X_train,y_train)
y_test = model.predict(X_test)
y_test = np.round(y_test,2)
y_test

array([ 58.22,  65.62,  50.57, ...,  93.11, 101.51,  85.46])

### Final submission dataset

In [23]:
final_df = pd.DataFrame(columns=["ID_LAT_LON_YEAR_WEEK","emission"])
final_df["ID_LAT_LON_YEAR_WEEK"] = df_test["ID_LAT_LON_YEAR_WEEK"]
final_df["emission"] = y_test

In [24]:
final_df.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,emission
0,ID_-0.510_29.290_2022_00,58.22
1,ID_-0.510_29.290_2022_01,65.62
2,ID_-0.510_29.290_2022_02,50.57
3,ID_-0.510_29.290_2022_03,58.82
4,ID_-0.510_29.290_2022_04,77.6


### Exporting of final dataset to CSV file

In [25]:
final_df.to_csv("amith_submission_1.csv",index=False)