In [5]:
# Importing libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

In [6]:
# Read the competition training set into a DataFrame and preview the first rows.
# NOTE(review): the path is absolute and machine-specific; the notebook will not
# run on another machine without editing it.
train_data = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Syed Asad\OneDrive\Desktop\Python Files\Data Analysis\Kaggle Competitions\Predicting CO2 Emissions for Rwanda\train.csv'
)
train_data.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
0,ID_-0.510_29.290_2019_00,-0.51,29.29,2019,0,-0.000108,0.603019,-6.5e-05,0.255668,-98.593887,...,3664.436218,61085.80957,2615.120483,15.568533,0.272292,-12.628986,35.632416,-138.786423,30.75214,3.750994
1,ID_-0.510_29.290_2019_01,-0.51,29.29,2019,1,2.1e-05,0.728214,1.4e-05,0.130988,16.592861,...,3651.190311,66969.478735,3174.572424,8.690601,0.25683,30.359375,39.557633,-145.18393,27.251779,4.025176
2,ID_-0.510_29.290_2019_02,-0.51,29.29,2019,2,0.000514,0.748199,0.000385,0.110018,72.795837,...,4216.986492,60068.894448,3516.282669,21.10341,0.251101,15.377883,30.401823,-142.519545,26.193296,4.231381
3,ID_-0.510_29.290_2019_03,-0.51,29.29,2019,3,,,,,,...,5228.507736,51064.547339,4180.973322,15.386899,0.262043,-11.293399,24.380357,-132.665828,28.829155,4.305286
4,ID_-0.510_29.290_2019_04,-0.51,29.29,2019,4,-7.9e-05,0.676296,-4.8e-05,0.121164,4.121269,...,3980.59812,63751.125781,3355.710107,8.114694,0.235847,38.532263,37.392979,-141.509805,22.204612,4.347317


The dataset contains 76 columns, including the target variable emission. Let's start by handling missing values and then training an initial Random Forest model to determine feature importance.

In [7]:
# Remove the row identifier; it is a string label and is not used as a predictor.
# errors='ignore' keeps this cell idempotent: because train_data is reassigned
# in place, re-running the cell after the column is already gone would
# otherwise raise a KeyError.
train_data = train_data.drop(columns=['ID_LAT_LON_YEAR_WEEK'], errors='ignore')

In [8]:
# Columns removed before modelling: the whole UvAerosolLayerHeight_* group.
# NOTE(review): presumably dropped because they are mostly missing; confirm.
columns_to_drop = [
    'UvAerosolLayerHeight_aerosol_height',
    'UvAerosolLayerHeight_aerosol_pressure',
    'UvAerosolLayerHeight_aerosol_optical_depth',
    'UvAerosolLayerHeight_sensor_zenith_angle',
    'UvAerosolLayerHeight_sensor_azimuth_angle',
    'UvAerosolLayerHeight_solar_azimuth_angle',
    'UvAerosolLayerHeight_solar_zenith_angle'
]

# Drop the specified columns.
# errors='ignore' makes the cell safe to re-run: train_data is overwritten, so a
# second execution would otherwise fail with KeyError on the already-removed columns.
train_data = train_data.drop(columns=columns_to_drop, errors='ignore')

# Display the first few rows of the modified dataset
train_data.head()

Unnamed: 0,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,SulphurDioxide_sensor_zenith_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
0,-0.51,29.29,2019,0,-0.000108,0.603019,-6.5e-05,0.255668,-98.593887,50.843559,...,3664.436218,61085.80957,2615.120483,15.568533,0.272292,-12.628986,35.632416,-138.786423,30.75214,3.750994
1,-0.51,29.29,2019,1,2.1e-05,0.728214,1.4e-05,0.130988,16.592861,39.137194,...,3651.190311,66969.478735,3174.572424,8.690601,0.25683,30.359375,39.557633,-145.18393,27.251779,4.025176
2,-0.51,29.29,2019,2,0.000514,0.748199,0.000385,0.110018,72.795837,52.868816,...,4216.986492,60068.894448,3516.282669,21.10341,0.251101,15.377883,30.401823,-142.519545,26.193296,4.231381
3,-0.51,29.29,2019,3,,,,,,,...,5228.507736,51064.547339,4180.973322,15.386899,0.262043,-11.293399,24.380357,-132.665828,28.829155,4.305286
4,-0.51,29.29,2019,4,-7.9e-05,0.676296,-4.8e-05,0.121164,4.121269,35.515587,...,3980.59812,63751.125781,3355.710107,8.114694,0.235847,38.532263,37.392979,-141.509805,22.204612,4.347317


In [9]:
# Fill missing values with each column's median and rebuild a labelled DataFrame.
# NOTE(review): the imputer is fitted on every column of train_data (including the
# target `emission`) and on all rows, i.e. before the train/validation split below.
imputer = SimpleImputer(strategy='median')
train_data_imputed = pd.DataFrame(
    imputer.fit_transform(train_data),
    columns=train_data.columns,
    # Keep the original row labels; without this the frame gets a fresh 0..n-1
    # index, which silently misaligns rows if train_data was ever filtered.
    index=train_data.index,
)


In [10]:
# Preview the imputed frame to confirm the previously missing cells are now filled
train_data_imputed.head()

Unnamed: 0,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,SulphurDioxide_sensor_zenith_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
0,-0.51,29.29,2019.0,0.0,-0.000108,0.603019,-6.5e-05,0.255668,-98.593887,50.843559,...,3664.436218,61085.80957,2615.120483,15.568533,0.272292,-12.628986,35.632416,-138.786423,30.75214,3.750994
1,-0.51,29.29,2019.0,1.0,2.1e-05,0.728214,1.4e-05,0.130988,16.592861,39.137194,...,3651.190311,66969.478735,3174.572424,8.690601,0.25683,30.359375,39.557633,-145.18393,27.251779,4.025176
2,-0.51,29.29,2019.0,2.0,0.000514,0.748199,0.000385,0.110018,72.795837,52.868816,...,4216.986492,60068.894448,3516.282669,21.10341,0.251101,15.377883,30.401823,-142.519545,26.193296,4.231381
3,-0.51,29.29,2019.0,3.0,2.4e-05,0.809118,1.9e-05,0.161855,-12.441726,37.784299,...,5228.507736,51064.547339,4180.973322,15.386899,0.262043,-11.293399,24.380357,-132.665828,28.829155,4.305286
4,-0.51,29.29,2019.0,4.0,-7.9e-05,0.676296,-4.8e-05,0.121164,4.121269,35.515587,...,3980.59812,63751.125781,3355.710107,8.114694,0.235847,38.532263,37.392979,-141.509805,22.204612,4.347317


Given the situation, I'll take the following steps initially to find the feature importance (Feature Engineering):

Use a 50% random sample of the dataset to make the computations more manageable.
Train the Random Forest model on this sample.
Evaluate feature importance based on this model.

In [11]:
# Work on a random subset of the imputed rows so the first model fit stays fast.
sample_fraction = 0.5  # keep half of the rows
train_sample = train_data_imputed.sample(
    frac=sample_fraction,
    random_state=42,  # fixed seed so the same rows are drawn on every run
)


In [12]:
# Separate the target from the predictors, then hold out 20% of the sample
# for validation (seeded for reproducibility).
y_sample = train_sample['emission']
X_sample = train_sample.drop(columns=['emission'])
(
    X_train_sample,
    X_valid_sample,
    y_train_sample,
    y_valid_sample,
) = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

In [45]:
# Inspect the training-split target values (emission) and their row labels
y_train_sample

76421      4.991847
11813     53.378754
53870    318.966500
36086    202.817950
68843     37.776096
            ...    
27545    158.708540
69720     83.778114
60992    132.458450
61148      2.930360
25248      0.140064
Name: emission, Length: 31609, dtype: float64

In [14]:
# Number of rows held out for validation
len(X_valid_sample)

7903

In [15]:
# Fit a 100-tree random forest (seeded) on the training part of the sample.
rf_sample = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train_sample, y_train_sample)
rf_sample  # fit() returns the estimator itself; keep it as the cell output

In [16]:
# Predict on the held-out rows and report the mean squared error.
y_pred_sample = rf_sample.predict(X_valid_sample)
mse_sample = mean_squared_error(y_true=y_valid_sample, y_pred=y_pred_sample)
mse_sample

529.826172645584

In [46]:
# R2 on the validation split, obtained from the fitted estimator's score() method
r2_score = rf_sample.score(X_valid_sample, y_valid_sample)
print('R2 Score: {}'.format(r2_score))

R2 Score: 0.9743069850087583


In [57]:
# Pull the per-feature importance scores out of the fitted sample model;
# they are paired with X_train_sample.columns when the importance table is built.
feature_importances = rf_sample.feature_importances_


In [58]:
# Tabulate feature name against importance, most important first.
importance_df = pd.DataFrame(
    {'Feature': X_train_sample.columns, 'Importance': feature_importances}
)
importance_df = importance_df.sort_values(by='Importance', ascending=False)

In [49]:
importance_df.head(10)  # Show the 10 highest-importance features (table is sorted descending)

Unnamed: 0,Feature,Importance
1,longitude,0.705643
0,latitude,0.170188
3,week_no,0.062218
47,Ozone_O3_column_number_density,0.004409
46,UvAerosolIndex_solar_zenith_angle,0.003644
13,CarbonMonoxide_CO_column_number_density,0.002739
55,Ozone_solar_zenith_angle,0.002323
62,Cloud_surface_albedo,0.00228
7,SulphurDioxide_cloud_fraction,0.002175
15,CarbonMonoxide_cloud_height,0.001735


In [59]:
# Read the competition test set and preview the first rows.
# NOTE(review): absolute, machine-specific path; not portable as written.
test_data = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Syed Asad\OneDrive\Desktop\Python Files\Data Analysis\Kaggle Competitions\Predicting CO2 Emissions for Rwanda\test.csv'
)
test_data.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_pressure,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle
0,ID_-0.510_29.290_2022_00,-0.51,29.29,2022,0,,,,,,...,36022.027344,8472.313477,41047.9375,7472.313477,7.935617,0.240773,-100.113792,33.697044,-133.047546,33.779583
1,ID_-0.510_29.290_2022_01,-0.51,29.29,2022,1,0.000456,0.691164,0.000316,0.0,76.239196,...,48539.737242,6476.147323,54915.708579,5476.147161,11.448437,0.293119,-30.510319,42.402593,-138.632822,31.01238
2,ID_-0.510_29.290_2022_02,-0.51,29.29,2022,2,0.000161,0.605107,0.000106,0.07987,-42.055341,...,34133.080469,8984.795703,39006.09375,7984.795703,10.753179,0.26713,39.087361,45.93648,-144.784988,26.743361
3,ID_-0.510_29.290_2022_03,-0.51,29.29,2022,3,0.00035,0.696917,0.000243,0.201028,72.169566,...,50854.991076,6014.724059,57646.368368,5014.724115,11.764556,0.304679,-24.465127,42.140419,-135.027891,29.604774
4,ID_-0.510_29.290_2022_04,-0.51,29.29,2022,4,-0.000317,0.580527,-0.000184,0.204352,76.190865,...,46594.685145,6849.280477,52896.541873,5849.280394,13.065317,0.284221,-12.90785,30.122641,-135.500119,26.276807


In [60]:
# Remove the row identifier from the test set, mirroring the training preprocessing.
# errors='ignore' keeps the cell idempotent: test_data is overwritten, so
# re-running the cell would otherwise raise a KeyError on the missing column.
test_data = test_data.drop(columns=['ID_LAT_LON_YEAR_WEEK'], errors='ignore')

In [61]:
# Preview the test set after the ID column has been removed
test_data.head()

Unnamed: 0,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,SulphurDioxide_sensor_zenith_angle,...,Cloud_cloud_top_pressure,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle
0,-0.51,29.29,2022,0,,,,,,,...,36022.027344,8472.313477,41047.9375,7472.313477,7.935617,0.240773,-100.113792,33.697044,-133.047546,33.779583
1,-0.51,29.29,2022,1,0.000456,0.691164,0.000316,0.0,76.239196,15.600607,...,48539.737242,6476.147323,54915.708579,5476.147161,11.448437,0.293119,-30.510319,42.402593,-138.632822,31.01238
2,-0.51,29.29,2022,2,0.000161,0.605107,0.000106,0.07987,-42.055341,39.88906,...,34133.080469,8984.795703,39006.09375,7984.795703,10.753179,0.26713,39.087361,45.93648,-144.784988,26.743361
3,-0.51,29.29,2022,3,0.00035,0.696917,0.000243,0.201028,72.169566,58.862543,...,50854.991076,6014.724059,57646.368368,5014.724115,11.764556,0.304679,-24.465127,42.140419,-135.027891,29.604774
4,-0.51,29.29,2022,4,-0.000317,0.580527,-0.000184,0.204352,76.190865,15.646016,...,46594.685145,6849.280477,52896.541873,5849.280394,13.065317,0.284221,-12.90785,30.122641,-135.500119,26.276807


In [62]:
# Test-set columns to discard: the same UvAerosolLayerHeight_* group that was
# removed from the training data.
columns_to_drop_test = [
    f'UvAerosolLayerHeight_{suffix}'
    for suffix in (
        'aerosol_height',
        'aerosol_pressure',
        'aerosol_optical_depth',
        'sensor_zenith_angle',
        'sensor_azimuth_angle',
        'solar_azimuth_angle',
        'solar_zenith_angle',
    )
]

In [65]:
# Build a new frame without the discarded columns; test_data itself is left untouched.
test_data_filtered = test_data.drop(labels=columns_to_drop_test, axis='columns')

In [70]:
# Fit a second median imputer on the training predictors only (target excluded),
# so that it can be applied to the test set, which has no `emission` column.
imputer_predictors = SimpleImputer(strategy='median').fit(
    train_data.drop(columns=['emission'])
)
imputer_predictors  # fit() returns the estimator itself; keep it as the cell output


In [73]:
# Fill missing test values with the medians learned from the training predictors.
test_data_imputed = pd.DataFrame(
    imputer_predictors.transform(test_data_filtered),
    columns=test_data_filtered.columns,
    # Keep the original row labels; without this the frame gets a fresh 0..n-1
    # index, which silently misaligns rows if the test frame was ever filtered.
    index=test_data_filtered.index,
)

In [74]:
# Confirm the preprocessing
test_data_imputed.head()

Unnamed: 0,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,SulphurDioxide_sensor_zenith_angle,...,Cloud_cloud_top_pressure,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle
0,-0.51,29.29,2022.0,0.0,2.4e-05,0.809118,1.9e-05,0.161855,-12.441726,37.784299,...,36022.027344,8472.313477,41047.9375,7472.313477,7.935617,0.240773,-100.113792,33.697044,-133.047546,33.779583
1,-0.51,29.29,2022.0,1.0,0.000456,0.691164,0.000316,0.0,76.239196,15.600607,...,48539.737242,6476.147323,54915.708579,5476.147161,11.448437,0.293119,-30.510319,42.402593,-138.632822,31.01238
2,-0.51,29.29,2022.0,2.0,0.000161,0.605107,0.000106,0.07987,-42.055341,39.88906,...,34133.080469,8984.795703,39006.09375,7984.795703,10.753179,0.26713,39.087361,45.93648,-144.784988,26.743361
3,-0.51,29.29,2022.0,3.0,0.00035,0.696917,0.000243,0.201028,72.169566,58.862543,...,50854.991076,6014.724059,57646.368368,5014.724115,11.764556,0.304679,-24.465127,42.140419,-135.027891,29.604774
4,-0.51,29.29,2022.0,4.0,-0.000317,0.580527,-0.000184,0.204352,76.190865,15.646016,...,46594.685145,6849.280477,52896.541873,5849.280394,13.065317,0.284221,-12.90785,30.122641,-135.500119,26.276807


In [76]:
# Names of the 10 most important features (importance_df is already sorted descending)
top_features = list(importance_df['Feature'].iloc[:10])

In [78]:
# Final model: refit on every imputed training row, restricted to the top features.
y_train_full = train_data_imputed['emission']
X_train_full = train_data_imputed.loc[:, top_features]

rf_full = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train_full, y_train_full)
rf_full  # fit() returns the estimator itself; keep it as the cell output


In [82]:
# Restrict the imputed test data to the same feature columns the final model was fitted on
X_test = test_data_imputed.loc[:, top_features]

In [84]:
# Predict carbon emissions for the test dataset using the trained model.
# Despite the variable name, rf_full was fitted on the full imputed training set,
# not on a sample.
test_predictions_larger_sample = rf_full.predict(X_test)

In [85]:
test_predictions_larger_sample[:10]  # Sanity-check: show the first 10 predicted emissions

array([3.9803963 , 3.75772807, 4.18508902, 4.04335469, 4.01025285,
       4.22304842, 4.24440294, 4.10648914, 4.08243369, 3.85170554])

In [86]:
# Read the provided sample submission to check the expected column layout.
# NOTE(review): absolute, machine-specific path; not portable as written.
sample_submission = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Syed Asad\OneDrive\Desktop\Python Files\Data Analysis\Kaggle Competitions\Predicting CO2 Emissions for Rwanda\sample_submission.csv'
)

# Preview the first rows of the sample submission
sample_submission.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,emission
0,ID_-0.510_29.290_2022_00,81.94
1,ID_-0.510_29.290_2022_01,81.94
2,ID_-0.510_29.290_2022_02,81.94
3,ID_-0.510_29.290_2022_03,81.94
4,ID_-0.510_29.290_2022_04,81.94


In [87]:
# Re-read the raw test file: the ID_LAT_LON_YEAR_WEEK column was dropped from
# test_data earlier, and the submission needs it back.
# NOTE(review): absolute, machine-specific path; not portable as written.
test_original = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Syed Asad\OneDrive\Desktop\Python Files\Data Analysis\Kaggle Competitions\Predicting CO2 Emissions for Rwanda\test.csv'
)

In [88]:
# Assemble the submission table: one row per test ID with its predicted emission.
final_submission = pd.DataFrame(
    dict(
        ID_LAT_LON_YEAR_WEEK=test_original['ID_LAT_LON_YEAR_WEEK'],
        emission=test_predictions_larger_sample,
    )
)

In [96]:
# Output location for the submission CSV (relative path, so it resolves against
# the notebook's current working directory)
submission_file_path = 'final_submission.csv'

In [97]:
# Persist the submission; index=False keeps the DataFrame index out of the file
# so only the ID and emission columns are written.
final_submission.to_csv(path_or_buf=submission_file_path, index=False)