1. Loading Libraries 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

2. Loading Dataset

In [None]:
# Read the raw emissions records from the CSV file expected alongside the notebook.
csv_path = "emissions.csv"
df = pd.read_csv(csv_path)

# Show the first few rows as a quick look at the available columns.
df.head()

3. Data Wrangling

In [None]:
# Count the missing values in each column (isna is the canonical name of isnull).
df.isna().sum()

In [None]:
# Flag outliers in the "value" column with the interquartile-range (IQR) rule:
# rows further than 1.5 * IQR below Q1 or above Q3 are treated as outliers.
Q1 = df['value'].quantile(0.25)
Q3 = df['value'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows inside the IQR fences (both bounds inclusive). .copy()
# yields an independent frame, so the column assignment below never touches df.
# NOTE(review): later cells sum "value" from cleaned_data, so dropping the
# largest values here also lowers those totals - confirm that this is intended.
cleaned_data = df[df['value'].between(lower_bound, upper_bound)].copy()

# Convert "year" to datetime by replacing the column outright. Assigning through
# .loc[:, 'year'] asks pandas to write into the existing column in place, which
# on recent pandas versions does not reliably switch the dtype to datetime64.
cleaned_data['year'] = pd.to_datetime(cleaned_data['year'], format='%Y')

# Verify the cleaned dataset. DataFrame.info() prints its report and returns
# None, so it is called for its printed output only instead of being stored.
cleaned_data.info()
description = cleaned_data.describe()

# The last expression is rendered as the cell output.
description


4. Exploratory Data Analysis (EDA)

In [None]:
# 1. Total emissions trend over time
sns.set(style="whitegrid")

# Sum "value" within each year; the result is a Series indexed by year.
total_emissions = cleaned_data.groupby('year')['value'].sum()

# Draw on an explicit Axes instead of relying on the implicit pyplot state.
fig, ax = plt.subplots(figsize=(8, 4))
sns.lineplot(
    x=total_emissions.index,
    y=total_emissions.values,
    marker='o',
    color='blue',
    ax=ax,
)
ax.set_title("Total Carbon Dioxide Emissions Over Time", fontsize=16)
ax.set_xlabel("Year", fontsize=12)
ax.set_ylabel("Total Emissions (Million Metric Tons)", fontsize=12)

# Tilt the tick labels on both axes.
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.show()


In [None]:
# Top 5 states with highest total emissions: sum "value" per state, sort the
# totals in descending order and keep the first five.
top_states = cleaned_data.groupby('state-name')['value'].sum().sort_values(ascending=False).head(5)

fig, ax = plt.subplots(figsize=(8, 4))

# Passing palette without hue is deprecated in seaborn 0.13 and scheduled for
# removal, so the x categories are also assigned to hue. dodge=False keeps each
# bar centred at full width on its own category.
sns.barplot(
    x=top_states.index,
    y=top_states.values,
    hue=top_states.index,
    palette='viridis',
    dodge=False,
    ax=ax,
)

# hue only repeats the x categories, so a legend would add no information.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title("Top 5 States by Total Emissions", fontsize=16)
ax.set_xlabel("State", fontsize=12)
ax.set_ylabel("Total Emissions (Million Metric Tons)", fontsize=12)
plt.show()


In [None]:
# Sector contributions for the top-emitting state.
# top_states comes from the previous cell, where it is sorted in descending
# order, so its first index label is the state with the largest total.
top_state = top_states.index[0]

# Restrict to that state's rows, then sum "value" per sector.
is_top_state = cleaned_data['state-name'] == top_state
state_sector = cleaned_data[is_top_state].groupby('sector-name')['value'].sum()

fig, ax = plt.subplots(figsize=(8, 4))

# Passing palette without hue is deprecated in seaborn 0.13 and scheduled for
# removal, so the x categories are also assigned to hue. dodge=False keeps each
# bar centred at full width on its own category.
sns.barplot(
    x=state_sector.index,
    y=state_sector.values,
    hue=state_sector.index,
    palette='magma',
    dodge=False,
    ax=ax,
)

# hue only repeats the x categories, so a legend would add no information.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title(f"Sector Contributions in {top_state}", fontsize=16)
ax.set_xlabel("Sector", fontsize=12)
ax.set_ylabel("Total Emissions (Million Metric Tons)", fontsize=12)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Fuel type contributions across all states: sum "value" per fuel.
fuel_contributions = cleaned_data.groupby('fuel-name')['value'].sum()

fig, ax = plt.subplots(figsize=(8, 4))

# Passing palette without hue is deprecated in seaborn 0.13 and scheduled for
# removal, so the x categories are also assigned to hue. dodge=False keeps each
# bar centred at full width on its own category.
sns.barplot(
    x=fuel_contributions.index,
    y=fuel_contributions.values,
    hue=fuel_contributions.index,
    palette='coolwarm',
    dodge=False,
    ax=ax,
)

# hue only repeats the x categories, so a legend would add no information.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title("Emissions by Fuel Type", fontsize=16)
ax.set_xlabel("Fuel Type", fontsize=12)
ax.set_ylabel("Total Emissions (Million Metric Tons)", fontsize=12)
plt.show()

5. Predictive Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Encode categorical variables using one-hot encoding. drop_first=True omits the
# first level of each encoded column, so k categories become k - 1 indicators.
encoded_data = pd.get_dummies(
    cleaned_data,
    columns=['state-name', 'sector-name', 'fuel-name'],
    drop_first=True,
)

# Features (X) and target (y). "value" is the target and "year" is excluded, so
# the model sees only the remaining columns.
# NOTE(review): assumes every remaining column is numeric or boolean - confirm
# that the CSV has no other text columns.
X = encoded_data.drop(columns=['value', 'year'])
y = encoded_data['value']

# Train-test split: hold out 20% of the rows; the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a Random Forest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate the model. RMSE is taken as the square root of the MSE because the
# squared=False argument of mean_squared_error was deprecated in scikit-learn 1.4
# and removed in 1.6; the explicit root works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

rmse, mae, r2
