In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os

# Notebook-wide settings: silence every Python warning for the rest of the
# session and apply matplotlib's seaborn-like theme to all figures below.
# NOTE(review): the blanket filter also hides deprecation notices from
# pandas/seaborn; narrow it if those need to be seen.
warnings.simplefilter('ignore')
plt.style.use('seaborn-v0_8')

In [None]:
# Read the three source CSVs. Headers are trimmed and lower-cased so that
# later cells can look columns up by a single canonical spelling.
def _read_with_clean_headers(csv_path):
    """Load `csv_path` and return the frame with stripped, lower-cased column names."""
    frame = pd.read_csv(csv_path)
    frame.columns = frame.columns.str.strip().str.lower()
    return frame


co2 = _read_with_clean_headers('../data/co2_emission_by_country.csv')
energy = _read_with_clean_headers('../data/energy_mix.csv')
microsoft = _read_with_clean_headers('../data/microsoft_emissions.csv')

print("Loaded datasets. CO2 columns:", list(co2.columns))

In [None]:
# Validate the CO2 table, clean it, and build the yearly global total.
# Fails fast if either of the two columns every later cell relies on is absent.
required_cols = {'year', 'co2'}
if not required_cols <= set(co2.columns):
    missing = required_cols.difference(co2.columns)
    raise ValueError(f"Required columns missing from CO2 dataset: {missing}.\nPlease check the CSV headers.")

# Drop rows lacking a year or a value, then store the year as an integer.
co2 = (
    co2
    .dropna(subset=['year', 'co2'])
    .assign(year=lambda frame: frame['year'].astype(int))
)

# Everything from 2000 onwards feeds the trend, ranking and regional plots.
co2_recent = co2.loc[co2['year'].ge(2000)]

# One row per year: the sum of 'co2' over every row recorded for that year.
# NOTE(review): if the CSV also carries aggregate rows (e.g. a world or
# continent total) they are summed in as well - confirm against the data.
global_trend = co2_recent.groupby('year', as_index=False)['co2'].sum()
global_trend.head()

In [None]:
# Line chart of the yearly global total.
# The year range shown in the title is read from the data rather than being
# hard-coded, so it stays correct whichever years the CSV actually covers.
if global_trend.empty:
    year_span = 'no data'
else:
    year_span = f"{global_trend['year'].min()}–{global_trend['year'].max()}"

plt.figure(figsize=(10,5))
sns.lineplot(data=global_trend, x='year', y='co2', color='forestgreen', linewidth=2)
plt.title(f'Global CO₂ Emission Trend ({year_span})')
plt.xlabel('Year')
# NOTE(review): the unit on the axis label is assumed - confirm against the dataset.
plt.ylabel('CO₂ Emissions (million tonnes)')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Rank rows by emissions in the most recent year on record and chart the ten largest.
latest = co2_recent['year'].max()
if 'country' not in co2_recent.columns:
    raise ValueError("Column 'country' not found in CO2 dataset.")

in_latest_year = co2_recent['year'].eq(latest)
top_emitters = co2_recent.loc[in_latest_year].nlargest(10, columns='co2')

plt.figure(figsize=(10, 5))
# NOTE(review): recent seaborn releases deprecate `palette` without `hue`;
# the call is kept as-is here to preserve the existing output.
ax = sns.barplot(x='country', y='co2', data=top_emitters, palette='Reds_r')
ax.set_title(f'Top 10 CO₂ Emitting Countries in {latest}')
ax.set_ylabel('CO₂ Emissions (million tonnes)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Microsoft plot - handle possible column name variants
def _first_present(candidates, available):
    """Return the first name in `candidates` that is also in `available`, else None."""
    return next((name for name in candidates if name in available), None)


ms_cols = set(microsoft.columns)
year_col = _first_present(['year', 'date'], ms_cols)
em_col = _first_present(['emissions', 'emission', 'co2', 'co2_emissions'], ms_cols)
ind_col = _first_present(['industry_avg', 'industry_average', 'industry_avg_emissions'], ms_cols)

if year_col and em_col:
    plt.figure(figsize=(8,4))
    # The company series is always drawn; the benchmark only when its column exists.
    plt.plot(microsoft[year_col], microsoft[em_col], marker='o', label='Microsoft', color='blue')
    if ind_col:
        plt.plot(microsoft[year_col], microsoft[ind_col], linestyle='--', label='Tech Industry Avg', color='orange')
        plt.title('Microsoft vs Tech Industry CO₂ Emissions')
    else:
        # No benchmark series to compare against, so the title must not promise one.
        plt.title('Microsoft CO₂ Emissions')
    plt.xlabel('Year')
    # NOTE(review): the unit on the axis label is assumed - confirm against the dataset.
    plt.ylabel('Emissions (million tonnes)')
    plt.legend()
    plt.grid(alpha=0.3)
    plt.show()
else:
    # Without both a time column and an emissions column there is nothing to plot.
    print('Microsoft dataset missing expected columns. Found:', microsoft.columns.tolist())

In [None]:
# Prophet forecast of the yearly global total (Prophet is an optional dependency).
try:
    from prophet import Prophet
except Exception as e:
    # Deliberately broad: Prophet can fail at import time for reasons other than
    # being absent (for example an incompatibility with the installed numpy).
    print("Prophet import failed:", e)
    print("If Prophet isn't installed, run: pip install prophet cmdstanpy")
else:
    forecast_horizon = 6  # number of future yearly points to predict

    # Prophet expects a 'ds' timestamp column and a 'y' value column.
    df_prophet = global_trend.rename(columns={'year':'ds','co2':'y'})
    # A bare year parses to 1 January of that year; unparseable values become NaT.
    df_prophet['ds'] = pd.to_datetime(df_prophet['ds'], format='%Y', errors='coerce')

    # Rows whose year could not be parsed carry no usable timestamp and cannot be
    # repaired from the NaT value itself, so they are dropped instead of being
    # handed to the model.
    unparsed = df_prophet['ds'].isna()
    if unparsed.any():
        print(f"Dropping {int(unparsed.sum())} row(s) whose year could not be parsed.")
        df_prophet = df_prophet.loc[~unparsed].reset_index(drop=True)

    # Fit and forecast
    try:
        model = Prophet()
        model.fit(df_prophet)
        # 'YS' (year start) keeps future timestamps on 1 January like the history.
        # A year-end frequency would put the first "future" point in the same
        # calendar year as the last observation.
        future = model.make_future_dataframe(periods=forecast_horizon, freq='YS')
        forecast = model.predict(future)
        fig = model.plot(forecast)
        # Title the plot with the years actually forecast (the trailing rows).
        horizon = forecast['ds'].iloc[-forecast_horizon:]
        plt.title(f'Forecast: Global CO₂ Emissions ({horizon.iloc[0].year}–{horizon.iloc[-1].year})')
        plt.xlabel('Year')
        plt.ylabel('Predicted CO₂ Emissions')
        plt.show()
        display(forecast[['ds','yhat','yhat_lower','yhat_upper']].tail(forecast_horizon))
    except Exception as e:
        # Best effort: a failed fit or plot is reported without stopping the notebook.
        print('Forecasting failed:', e)

In [None]:
# Per-continent yearly totals, drawn only when the CO2 table has a continent column.
# The header comparison ignores case.
continent_col = next(
    (name for name in co2.columns if name.lower() == 'continent'),
    None,
)

if continent_col is None:
    print("No 'continent' column found in CO2 dataset. Skipping regional plot.")
else:
    # One row per (year, continent) pair, restricted to the 2000+ subset.
    region = co2_recent.groupby(['year', continent_col], as_index=False)['co2'].sum()
    plt.figure(figsize=(10, 5))
    ax = sns.lineplot(x='year', y='co2', hue=continent_col, data=region)
    ax.set_title('Regional CO₂ Emission Trends')
    ax.set_ylabel('CO₂ Emissions (million tonnes)')
    plt.show()

In [None]:
# Closing summary of what the notebook produced.
for line in (
    'Key Observations:',
    '- Global CO2 trend plotted from 2000 onwards.',
    '- Top emitters for the latest available year shown.',
    '- Forecast attempted if Prophet is installed.',
):
    print(line)

# Small hand-written sample (not read from the CSVs) for the interactive demo chart.
df_plot = pd.DataFrame(
    [('India', 2100), ('USA', 5000), ('China', 9000), ('Germany', 1000)],
    columns=['country', 'co2'],
)

# Interactive bar chart is optional: any failure (import, build or render)
# falls through to the same message.
try:
    import plotly.express as px
    fig = px.bar(df_plot, x='country', y='co2', title='CO2 Emissions by Country (sample)')
    fig.show()
except Exception:
    print('Plotly not available - skip interactive plot.')

In [None]:
# Persist one figure and one CSV report next to the notebook's parent folders.
for out_dir in ('../visuals', '../reports'):
    os.makedirs(out_dir, exist_ok=True)

# Ten largest rows for the latest year in the full (not 2000+) table.
latest_year = co2['year'].max()
top_emitters = co2.loc[co2['year'] == latest_year, ['country', 'co2']].nlargest(10, 'co2')

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_emitters['country'], top_emitters['co2'], color='skyblue')
ax.set_xlabel('CO₂ Emissions (Million Tonnes)')
ax.set_ylabel('Country')
ax.set_title(f'Top 10 CO₂ Emitters in {latest_year}')
ax.invert_yaxis()  # largest emitter at the top
fig.savefig('../visuals/top_10_emitters.png', bbox_inches='tight')
plt.close(fig)

# The report ranks countries by 'co2' summed over every year in the table,
# which is a different ranking from the single-year chart above.
summary = (
    co2
    .groupby('country', as_index=False)['co2']
    .sum()
    .sort_values('co2', ascending=False)
    .head(10)
)
summary.to_csv('../reports/top_10_emitters_report.csv', index=False)

print('Saved visuals and reports to ../visuals and ../reports')