NOM PRENOM de tous

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression

from statsmodels.tsa.arima.model import ARIMA

import plotly.express as px

In [11]:
# 1. LOADING
def load_data(file_path):
    """
    Read the dataset from a CSV file.

    Input: file_path (str) – path to the CSV file
    Output: DataFrame – the parsed contents, exactly as pandas.read_csv
            returns them with its default options
    """
    return pd.read_csv(file_path)


# 2. EXPLORATION
def explore_data(df):
    """
    Print a structural overview of the DataFrame and, when possible,
    show a correlation heatmap of its numeric columns.

    Input: df (DataFrame) – dataset to inspect
    Output: None – everything is printed or plotted
    """
    # Each section is computed lazily, right before it is printed, so the
    # report is emitted section by section in a fixed order.
    sections = (
        ("Dataset Shape:", lambda: df.shape),                        # structure
        ("\nColumn Types:\n", lambda: df.dtypes),
        ("\nMissing Values per Column:\n", lambda: df.isna().sum()),  # gaps
        ("\nStatistical Summary:\n", lambda: df.describe()),         # statistics
    )
    for heading, compute in sections:
        print(heading, compute())

    # A correlation matrix needs at least two numeric columns.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) <= 1:
        print("\nNot enough numeric columns to compute correlations.")
        return

    plt.figure(figsize=(10, 6))
    sns.heatmap(df[numeric_cols].corr(), annot=False, cmap="coolwarm")
    plt.title("Correlation Heatmap")
    plt.show()

In this step, we explore the structure of the dataset to understand its content. We examine:

- number of rows and columns,
- data types,
- missing values,
- numerical statistics,
- correlations between variables,
- basic visualizations (heatmap).

This information helps identify potential issues and guides the construction of the indicators in later steps.

In [25]:
# 3. INDICATORS
def _plot_top_countries(series, title, ylabel):
    """Draw a bar chart of a per-country Series on a new figure and show it."""
    plt.figure(figsize=(10, 6))
    series.plot(kind="bar")
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel("Country")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()


def indicator_groupby(df, start_year="1990", end_year="2021", top_n=10):
    """
    Indicator 1A:
        Top `top_n` emitting countries in `end_year`.

    Indicator 1B:
        Top `top_n` countries with the largest increase between
        `start_year` and `end_year`.

    Input:
        df (DataFrame) – must contain a "Country" column and one column
            per requested year (column names are the years as strings)
        start_year (str or int) – baseline year, default "1990"
        end_year (str or int) – reference year, default "2021"
        top_n (int) – number of countries kept in each ranking, default 10
    Output: None – both rankings are printed and plotted as bar charts.
    Raises: ValueError if start_year and end_year are the same;
            KeyError if a required column is missing from df.
    """
    start_col, end_col = str(start_year), str(end_year)
    if start_col == end_col:
        raise ValueError("start_year and end_year must be different")

    # Indicator 1A – total emissions per country for the reference year
    top_emitters = (
        df.groupby("Country")[end_col]
        .sum()
        .sort_values(ascending=False)
        .head(top_n)
    )

    print(f"Indicator 1A – Top {top_n} emitting countries in {end_col}:")
    print(top_emitters)
    _plot_top_countries(
        top_emitters,
        title=f"Top {top_n} Emitting Countries ({end_col})",
        ylabel="Emissions",
    )

    # Indicator 1B – change per row, then summed per country
    temp = df[["Country", start_col, end_col]].copy()

    # NOTE(review): missing values are treated as 0, so a row with no
    # baseline value counts its whole end-year value as an "increase".
    # Confirm this is the intended handling of missing data.
    temp[[start_col, end_col]] = temp[[start_col, end_col]].fillna(0)

    change_col = f"Change_{start_col}_{end_col}"
    temp[change_col] = temp[end_col] - temp[start_col]

    top_increases = (
        temp.groupby("Country")[change_col]
        .sum()
        .sort_values(ascending=False)
        .head(top_n)
    )

    print(
        f"\nIndicator 1B – Top {top_n} countries with the largest increase "
        f"in emissions ({start_col}–{end_col}):"
    )
    print(top_increases)
    _plot_top_countries(
        top_increases,
        title=f"Top {top_n} Countries – Increase in Emissions ({start_col}–{end_col})",
        ylabel=f"Increase in emissions ({end_col} – {start_col})",
    )

def indicator_transformation(df, n_clusters=3):
    """
    Indicator 2 – Data Transformation:
    K-Means clustering of countries on their standardized yearly
    emissions (1990–2021), visualized through a 2-D PCA projection.

    Input:
        df (DataFrame) – must contain a "Country" column and one column
            per year from "1990" to "2021"
        n_clusters (int) – number of K-Means clusters, default 3
    Output:
        country_data (DataFrame) – one row per country with the summed
            yearly emissions plus a "Cluster" column holding the label
    """
    year_cols = [str(y) for y in range(1990, 2021 + 1)]

    # Aggregate emissions by country (rows sharing a country are summed).
    # Column selection already yields a new frame, so df is never modified.
    country_data = df[["Country"] + year_cols].groupby("Country")[year_cols].sum()

    # Standardize each year column across countries before clustering
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(country_data)

    # n_init is set explicitly: its default differs between scikit-learn
    # releases, which would otherwise make the clustering version-dependent.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    clusters = kmeans.fit_predict(X_scaled)

    # Attach labels only after scaling so they are not used as a feature
    country_data["Cluster"] = clusters

    print("Indicator 2 – K-means clustering results (first 10 countries):")
    print(country_data.head(10))

    # Visualize clusters using PCA (2D projection)
    from sklearn.decomposition import PCA
    pca = PCA(n_components=2)
    proj = pca.fit_transform(X_scaled)

    plt.figure(figsize=(10, 6))
    plt.scatter(proj[:, 0], proj[:, 1], c=clusters, cmap="viridis")
    # Label each point with its country; rows of proj follow country_data's index
    for (x_coord, y_coord), country in zip(proj, country_data.index):
        plt.text(x_coord, y_coord, country, fontsize=8)
    plt.title("K-Means Clustering of Countries (PCA Projection)")
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.tight_layout()
    plt.show()

    return country_data

def indicator_temporal(df):
    """
    Indicator 3 – Temporal analysis of global emissions.

    The global series is the sum over all rows of df for each year from
    1990 to 2021.

    3A: ARIMA(1, 1, 1) fitted on that series, with a forecast for the
        next 10 years (2022–2031) and its confidence interval.

    3B: Linear regression trend on the same series, with prediction up
        to 2030.

    Input: df (DataFrame) – must contain one column per year, named
           "1990" ... "2021"
    Output: None – the series is printed and both charts are shown.
    """
    first_year, final_year = 1990, 2021
    forecast_steps = 10       # horizon of the ARIMA forecast (3A)
    regression_end = 2030     # last year predicted by the linear trend (3B)

    year_cols = [str(y) for y in range(first_year, final_year + 1)]

    # Sum emissions across all rows for each year; the result is a Series
    # indexed by the year column names
    ts = df[year_cols].sum(axis=0)

    # Use integer years as index, sorted chronologically
    ts.index = ts.index.astype(int)
    ts = ts.sort_index()

    # Forward- then backward-fill any gap. ffill()/bfill() replace the
    # deprecated fillna(method=...) form.
    ts = ts.ffill().bfill()

    print("Global emissions time series (first values):")
    print(ts.head())
    print("\nGlobal emissions time series (last values):")
    print(ts.tail())

    # Indicator 3A – ARIMA forecast
    model = ARIMA(ts, order=(1, 1, 1))  # simple ARIMA model
    results = model.fit()

    forecast = results.get_forecast(steps=forecast_steps)
    forecast_mean = forecast.predicted_mean
    forecast_ci = forecast.conf_int()

    # Years covered by the forecast, built from the last observed year
    last_year = ts.index.max()
    future_years = np.arange(last_year + 1, last_year + 1 + forecast_steps)

    plt.figure(figsize=(10, 6))
    plt.plot(ts.index, ts.values, label="Historical emissions")
    plt.plot(future_years, forecast_mean.values, label="ARIMA forecast", linestyle="--")

    # Confidence band: first column is the lower bound, second the upper
    plt.fill_between(
        future_years,
        forecast_ci.iloc[:, 0],
        forecast_ci.iloc[:, 1],
        alpha=0.2,
        label="95% confidence interval",
    )

    plt.title("Indicator 3A – ARIMA Forecast of Global Emissions")
    plt.xlabel("Year")
    plt.ylabel("Emissions")
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Indicator 3B – Linear regression trend + prediction
    X = ts.index.values.reshape(-1, 1)   # years as a single-feature matrix
    y = ts.values                        # emissions

    lin_reg = LinearRegression()
    lin_reg.fit(X, y)

    # Evaluate the fitted line over the historical range and beyond
    years_full = np.arange(first_year, regression_end + 1)
    y_pred = lin_reg.predict(years_full.reshape(-1, 1))

    plt.figure(figsize=(10, 6))
    plt.scatter(ts.index, ts.values, label="Historical emissions", s=30)
    plt.plot(years_full, y_pred, label="Linear regression trend", linestyle="-")

    # Re-draw the part after the last observed year as a dashed segment
    future_mask = years_full > last_year
    plt.plot(years_full[future_mask], y_pred[future_mask],
             label="Predicted (beyond historical data)", linestyle="--")

    plt.title("Indicator 3B – Linear Trend and Forecast of Global Emissions")
    plt.xlabel("Year")
    plt.ylabel("Emissions")
    plt.legend()
    plt.tight_layout()
    plt.show()


def indicator_spatial(df):
    """
    Indicator 4 – Spatial analysis.

    4A: Choropleth map of emissions by country in 2021.
    4B: Choropleth map of the change in emissions between 1990 and 2021.

    Input: df (DataFrame) – must contain "ISO", "Country", "1990" and
           "2021" columns
    Output: None – both maps are displayed.
    """
    country_keys = ["ISO", "Country"]

    # Options common to both maps; "ISO" provides the location codes
    shared_map_options = dict(
        locations="ISO",
        hover_name="Country",
        projection="natural earth",
    )

    # 4A – total 2021 emissions per (ISO, Country) pair
    totals_2021 = df.groupby(country_keys)["2021"].sum()
    emissions_2021 = totals_2021.reset_index()

    map_2021 = px.choropleth(
        emissions_2021,
        color="2021",
        color_continuous_scale="Reds",
        title="Indicator 4A – Emissions by Country (2021)",
        **shared_map_options,
    )
    map_2021.show()

    # 4B – per-country totals for both years, then their difference
    totals_both = df.groupby(country_keys)[["1990", "2021"]].sum()
    emissions_change = totals_both.reset_index()
    emissions_change["Change_1990_2021"] = emissions_change["2021"].sub(
        emissions_change["1990"]
    )

    map_change = px.choropleth(
        emissions_change,
        color="Change_1990_2021",
        color_continuous_scale="RdBu",
        title="Indicator 4B – Change in Emissions (1990–2021)",
        **shared_map_options,
    )
    map_change.show()


In [4]:
# 4. DASHBOARD
def create_dashboard():
    """
    Placeholder for step 4: creating and launching the Dash dashboard.

    Not implemented yet – the function does nothing and returns None.
    """
    return None

In [None]:
# 5. MAIN FUNCTION
def main():
    """
    Run the whole pipeline: load the CSV, build the four indicators in
    order, then call the dashboard step.
    """
    # Step 1: load the dataset from the working directory
    emissions = load_data("historical_emissions.csv")

    # Step 2: exploration is currently switched off
    # explore_data(emissions)

    # Step 3: indicators (group-by, transformation, temporal, spatial)
    indicator_groupby(emissions)
    indicator_transformation(emissions)
    indicator_temporal(emissions)
    indicator_spatial(emissions)

    # Step 4: dashboard
    create_dashboard()


# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()