### Step 1: Load & Preprocess Data

In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Loading data
# The raw string keeps the Windows backslashes literal. The previous non-raw
# literal relied on invalid escape sequences (\C, \d, \c), which Python warns
# about and which would silently change meaning for sequences such as \t or \n.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to a configurable data directory.
df = pd.read_csv(r"D:\Carbon-Emissions-Dashboard\dataset\coal_train_data.csv")

In [3]:
# Show the loaded DataFrame as the cell output for a quick visual check
df

Unnamed: 0,coaltype,gcv,burntamount,plf,production,totalemission
0,anthracite,4139.64,76.31,51.85,3468,2647902.63
1,bituminous,3377.47,383.92,51.85,3468,2647902.63
2,lignite,4080.50,185.83,73.94,2797,2181449.04
3,lignite,4577.27,321.20,73.94,2797,2181449.04
4,anthracite,4645.92,384.16,80.82,1989,1496548.92
...,...,...,...,...,...,...
89,bituminous,5058.67,230.91,77.23,3842,3099651.13
90,anthracite,4518.99,164.44,77.23,3842,3099651.13
91,anthracite,5521.30,196.52,57.31,1138,937609.37
92,bituminous,5140.67,104.17,57.31,1138,937609.37


In [3]:
# Separate the model inputs (independent variables) from the target (dependent variable)
X = df[[
    'coaltype',
    'gcv',
    'burntamount',
    'plf',
    'production',
]]
y = df['totalemission']  # column the models learn to predict

In [4]:
# Train-Test Split: hold out 20% of the rows for evaluation, with a fixed seed
# so the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Preprocessing: scale the numeric columns and one-hot encode 'coaltype'
num_features = ['gcv', 'burntamount', 'plf', 'production']
cat_features = ['coaltype']

# Each entry is (step name, transformer, columns the transformer applies to)
numeric_step = ('num', StandardScaler(), num_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

### Step 2: Train Multiple Models
We'll compare the following models:
- Linear Regression – Simple baseline
- Random Forest – Handles non-linearity well
- XGBoost – Powerful boosting model
- LSTM (Optional) – If treating this as a time-series problem

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [7]:
# Candidate regressors, keyed by the display name used in the results table.
estimators = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBRegressor(n_estimators=100, random_state=42),
}

# Wrap every regressor in its own pipeline. Each pipeline gets a fresh clone of
# the preprocessor: Pipeline.fit fits its steps in place, so sharing a single
# ColumnTransformer instance would let each fit overwrite the fitted state that
# the other pipelines rely on.
models = {
    name: Pipeline([("preprocessor", clone(preprocessor)), ("model", estimator)])
    for name, estimator in estimators.items()
}

In [8]:
# Train & Evaluate: fit each pipeline on the training split, then score its
# predictions on the held-out split
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # MSE is computed once and reused to derive RMSE
    mse = mean_squared_error(y_test, y_pred)
    results[name] = {
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mse,
        "RMSE": np.sqrt(mse),
        "R2 Score": r2_score(y_test, y_pred),
    }

In [9]:
# Display Results: transpose so each model is a row and each metric a column
results_df = pd.DataFrame(results).transpose()
print(results_df)

                            MAE           MSE           RMSE  R2 Score
Linear Regression  43562.849049  2.900913e+09   53860.127869  0.997489
Random Forest      70484.984263  1.056201e+10  102771.652607  0.990859
XGBoost            70871.834539  1.316182e+10  114724.981794  0.988608
