In [60]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input,Dense,LSTM,Dropout
from tensorflow.keras.losses import MeanSquaredError,MeanAbsoluteError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau,EarlyStopping

## you could delete this line here (os one)
- i needed it cause im using wsl for tensorflow, but in normal windows, its possible to remove it 

In [61]:
import os
os.chdir('/tf-acno-projects/Project-Data-Mining')

In [62]:
def X_y_forecasting_splits(Datafile,time_steps):
    X,y = list(),list()
    for start in range(len(Datafile)):
        end = start+time_steps 
        if end>len(Datafile)-1:
            break
        X.append(Datafile.iloc[start:end].values)
        y.append(Datafile.iloc[end]["CO2 Emission"])
    return np.array(X),np.array(y)

In [63]:
def months_converter(DataFile):
    unique_months = DataFile['Month'].unique()
    months_dict = {
        month:idx+1 for idx,month in enumerate(unique_months)
    }
    DataFile['Month'] = DataFile['Month'].map(months_dict)

In [64]:
def lstm_architecture(INPUT_SHAPE,LR):
    input_layer = Input(shape=INPUT_SHAPE)
    hidden_layer = LSTM(64)(input_layer)
    hidden_layer = Dense(32,activation='relu')(hidden_layer)
    hidden_layer = Dense(16,activation='relu')(hidden_layer)
    output_layer = Dense(1,activation='linear')(hidden_layer)

    lstm_model = Model(input_layer,output_layer)
    #lstm_model.summary()
    lstm_model.compile(optimizer=Adam(learning_rate=LR),loss=MeanSquaredError(),metrics=[MeanAbsoluteError()])
    return lstm_model

In [65]:
DataFile = pd.read_csv("Emission.csv")
DataFile.head()

Unnamed: 0,Year-Month,CO2 Emission
0,1973-Jan,106.363
1,1973-Feb,101.76
2,1973-Mar,110.553
3,1973-Apr,104.734
4,1973-May,114.897


In [66]:
print(DataFile.isnull().sum())

Year-Month      0
CO2 Emission    0
dtype: int64


In [67]:
print(DataFile.duplicated().sum())

0


Alright, there are no null values and no duplicates but there is something wrong with the "Year-Month" column, it's better to split it into two and convert them to numerical values

In [68]:
DataFile[['Year', 'Month']] = DataFile['Year-Month'].str.split('-', expand=True)

DataFile.drop(columns=['Year-Month'], inplace=True)

months_converter(DataFile)

for col in DataFile.columns:
    DataFile[col] = pd.to_numeric(DataFile[col],errors='coerce')
print(DataFile)

     CO2 Emission  Year  Month
0         106.363  1973      1
1         101.760  1973      2
2         110.553  1973      3
3         104.734  1973      4
4         114.897  1973      5
..            ...   ...    ...
481       134.243  2013      2
482       153.078  2013      3
483       149.442  2013      4
484       156.356  2013      5
485       152.814  2013      6

[486 rows x 3 columns]


In [None]:
DataFile['CO2 Emission'].min()

In [None]:
DataFile['CO2 Emission'].max()

In [None]:
DataFile['CO2 Emission'].mean()

In [None]:
DataFile['CO2 Emission'].std()

this is a note that we should scale the data later on for our models

Now we need to perform visual analysis on our dataset, but first we need to create a csv of our new dataset

In [None]:
DataFile.to_csv("New Emission.csv", index=False)

In [None]:
DataFile = pd.read_csv("New Emission.csv")

DataFile.head()

In [None]:
print(DataFile.dtypes)

In [None]:
plt.figure(figsize=(8, 5))
plt.plot(DataFile["Year"], DataFile["CO2 Emission"], marker="o", linestyle="-", color="b")

# Labels and Title
plt.xlabel("Year")
plt.ylabel("CO2 Emission (ppm)")
plt.title("CO2 Emission Over the Years")
plt.grid(True)

# Show the plot
plt.show()

In [None]:
plt.figure(figsize=(8, 5))
sns.barplot(x=DataFile["Month"], y=DataFile["CO2 Emission"], palette="coolwarm")

# Labels and Title
plt.xlabel("Month")
plt.ylabel("CO2 Emission (ppm)")
plt.title("CO2 Emission by Month")

# Show the plot
plt.show()

In [None]:
plt.figure(figsize=(12, 5))
sns.barplot(x=DataFile["Year"], y=DataFile["CO2 Emission"], palette="coolwarm")

# Labels and Title
plt.xlabel("Year")
plt.ylabel("CO2 Emission (ppm)")
plt.title("CO2 Emission by Year")

plt.xticks(rotation=45, ha="right")

# Show the plot
plt.show()

It's kind of a complex figure so we will group the years into ranges

In [None]:
min_year = DataFile["Year"].min()
max_year = DataFile["Year"].max()

print(min_year)
print(max_year)

In [None]:
bins = [1970, 1975, 1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015]

labels = ["1971-1975", "1976-1980", "1981-1985", "1986-1990", "1991-1995", "1996-2000", "2001-2005", "2006-2010", "2011-2015"]

DataFile["Year Range"] = pd.cut(DataFile["Year"], bins=bins, labels=labels, right=True)
print(DataFile[["Year", "Year Range"]].head())

In [None]:
DataFile.head()

In [None]:
plt.figure(figsize=(12, 5))  # Increase width
sns.barplot(x=DataFile["Year Range"], y=DataFile["CO2 Emission"], palette="coolwarm")

plt.xlabel("Year Range")
plt.ylabel("CO2 Emission (ppm)")
plt.title("CO2 Emission by Year")

plt.xticks(rotation=45, ha="right")  # Rotate labels for better spacing

plt.show()


In [None]:
DataFile.drop(columns=['Year Range'], inplace=True)

In [None]:
DataFile.head()

## LSTM AND TRANSFORMERS


We have 486 rows so :
- train 80% = int(len(DataFile)*0.8)+1 => 389
- test 10% =  int(len(DataFile)*0.1) => 48
- validation 10% = int(len(DataFile)*0.1)

In [72]:
time_step = 3
X,y = X_y_forecasting_splits(DataFile,time_step)

scaler = MinMaxScaler()
y_scaled = scaler.fit_transform(y.reshape(-1,1)).flatten()

In [252]:
INPUT_SHAPE = (time_step,3)
LR = 0.1
EPOCHS = 80
N_SPLITS = 3
CALLBACK = [
    ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=20,
    min_delta=0.0005,
    min_lr=1e-6,
    ),
    EarlyStopping(
    monitor="val_loss",
    patience=20,
    min_delta=0.0001,
    )]

train_size = int(len(DataFile)*0.8) + 1 
test_val_size = int(len(DataFile)*0.1)

X_train,y_train = X[:train_size],y[:train_size]
X_test,y_test= X[train_size:train_size+test_val_size],y[train_size:train_size+test_val_size]
X_val,y_val = X[train_size+test_val_size:],y[train_size+test_val_size:]

print(f'train size is : {train_size}, test val size is : {test_val_size}')
print(f'train : {X_train.shape} , {y_train.shape}')
print(f'test : {X_test.shape} , {y_test.shape}')
print(f'val : {X_val.shape} , {y_val.shape}')

### LSTM

In [253]:
time_series_split_folds = TimeSeriesSplit(n_splits=N_SPLITS)
performance = []
for fold ,(training_idx, validation_idx) in enumerate(time_series_split_folds.split(X,y)):
    lstm_model = lstm_architecture(INPUT_SHAPE,LR)

    X_train_cv = tf.convert_to_tensor(X[training_idx], dtype=tf.float32)
    X_val_cv = tf.convert_to_tensor(X[validation_idx], dtype=tf.float32)
    y_train_cv = tf.convert_to_tensor(y_scaled[training_idx], dtype=tf.float32)
    y_val_cv = tf.convert_to_tensor(y_scaled[validation_idx], dtype=tf.float32)
    
    lstm_model.fit(X_train_cv,y_train_cv,epochs=EPOCHS,validation_data=(X_val_cv,y_val_cv),callbacks=CALLBACK,verbose=1)
    val_loss, val_mae = lstm_model.evaluate(X_val_cv, y_val_cv,verbose=0)
    
    y_val_preds = lstm_model.predict(X_val_cv)
    y_val_preds = scaler.inverse_transform(y_val_preds).flatten()
    y_val_original = scaler.inverse_transform(y_val_cv.numpy().reshape(-1,1)).flatten()
    
    mae_original = np.mean(np.abs(y_val_preds - y_val_original))
    
    performance.append({
    "fold": fold,
    "val_loss": f'{val_loss:.4f}',
    "val_mae_scaled": f'{val_mae:.4f}',
    "val_mae_original": f'{mae_original:.4f}',
    })    
    
    print(f'Fold {fold} , val_loss is : {val_loss:.2f}, MAE scaled is : {val_mae:.2f}, MAE original is : {mae_original:.2f}') 

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Fold 0 , val_loss is : 0.03, MAE scaled is : 0.15, MAE original is : 12.74
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Fold 1 , val_loss is : 0.12, MAE scaled is : 0.33, MAE original is : 27.95
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
F

In [254]:
performance = pd.DataFrame(performance)
print(performance)

   fold val_loss val_mae_scaled val_mae_original
0     0   0.0297         0.1538          12.7443
1     1   0.1198         0.3350          27.9451
2     2   0.1408         0.3541          30.6096


for the lstm model , thats the max we could have

### Transformer

#### First : Positional Encoding

In [73]:
def positional_encoding(d_model,sequence_length=3,n=10000):
    """
    d_model: the dimension of our input ( output of the embedding space )
    sequence_length: the length of our sequence for example we have 3 features then its 3
    """
    PosEnc = np.zeros((sequence_length,d_model))
    indices = np.arange(int(d_model/2))
    positions = np.arange((sequence_length))
    
    for position in positions:
        for index in indices:
            denomenator = np.power(n,2*index/d_model)
            PosEnc[position,2*index] = np.sin(position/denomenator)
            PosEnc[position,2*index+1] = np.cos(position/denomenator)
    return PosEnc

In [74]:
d_model = 64
sequence_length = 3

PosEnc = positional_encoding(d_model,sequence_length)
print(PosEnc.shape)

(3, 64)


- We use **the embedding** which is a way to convert raw input into a high-dimensional vector (d_model in this case).
so our embedding vector shape is **(batch_size,time,features_embedding)**
- **Positional encoding** is added to this embedding vector so the model knows about the position of each token or feature.

In [76]:
X_transformer = tf.convert_to_tensor(X, dtype=tf.float32)
embedded = Dense(d_model)(X_transformer)
print(embedded.shape)
X_embedded_positioned = embedded + PosEnc
print(X_embedded_positioned.shape)

(483, 3, 64)
(483, 3, 64)
