In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input,Dense,LSTM,Dropout
from tensorflow.keras.losses import MeanSquaredError,MeanAbsoluteError
from tensorflow.keras.optimizers import Adam

## Note: the next cell is optional
- I needed it because I run TensorFlow under WSL; on a regular Windows setup you can remove it.

In [None]:
import os

# Optional, machine-specific step: the notebook was written under WSL, where the
# project lives at this absolute path. Only change directory when the path
# actually exists, so the notebook does not crash with FileNotFoundError on
# machines where it is simply started from the project folder.
PROJECT_DIR = '/tf-acno-projects/Project-Data-Mining'
if os.path.isdir(PROJECT_DIR):
    os.chdir(PROJECT_DIR)
else:
    print(f"'{PROJECT_DIR}' not found; keeping working directory {os.getcwd()}")

In [None]:
def X_y_forecasting_splits(Datafile,time_steps,target_col="CO2 Emission"):
    """Turn a table into sliding-window samples for one-step-ahead forecasting.

    Every sample is a block of ``time_steps`` consecutive rows (all columns,
    the target column included) and its label is the ``target_col`` value of
    the row that immediately follows the block.

    Parameters
    ----------
    Datafile : pandas.DataFrame
        Rows in chronological order; must contain ``target_col``.
    time_steps : int
        Number of consecutive rows per window; must be at least 1.
    target_col : str, default "CO2 Emission"
        Column whose next value is predicted.

    Returns
    -------
    X : numpy.ndarray, shape (n_windows, time_steps, n_columns)
    y : numpy.ndarray, shape (n_windows,)
        ``n_windows`` is ``max(len(Datafile) - time_steps, 0)``. When it is 0
        the arrays are empty but keep the ranks above, so downstream shape
        checks still work.

    Raises
    ------
    ValueError
        If ``time_steps`` is smaller than 1.
    KeyError
        If ``target_col`` is not a column of ``Datafile``.
    """
    if time_steps < 1:
        raise ValueError("time_steps must be a positive integer")

    # Convert once instead of going through .iloc for every window / label.
    feature_rows = Datafile.to_numpy()
    target_values = Datafile[target_col].to_numpy()

    n_windows = max(len(Datafile) - time_steps, 0)
    if n_windows == 0:
        empty_X = np.empty((0, time_steps, feature_rows.shape[1]), dtype=feature_rows.dtype)
        empty_y = np.empty((0,), dtype=target_values.dtype)
        return empty_X, empty_y

    X = np.stack([feature_rows[start:start + time_steps] for start in range(n_windows)])
    # The label of window `start` is the row at `start + time_steps`; copy so
    # the result never aliases the DataFrame's own buffer.
    y = target_values[time_steps:].copy()
    return X,y

In [None]:
def months_converter(DataFile):
    """Replace the labels in ``DataFile['Month']`` by month numbers, in place.

    English month names and three-letter abbreviations (any case, surrounding
    whitespace ignored) as well as the strings/integers 1-12 (optionally
    zero-padded, e.g. "07") are mapped to their calendar number, so the result
    does not depend on which month happens to appear first in the data.

    If at least one label is not recognised, the function falls back to
    numbering the distinct labels 1, 2, 3, ... in order of first appearance.

    Parameters
    ----------
    DataFile : pandas.DataFrame
        Must contain a 'Month' column; that column is overwritten.

    Returns
    -------
    None
    """
    month_names = (
        "january", "february", "march", "april", "may", "june",
        "july", "august", "september", "october", "november", "december",
    )
    calendar_lookup = {}
    for number, name in enumerate(month_names, start=1):
        calendar_lookup[name] = number
        calendar_lookup[name[:3]] = number
        calendar_lookup[str(number)] = number
        calendar_lookup[f"{number:02d}"] = number

    unique_months = DataFile['Month'].unique()
    normalized = {month: str(month).strip().lower() for month in unique_months}

    if all(key in calendar_lookup for key in normalized.values()):
        months_dict = {month: calendar_lookup[key] for month, key in normalized.items()}
    else:
        # Unknown labels: number the distinct values by order of appearance.
        months_dict = {
            month:idx+1 for idx,month in enumerate(unique_months)
        }
    DataFile['Month'] = DataFile['Month'].map(months_dict)

In [None]:
def lstm_architecture(INPUT_SHAPE,LR):
    """Build, summarise and compile a small LSTM regression network.

    The network is: input of shape ``INPUT_SHAPE`` -> LSTM with 32 units and
    ReLU activation -> Dropout(0.4) -> Dense(1) with linear activation.
    It is compiled with the Adam optimizer at learning rate ``LR``, mean
    squared error as loss and mean absolute error as metric.

    Calling this function prints the model summary as a side effect.

    Parameters
    ----------
    INPUT_SHAPE : tuple
        Shape of one sample, passed unchanged to ``Input(shape=...)``.
    LR : float
        Learning rate handed to ``Adam``.

    Returns
    -------
    The compiled Keras ``Model``.
    """
    inputs = Input(shape=INPUT_SHAPE)
    recurrent_features = LSTM(32, activation='relu')(inputs)
    regularised = Dropout(0.4)(recurrent_features)
    prediction = Dense(1, activation='linear')(regularised)

    network = Model(inputs, prediction)
    network.summary()
    network.compile(
        optimizer=Adam(learning_rate=LR),
        loss=MeanSquaredError(),
        metrics=[MeanAbsoluteError()],
    )
    return network

In [None]:
# Load the raw emissions table; the path is relative to the current working directory.
DataFile = pd.read_csv("Emission.csv")

# Preview the first rows to check that the file was parsed as expected.
DataFile.head()

In [None]:
# Number of missing values in each column.
print(DataFile.isnull().sum())

In [None]:
# Number of rows that are exact duplicates of an earlier row.
print(DataFile.duplicated().sum())

Alright, there are no null values and no duplicates, but the "Year-Month" column packs two pieces of information into one string, so it is better to split it into two columns ("Year" and "Month").

In [None]:
# Guarded so the cell can be re-run: after the first run 'Year-Month' no longer
# exists and the split/drop below would raise a KeyError.
if 'Year-Month' in DataFile.columns:
    # The part before the '-' becomes 'Year', the part after it 'Month'.
    DataFile[['Year', 'Month']] = DataFile['Year-Month'].str.split('-', expand=True)

    DataFile.drop(columns=['Year-Month'], inplace=True)

    # Replace the month labels by numbers (mutates DataFile in place).
    months_converter(DataFile)

# Force every column to a numeric dtype; entries that cannot be parsed become NaN.
for col in DataFile.columns:
    DataFile[col] = pd.to_numeric(DataFile[col],errors='coerce')
print(DataFile)

Next we perform a visual analysis of the dataset, but first we save the cleaned dataset to a new CSV file.

In [None]:
# Write the cleaned table to disk without the index column.
DataFile.to_csv("New Emission.csv", index=False)

In [None]:
# Reload the cleaned table that was written by the previous cell.
DataFile = pd.read_csv("New Emission.csv")

# Preview the first rows of the reloaded table.
DataFile.head()

In [None]:
# Show the dtype pandas inferred for each column after the reload.
print(DataFile.dtypes)

In [None]:
# Line chart of the emission values against their year, drawn through the
# explicit Axes interface.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(
    DataFile["Year"],
    DataFile["CO2 Emission"],
    marker="o",
    linestyle="-",
    color="b",
)

# Axis labels, title and grid
ax.set_xlabel("Year")
ax.set_ylabel("CO2 Emission (ppm)")
ax.set_title("CO2 Emission Over the Years")
ax.grid(True)

# Render the figure
plt.show()

In [None]:
# Bar chart of the emission values grouped by month number.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=DataFile, x="Month", y="CO2 Emission", palette="coolwarm", ax=ax)

# Axis labels and title
ax.set_xlabel("Month")
ax.set_ylabel("CO2 Emission (ppm)")
ax.set_title("CO2 Emission by Month")

# Render the figure
plt.show()

In [None]:
# Bar chart of the emission values grouped by year, on a wider canvas.
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(data=DataFile, x="Year", y="CO2 Emission", palette="coolwarm", ax=ax)

# Axis labels and title
ax.set_xlabel("Year")
ax.set_ylabel("CO2 Emission (ppm)")
ax.set_title("CO2 Emission by Year")

# Tilt the year tick labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

# Render the figure
plt.show()

The per-year figure is crowded and hard to read, so we will group the years into ranges.

In [None]:
# First and last year present in the data; they guide the bin edges chosen next.
year_column = DataFile["Year"]
min_year, max_year = year_column.min(), year_column.max()

print(min_year, max_year, sep="\n")

In [None]:
# Five-year bin edges from 1970 to 2015; with right=True each bin is (lower, upper].
bins = list(range(1970, 2016, 5))

# One label per bin in the form "<lower + 1>-<upper>", e.g. "1971-1975".
labels = [f"{lower + 1}-{upper}" for lower, upper in zip(bins[:-1], bins[1:])]

DataFile["Year Range"] = pd.cut(DataFile["Year"], bins=bins, labels=labels, right=True)
print(DataFile[["Year", "Year Range"]].head())

In [None]:
# Preview the table, which now also carries the 'Year Range' column.
DataFile.head()

In [None]:
# Bar chart of the emission values grouped by five-year range (wide canvas).
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(data=DataFile, x="Year Range", y="CO2 Emission", palette="coolwarm", ax=ax)

# Axis labels and title
ax.set_xlabel("Year Range")
ax.set_ylabel("CO2 Emission (ppm)")
ax.set_title("CO2 Emission by Year")

# Tilt the range labels so they stay readable.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

plt.show()


In [None]:
# 'Year Range' was only a plotting helper; remove it before modelling.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
DataFile.drop(columns=['Year Range'], inplace=True, errors='ignore')

In [None]:
# Preview the table that is fed into the windowing step.
DataFile.head()

### LSTM AND TRANSFORMERS


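We have 486 rows; with a window of 3 time steps this yields 483 samples, which are split chronologically:
- train 80% = int(len(DataFile)*0.8)+1 => 389
- test 10% =  int(len(DataFile)*0.1) => 48
- validation ~10% = the samples remaining after train and test => 46 (slightly fewer than int(len(DataFile)*0.1) = 48, because windowing removes 3 samples)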

#### First we build the sliding-window samples (the string columns were already converted into numerical values above)

In [None]:
# Number of consecutive rows that form one input window.
time_step = 3
# X holds the windows (all columns of DataFile); y holds the 'CO2 Emission'
# value of the row that follows each window.
X,y = X_y_forecasting_splits(DataFile,time_step)

In [None]:
# Shape of one sample, (time steps, features). Read from the windowed array so
# it cannot drift from `time_step` or from the number of DataFile columns.
INPUT_SHAPE = tuple(X.shape[1:])
LR = 0.01       # learning rate passed to Adam
EPOCHS = 100    # epochs per call to fit
N_SPLITS = 5    # number of folds for TimeSeriesSplit

In [None]:
# Chronological hold-out split. The sizes are computed from the row count of
# DataFile; the validation part is whatever remains after train and test.
n_rows = len(DataFile)
train_size = int(n_rows*0.8) + 1
test_val_size = int(n_rows*0.1)
test_end = train_size + test_val_size

X_train, X_test, X_val = X[:train_size], X[train_size:test_end], X[test_end:]
y_train, y_test, y_val = y[:train_size], y[train_size:test_end], y[test_end:]

print(f'train size is : {train_size}, test val size is : {test_val_size}')
for split_name, split_X, split_y in (
    ('train', X_train, y_train),
    ('test', X_test, y_test),
    ('val', X_val, y_val),
):
    print(f'{split_name} : {split_X.shape} , {split_y.shape}')

In [None]:
# Expanding-window cross-validation.
#
# Fixes compared with the previous version of this cell:
#   * each fold now trains and validates on ITS OWN slices (before, the fold
#     tensors were built but every fold fitted on the global X_train and
#     validated on X_val, so all folds were identical);
#   * a fresh model is built per fold, so weights learned in one fold no
#     longer leak into the next;
#   * the folds are cut from the training portion only, so the held-out
#     test/validation slices take no part in cross-validation.
# After the loop `lstm_model` refers to the model of the last (largest) fold.
time_series_split_folds = TimeSeriesSplit(n_splits=N_SPLITS)
performance = []
for fold ,(training_idx, validation_idx) in enumerate(time_series_split_folds.split(X_train,y_train)):

    X_train_cv = tf.convert_to_tensor(X_train[training_idx], dtype=tf.float32)
    X_val_cv = tf.convert_to_tensor(X_train[validation_idx], dtype=tf.float32)
    y_train_cv = tf.convert_to_tensor(y_train[training_idx], dtype=tf.float32)
    y_val_cv = tf.convert_to_tensor(y_train[validation_idx], dtype=tf.float32)

    # Drop the previous fold's Keras state, then start from new random weights.
    tf.keras.backend.clear_session()
    lstm_model = lstm_architecture(INPUT_SHAPE,LR)

    lstm_model.fit(X_train_cv,y_train_cv,epochs=EPOCHS,validation_data=(X_val_cv,y_val_cv),verbose=1)
    val_loss, val_mae = lstm_model.evaluate(X_val_cv, y_val_cv,verbose=0)

    performance.append({
    "fold": fold,
    "val_loss": val_loss,
    "val_mae": val_mae,
    # Kept for backward compatibility: this value is the MAE, not an accuracy.
    "val_accuracy": val_mae,
    })
    print(f'Fold {fold} , val_loss is : {val_loss:.2f} , MAE is : {val_mae:.2f}')