In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [32]:
def X_y_forecasting_splits(Datafile,time_steps):
    X,y = list(),list()
    for start in range(len(Datafile)):
        end = start+time_steps 
        if end>len(Datafile)-1:
            break
        X.append(Datafile.iloc[start:end].values)
        y.append(Datafile.iloc[end]["CO2 Emission"])
    return np.array(X),np.array(y)

In [33]:
DataFile = pd.read_csv("Emission.csv")

DataFile.head()

Unnamed: 0,Year-Month,CO2 Emission
0,1973-Jan,106.363
1,1973-Feb,101.76
2,1973-Mar,110.553
3,1973-Apr,104.734
4,1973-May,114.897


In [34]:
print(DataFile.isnull().sum())

Year-Month      0
CO2 Emission    0
dtype: int64


In [35]:
print(DataFile.duplicated().sum())

0


Alright, there are no null values and no duplicates but there is something wrong with the "Year-Month" column, it's better to split it into two

In [36]:
DataFile[['Year', 'Month']] = DataFile['Year-Month'].str.split('-', expand=True)


DataFile.drop(columns=['Year-Month'], inplace=True)

print(DataFile)

     CO2 Emission  Year Month
0         106.363  1973   Jan
1         101.760  1973   Feb
2         110.553  1973   Mar
3         104.734  1973   Apr
4         114.897  1973   May
..            ...   ...   ...
481       134.243  2013   Feb
482       153.078  2013   Mar
483       149.442  2013   Apr
484       156.356  2013   May
485       152.814  2013   Jun

[486 rows x 3 columns]


Now we need to perform visual analysis on our dataset, but first we need to create a csv of our new dataset

In [None]:
DataFile.to_csv("New Emission.csv", index=False)

In [None]:
DataFile = pd.read_csv("New Emission.csv")

DataFile.head()

In [None]:
print(DataFile.dtypes)

In [None]:
plt.figure(figsize=(8, 5))
plt.plot(DataFile["Year"], DataFile["CO2 Emission"], marker="o", linestyle="-", color="b")

# Labels and Title
plt.xlabel("Year")
plt.ylabel("CO2 Emission (ppm)")
plt.title("CO2 Emission Over the Years")
plt.grid(True)

# Show the plot
plt.show()

In [None]:
plt.figure(figsize=(8, 5))
sns.barplot(x=DataFile["Month"], y=DataFile["CO2 Emission"], palette="coolwarm")

# Labels and Title
plt.xlabel("Month")
plt.ylabel("CO2 Emission (ppm)")
plt.title("CO2 Emission by Month")

# Show the plot
plt.show()

In [None]:
plt.figure(figsize=(12, 5))
sns.barplot(x=DataFile["Year"], y=DataFile["CO2 Emission"], palette="coolwarm")

# Labels and Title
plt.xlabel("Year")
plt.ylabel("CO2 Emission (ppm)")
plt.title("CO2 Emission by Year")

plt.xticks(rotation=45, ha="right")

# Show the plot
plt.show()

It's kind of a complex figure so we will group the years into ranges

In [None]:
min_year = DataFile["Year"].min()
max_year = DataFile["Year"].max()

print(min_year)
print(max_year)

In [None]:
bins = [1970, 1975, 1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015]

labels = ["1971-1975", "1976-1980", "1981-1985", "1986-1990", "1991-1995", "1996-2000", "2001-2005", "2006-2010", "2011-2015"]

DataFile["Year Range"] = pd.cut(DataFile["Year"], bins=bins, labels=labels, right=True)
print(DataFile[["Year", "Year Range"]].head())

In [None]:
DataFile.head()

In [None]:
plt.figure(figsize=(12, 5))  # Increase width
sns.barplot(x=DataFile["Year Range"], y=DataFile["CO2 Emission"], palette="coolwarm")

plt.xlabel("Year Range")
plt.ylabel("CO2 Emission (ppm)")
plt.title("CO2 Emission by Year")

plt.xticks(rotation=45, ha="right")  # Rotate labels for better spacing

plt.show()


In [None]:
DataFile.drop(columns=['Year Range'], inplace=True)

In [7]:
DataFile.head()

Unnamed: 0,CO2 Emission,Year,Month
0,106.363,1973,Jan
1,101.76,1973,Feb
2,110.553,1973,Mar
3,104.734,1973,Apr
4,114.897,1973,May


### LSTM AND TRANSFORMERS


We have 486 rows so :
- train 80% = int(len(DataFile)*0.8) 388
- test 10% =  int(len(DataFile)*0.1) 48
- validation 10% = int(len(DataFile)*0.1) 48

In [None]:
time_step = 3
X,y = X_y_forecasting_splits(DataFile,time_step)
train_size = int(len(DataFile)*0.8) + 1 
test_val_size = int(len(DataFile)*0.1)

X_train,y_train = X[:train_size],y[:train_size]
X_test,y_test= X[train_size:train_size+test_val_size],y[train_size:train_size+test_val_size]
X_val,y_val = X[train_size+test_val_size:],y[train_size+test_val_size:]

print(f'train size is : {train_size}, test val size is : {test_val_size}')
print(f'train : {X_train.shape} , {y_train.shape}')
print(f'test : {X_test.shape} , {y_test.shape}')
print(f'val : {X_val.shape} , {y_val.shape}')

(483, 3, 3)
train size is : 389, test val size is : 48
train : (389, 3, 3) , (389,)
test : (48, 3, 3) , (48,)
val : (46, 3, 3) , (46,)


In [39]:
print(X_train[:4])
print(y_train[:3])

[[[106.363 '1973' 'Jan']
  [101.76 '1973' 'Feb']
  [110.553 '1973' 'Mar']]

 [[101.76 '1973' 'Feb']
  [110.553 '1973' 'Mar']
  [104.734 '1973' 'Apr']]

 [[110.553 '1973' 'Mar']
  [104.734 '1973' 'Apr']
  [114.897 '1973' 'May']]

 [[104.734 '1973' 'Apr']
  [114.897 '1973' 'May']
  [109.24 '1973' 'Jun']]]
[104.734 114.897 109.24 ]
