# This script includes: 

## 1. Import libraries
## 2. Import data
## 3. Data pre-processing
    3.1 Drop date and month columns
    3.2 Ensure every set of observations is the same shape for each station
    3.3 Reshape data
    3.4 Split data
## 4. CNN model on unscaled data
## 5. CNN model on scaled data
    5.1 Scale the data
    5.2 Create CNN model
    5.3 Train model and evaluate
    5.4 Confusion matrices

## 1. Import libraries

In [3]:
# Import libraries

import pandas as pd
import numpy as np
import os
import seaborn as sns
import operator
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from numpy import unique
from numpy import reshape
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Conv1D, Conv2D, Dense, Dropout, BatchNormalization, Flatten, MaxPooling1D
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

## 2. Import data

In [5]:
# Import data

path = r'C:\Users\ashle\Documents\10-2024 Climate Analysis'

df_unscaled = pd.read_csv(os.path.join(path, '02 Data Sets', 'Dataset-weather-prediction-dataset-processed.csv'))
df_pleasant = pd.read_csv(os.path.join(path, '02 Data Sets', 'Dataset-Answers-Weather_Prediction_Pleasant_Weather.csv'))

I chose to use a Convolutional Neural Network (CNN) for this data because it lacks a temporal component, making a Recurrent Neural Network (RNN) unnecessary. While both models could work, RNNs store present data in memory, which increases computational demands. Using a CNN is a more efficient choice, helping ClimateWins conserve memory and computational resources.

In [7]:
df_unscaled.head()

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,7,2.1,0.85,1.018,0.32,0.09,0,0.7,...,5,0.88,1.0003,0.45,0.34,0,4.7,8.5,6.0,10.9
1,19600102,1,6,2.1,0.84,1.018,0.36,1.05,0,1.1,...,7,0.91,1.0007,0.25,0.84,0,0.7,8.9,5.6,12.1
2,19600103,1,8,2.1,0.9,1.018,0.18,0.3,0,0.0,...,7,0.91,1.0096,0.17,0.08,0,0.1,10.5,8.1,12.9
3,19600104,1,3,2.1,0.92,1.018,0.58,0.0,0,4.1,...,7,0.86,1.0184,0.13,0.98,0,0.0,7.4,7.3,10.6
4,19600105,1,6,2.1,0.95,1.018,0.65,0.14,0,5.4,...,3,0.8,1.0328,0.46,0.0,0,5.7,5.7,3.0,8.4


In [8]:
df_pleasant.head()

Unnamed: 0,DATE,BASEL_pleasant_weather,BELGRADE_pleasant_weather,BUDAPEST_pleasant_weather,DEBILT_pleasant_weather,DUSSELDORF_pleasant_weather,HEATHROW_pleasant_weather,KASSEL_pleasant_weather,LJUBLJANA_pleasant_weather,MAASTRICHT_pleasant_weather,MADRID_pleasant_weather,MUNCHENB_pleasant_weather,OSLO_pleasant_weather,SONNBLICK_pleasant_weather,STOCKHOLM_pleasant_weather,VALENTIA_pleasant_weather
0,19600101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,19600102,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,19600103,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,19600104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,19600105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## 3. Data pre-processing

### 3.1 Drop Date and Month columns

In [11]:
# Drop Date and Month columns from df_unscaled

df_unscaled = df_unscaled.drop(columns=['DATE', 'MONTH'])
df_unscaled.head()

Unnamed: 0,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,7,2.1,0.85,1.018,0.32,0.09,0,0.7,6.5,0.8,...,5,0.88,1.0003,0.45,0.34,0,4.7,8.5,6.0,10.9
1,6,2.1,0.84,1.018,0.36,1.05,0,1.1,6.1,3.3,...,7,0.91,1.0007,0.25,0.84,0,0.7,8.9,5.6,12.1
2,8,2.1,0.9,1.018,0.18,0.3,0,0.0,8.5,5.1,...,7,0.91,1.0096,0.17,0.08,0,0.1,10.5,8.1,12.9
3,3,2.1,0.92,1.018,0.58,0.0,0,4.1,6.3,3.8,...,7,0.86,1.0184,0.13,0.98,0,0.0,7.4,7.3,10.6
4,6,2.1,0.95,1.018,0.65,0.14,0,5.4,3.0,-0.7,...,3,0.8,1.0328,0.46,0.0,0,5.7,5.7,3.0,8.4


In [12]:
# Drop Date and Month columns from df_pleasant

df_pleasant = df_pleasant.drop(columns=['DATE'])
df_pleasant.head()

Unnamed: 0,BASEL_pleasant_weather,BELGRADE_pleasant_weather,BUDAPEST_pleasant_weather,DEBILT_pleasant_weather,DUSSELDORF_pleasant_weather,HEATHROW_pleasant_weather,KASSEL_pleasant_weather,LJUBLJANA_pleasant_weather,MAASTRICHT_pleasant_weather,MADRID_pleasant_weather,MUNCHENB_pleasant_weather,OSLO_pleasant_weather,SONNBLICK_pleasant_weather,STOCKHOLM_pleasant_weather,VALENTIA_pleasant_weather
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### 3.2 Ensure every set of observations is the same length

In [14]:
print('df_unscaled shape: ', df_unscaled.shape)

df_unscaled shape:  (22950, 168)


In [15]:
print('df_pleasant shape: ', df_pleasant.shape)

df_pleasant shape:  (22950, 15)


In [16]:
# Drop columns in unscaled data set to align with the stations in "df_pleasant"

def drop_specific_columns(df): 
    # List of substrings to check for in column names
    substrings = ['GDANSK_', 'ROMA_', 'TOURS_']

    # Filter out columns containing any of the substrings
    columns_to_drop = [col for col in df.columns if any(sub in col for sub in substrings)]

    # Drop the identified columns and return the modified dataframe
    df = df.drop(columns=columns_to_drop)
    return df

In [17]:
# Apply drop_specific_columns function to the unscaled dataframe
df_unscaled = drop_specific_columns(df_unscaled)

In [18]:
print('df_unscaled shape: ', df_unscaled.shape)

df_unscaled shape:  (22950, 147)


In [19]:
# Observation types "wind_speed" and "snow_depth" are missing from most stations
# Remove the missing observation types. 

def remove_missing_observation_types(df):
    # Remove columns containing '_wind_speed' and '_snow_depth'
    df = df.loc[:, ~df.columns.str.contains('_wind_speed|_snow_depth')]
    return df

# Apply function to df_unscaled
df_unscaled = remove_missing_observation_types(df_unscaled)

In [20]:
df_unscaled.head()

Unnamed: 0,BASEL_cloud_cover,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,BASEL_temp_max,BELGRADE_cloud_cover,...,STOCKHOLM_temp_max,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,7,0.85,1.018,0.32,0.09,0.7,6.5,0.8,10.9,1,...,4.9,5,0.88,1.0003,0.45,0.34,4.7,8.5,6.0,10.9
1,6,0.84,1.018,0.36,1.05,1.1,6.1,3.3,10.1,6,...,5.0,7,0.91,1.0007,0.25,0.84,0.7,8.9,5.6,12.1
2,8,0.9,1.018,0.18,0.3,0.0,8.5,5.1,9.9,6,...,4.1,7,0.91,1.0096,0.17,0.08,0.1,10.5,8.1,12.9
3,3,0.92,1.018,0.58,0.0,4.1,6.3,3.8,10.6,8,...,2.3,7,0.86,1.0184,0.13,0.98,0.0,7.4,7.3,10.6
4,6,0.95,1.018,0.65,0.14,5.4,3.0,-0.7,6.0,8,...,4.3,3,0.8,1.0328,0.46,0.0,5.7,5.7,3.0,8.4


- KASSEL is missing cloud_cover
- MUNCHENB is missing pressure
- STOCKHOLM is missing humidity

In [22]:
pd.set_option('display.max_columns', None)

# Print all column names
print(list(df_unscaled.columns))

['BASEL_cloud_cover', 'BASEL_humidity', 'BASEL_pressure', 'BASEL_global_radiation', 'BASEL_precipitation', 'BASEL_sunshine', 'BASEL_temp_mean', 'BASEL_temp_min', 'BASEL_temp_max', 'BELGRADE_cloud_cover', 'BELGRADE_humidity', 'BELGRADE_pressure', 'BELGRADE_global_radiation', 'BELGRADE_precipitation', 'BELGRADE_sunshine', 'BELGRADE_temp_mean', 'BELGRADE_temp_min', 'BELGRADE_temp_max', 'BUDAPEST_cloud_cover', 'BUDAPEST_humidity', 'BUDAPEST_pressure', 'BUDAPEST_global_radiation', 'BUDAPEST_precipitation', 'BUDAPEST_sunshine', 'BUDAPEST_temp_mean', 'BUDAPEST_temp_min', 'BUDAPEST_temp_max', 'DEBILT_cloud_cover', 'DEBILT_humidity', 'DEBILT_pressure', 'DEBILT_global_radiation', 'DEBILT_precipitation', 'DEBILT_sunshine', 'DEBILT_temp_mean', 'DEBILT_temp_min', 'DEBILT_temp_max', 'DUSSELDORF_cloud_cover', 'DUSSELDORF_humidity', 'DUSSELDORF_pressure', 'DUSSELDORF_global_radiation', 'DUSSELDORF_precipitation', 'DUSSELDORF_sunshine', 'DUSSELDORF_temp_mean', 'DUSSELDORF_temp_min', 'DUSSELDORF_temp_ma

In [23]:
df_unscaled.insert(54, 'KASSEL_cloud_cover', df_unscaled['LJUBLJANA_cloud_cover'])
df_unscaled.insert(92, 'MUNCHENB_pressure', df_unscaled['SONNBLICK_pressure'])
df_unscaled.insert(118, 'STOCKHOLM_humidity', df_unscaled['OSLO_humidity'])

In [24]:
pd.set_option('display.max_columns', None)

# Print all column names
print(list(df_unscaled.columns))

['BASEL_cloud_cover', 'BASEL_humidity', 'BASEL_pressure', 'BASEL_global_radiation', 'BASEL_precipitation', 'BASEL_sunshine', 'BASEL_temp_mean', 'BASEL_temp_min', 'BASEL_temp_max', 'BELGRADE_cloud_cover', 'BELGRADE_humidity', 'BELGRADE_pressure', 'BELGRADE_global_radiation', 'BELGRADE_precipitation', 'BELGRADE_sunshine', 'BELGRADE_temp_mean', 'BELGRADE_temp_min', 'BELGRADE_temp_max', 'BUDAPEST_cloud_cover', 'BUDAPEST_humidity', 'BUDAPEST_pressure', 'BUDAPEST_global_radiation', 'BUDAPEST_precipitation', 'BUDAPEST_sunshine', 'BUDAPEST_temp_mean', 'BUDAPEST_temp_min', 'BUDAPEST_temp_max', 'DEBILT_cloud_cover', 'DEBILT_humidity', 'DEBILT_pressure', 'DEBILT_global_radiation', 'DEBILT_precipitation', 'DEBILT_sunshine', 'DEBILT_temp_mean', 'DEBILT_temp_min', 'DEBILT_temp_max', 'DUSSELDORF_cloud_cover', 'DUSSELDORF_humidity', 'DUSSELDORF_pressure', 'DUSSELDORF_global_radiation', 'DUSSELDORF_precipitation', 'DUSSELDORF_sunshine', 'DUSSELDORF_temp_mean', 'DUSSELDORF_temp_min', 'DUSSELDORF_temp_ma

In [25]:
df_unscaled.shape

(22950, 135)

In [26]:
df_pleasant.shape

(22950, 15)

In [27]:
# Export cleaned data
df_unscaled.to_csv(os.path.join(path, '02 Data Sets', 'X_weather_data_cleaned_preprocessed.csv'),index=False)
df_pleasant.to_csv(os.path.join(path, '02 Data Sets', 'y_pleasant_cleaned_preprocessed.csv'),index=False)

### 3.3 Reshape data

In [29]:
# Convert X and y into numpy arrays
X = df_unscaled.to_numpy()
y = df_pleasant.to_numpy()

# Split X into 15 groups of 9 types of observations
X = X.reshape(-1,15,9)

# Check the shapes
print('X shape: ', X.shape) # Should be (22950, 15, 9)
print('y shape: ', y.shape) # Should be (22950, 15)

X shape:  (22950, 15, 9)
y shape:  (22950, 15)


### 3.4 Split data

In [31]:
# Split the data into two sets: a training set and a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

In [32]:
# Check shape of each test and train set
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(16065, 15, 9)
(6885, 15, 9)
(16065, 15)
(6885, 15)


## 4. CNN model on unscaled data

In [34]:
X_train

array([[[ 2.    ,  0.65  ,  1.018 , ..., 20.5   , 11.9   , 27.7   ],
        [ 3.    ,  0.53  ,  1.0201, ..., 23.8   , 16.3   , 28.1   ],
        [ 4.    ,  0.67  ,  1.017 , ..., 21.6   , 14.5   , 29.    ],
        ...,
        [ 4.    ,  0.73  ,  1.0307, ...,  5.8   ,  1.6   ,  9.9   ],
        [ 5.    ,  0.93  ,  1.0156, ..., 17.9   , 13.3   , 22.8   ],
        [ 5.    ,  0.82  ,  1.0142, ..., 10.7   ,  7.9   , 13.5   ]],

       [[ 1.    ,  0.48  ,  1.018 , ..., 11.7   ,  2.5   , 19.3   ],
        [ 8.    ,  0.8   ,  1.0187, ..., 10.5   ,  8.3   , 14.2   ],
        [ 8.    ,  0.64  ,  1.0211, ...,  9.8   ,  4.5   , 15.4   ],
        ...,
        [ 1.    ,  0.56  ,  1.0322, ..., -6.1   , -8.4   , -3.7   ],
        [ 8.    ,  0.97  ,  1.0151, ...,  3.    , -0.2   ,  6.3   ],
        [ 7.    ,  0.88  ,  1.0093, ..., 10.7   ,  8.9   , 12.5   ]],

       [[ 6.    ,  0.92  ,  1.018 , ..., 12.4   ,  9.6   , 17.5   ],
        [ 3.    ,  0.62  ,  1.0265, ..., 10.9   ,  8.4   , 16.6   ],
    

In [35]:
len(X_train[0])

15

In [36]:
len(X_train[0][0])

9

In [37]:
epochs = 30
batch_size = 32
n_hidden = 16

timesteps = len(X_train[0])
input_dim = len(X_train[0][0])
n_classes = len(y_train[0])

model = Sequential()
model.add(Conv1D(n_hidden, kernel_size=2, activation='relu', input_shape=(timesteps, input_dim)))
model.add(Dense(16, activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(n_classes, activation='sigmoid'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [38]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [39]:
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1)

Epoch 1/30
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 849us/step - accuracy: 0.1195 - loss: 0.4717
Epoch 2/30
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 833us/step - accuracy: 0.1732 - loss: 0.2317
Epoch 3/30
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 833us/step - accuracy: 0.1690 - loss: 0.2134
Epoch 4/30
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 823us/step - accuracy: 0.1734 - loss: 0.2005
Epoch 5/30
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 817us/step - accuracy: 0.1776 - loss: 0.1882
Epoch 6/30
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 823us/step - accuracy: 0.1712 - loss: 0.1806
Epoch 7/30
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 821us/step - accuracy: 0.1780 - loss: 0.1760
Epoch 8/30
[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 813us/step - accuracy: 0.1800 - loss: 0.1670
Epoch 9/30
[1m503/503[

<keras.src.callbacks.history.History at 0x29a63765040>

## 5. CNN model on scaled data

### 5.1 Scale the data

In [42]:
# Scale the data
scaler = StandardScaler()

# Reshape for scaling: (samples, timesteps, features) -> (samples*timesteps, features)
X_train_reshaped = X_train.reshape(-1, X_train.shape[2])
X_test_reshaped = X_test.reshape(-1, X_test.shape[2])

# Scale and reshape back
X_train_scaled = scaler.fit_transform(X_train_reshaped).reshape(X_train.shape)
X_test_scaled = scaler.transform(X_test_reshaped).reshape(X_test.shape)

In [43]:
len(y_train[0])

15

### 5.2 Create CNN model

In [45]:
epochs = 30
batch_size = 64
n_hidden = 128
timesteps = len(X_train_scaled[0])
input_dim = len(X_train_scaled[0][0])
n_classes = len(y_train[0])

# Set up the model
model = Sequential()

# Add Conv1D layer
model.add(Conv1D(n_hidden, kernel_size=2, activation='relu', input_shape=(timesteps, input_dim)))

# Add a fully connected Dense layer
model.add(Dense(16, activation='relu'))

# Add a MaxPooling layer
model.add(MaxPooling1D())

# Flatten the output for dense layers
model.add(Flatten())

# Final layer with 15 neurons, sigmoid activiation for binary classification per station
model.add(Dense(n_classes, activation='sigmoid'))

# Compile the model with binary crossentropy loss and the Adam optimizer
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['f1_score'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


### 5.3 Train model and evaluate

In [47]:
# Train the model
model.fit(X_train_scaled, y_train, batch_size=batch_size, epochs=epochs, verbose=1)

Epoch 1/30
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - f1_score: 0.0869 - loss: 0.3822
Epoch 2/30
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - f1_score: 0.1577 - loss: 0.2031
Epoch 3/30
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 988us/step - f1_score: 0.1629 - loss: 0.1709
Epoch 4/30
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 992us/step - f1_score: 0.1740 - loss: 0.1490
Epoch 5/30
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 992us/step - f1_score: 0.1772 - loss: 0.1314
Epoch 6/30
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 992us/step - f1_score: 0.1845 - loss: 0.1203
Epoch 7/30
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 992us/step - f1_score: 0.1867 - loss: 0.1149
Epoch 8/30
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 988us/step - f1_score: 0.1903 - loss: 0.1039
Epoch 9/30
[1m252/252[0m 

<keras.src.callbacks.history.History at 0x29a68c86c30>

In [48]:
# Make predictions on the test set
predictions = model.predict(X_test_scaled)

# Print raw predictions (probabilities for each station)
print("Raw predictions:")
print(predictions)

# Threshold predictions for binary classification (set threshold at 0.5)
threshold = 0.5
binary_predictions = (predictions > threshold).astype(int)

# Print the binary predictions (0s and 1s for each station)
print("Binary predictions:")
print(binary_predictions)


[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 730us/step
Raw predictions:
[[7.1069849e-01 9.6744490e-01 1.8925735e-03 ... 1.0685947e-09
  4.0371133e-09 1.8780793e-05]
 [8.6357388e-10 1.4459071e-10 7.5933690e-11 ... 2.8637129e-10
  7.2698630e-16 4.2810411e-06]
 [8.9605427e-01 9.9203467e-01 9.0208972e-01 ... 1.8284405e-09
  2.6517832e-09 3.8527367e-03]
 ...
 [3.9318346e-07 5.9336663e-10 6.7580306e-09 ... 8.4795537e-10
  4.5863398e-16 4.1639611e-05]
 [1.1175044e-05 4.4077622e-09 2.9456177e-07 ... 6.7863010e-07
  1.8950817e-14 4.0005503e-04]
 [4.3189054e-04 7.3457096e-07 1.4023898e-07 ... 1.1664832e-08
  1.8156246e-12 5.5857521e-04]]
Binary predictions:
[[1 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [49]:
# Run a classsification report

# Generate predictions on training set
train_predictions = model.predict(X_train_scaled)
train_binary_predictions = (train_predictions > threshold).astype(int)

# Print classification reports for training and test sets
print("Training Set Classification Report:")
print(classification_report(y_train, train_binary_predictions))

print("Test Set Classification Report:")
print(classification_report(y_test, binary_predictions))

[1m503/503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 548us/step
Training Set Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      3963
           1       0.96      0.98      0.97      5633
           2       0.97      0.97      0.97      5227
           3       0.93      0.96      0.95      3144
           4       0.97      0.97      0.97      3458
           5       0.97      0.96      0.96      3529
           6       0.96      0.93      0.95      2654
           7       0.97      0.98      0.97      4519
           8       0.96      0.95      0.95      3346
           9       0.99      0.98      0.99      7153
          10       0.96      0.96      0.96      3344
          11       0.97      0.98      0.98      2544
          12       0.00      0.00      0.00         0
          13       0.98      0.98      0.98      2723
          14       0.96      0.91      0.94       833

   micro avg       0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 5.4 Confusion Matrices

In [51]:
# Define the labels (1 = pleasant, 0 = unpleasant)
labels = ['unpleasant', 'pleasant']

def confusion_matrix(Y_true, Y_pred):
    Y_true = pd.Series([labels[y] for y in Y_true.flatten()])
    Y_pred = pd.Series([labels[y] for y in Y_pred.flatten()])

    return pd.crosstab(Y_true, Y_pred, rownames=['True'], colnames=['Pred'])

# Evaluate and print the confusion matrix
y_pred = (model.predict(X_test_scaled) > 0.5).astype(int)  # Apply threshold to get binary predictions
print(confusion_matrix(y_test, y_pred))

[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 582us/step
Pred        pleasant  unpleasant
True                            
pleasant       21161         785
unpleasant       908       80421


In [1]:
#Accuracy = (TP + TN) / (TP + TN + FP + FN)

Accuracy = (21161+80421) / (21161+80421+785+908)
print('Accuracy: ', Accuracy)

Accuracy:  0.9836068748487049


In [52]:
# Weather stations list
weather_stations = ['BASEL', 'BELGRADE', 'BUDAPEST', 'DEBILT', 'DUSSELDORF', 'HEATHROW', 
                    'KASSEL', 'LJUBLJANA', 'MAASTRICHT', 'MADRID', 'MUNCHENB', 'OSLO', 'SONNBLICK', 
                    'STOCKHOLM', 'VALENTIA']

# Define the labels
labels = ['pleasant', 'unpleasant']

# Loop over each weather station to calculate and print the confusion matrix
for i, station in enumerate(weather_stations):
    # Get the true labels for the current station (y_test[:, i])
    true_labels = y_test[:, i]  # Using the index of the weather station in y_test
    
    # Get the predicted labels for the current station (binary_predictions[:, i])
    predicted_labels = binary_predictions[:, i]
    
    # Create the confusion matrix
    cm = confusion_matrix(true_labels, predicted_labels)
    
    # Convert confusion matrix to DataFrame for better readability
    cm_df = pd.DataFrame(cm, columns=labels, index=labels)
    
    # Print the confusion matrix for this station
    print(f"Confusion Matrix for {station}:")
    print(cm_df)
    print("\n" + "-"*50 + "\n")

Confusion Matrix for BASEL:
            pleasant  unpleasant
pleasant        5065         119
unpleasant        73        1628

--------------------------------------------------

Confusion Matrix for BELGRADE:
            pleasant  unpleasant
pleasant        4449          77
unpleasant        42        2317

--------------------------------------------------

Confusion Matrix for BUDAPEST:
            pleasant  unpleasant
pleasant        4608          73
unpleasant        74        2130

--------------------------------------------------

Confusion Matrix for DEBILT:
            pleasant  unpleasant
pleasant        5455         115
unpleasant        39        1276

--------------------------------------------------

Confusion Matrix for DUSSELDORF:
            pleasant  unpleasant
pleasant        5354          55
unpleasant        67        1409

--------------------------------------------------

Confusion Matrix for HEATHROW:
            pleasant  unpleasant
pleasant        5393    