In [1]:
!pip install ucimlrepo

python(46218) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.




In [25]:
#Importing required libraries
from ucimlrepo import fetch_ucirepo 
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.model_selection import train_test_split

# Dataset 1: Bike Sharing

In [3]:
# Download the Bike Sharing dataset from the UCI repository (dataset id 275).
bike_sharing = fetch_ucirepo(id=275)

# Features and targets are exposed as two separate pandas DataFrames.
X_bike = bike_sharing.data.features
y_bike = bike_sharing.data.targets

# Put them side by side (column-wise) so the whole dataset lives in one frame.
bike_df = pd.concat(objs=[X_bike, y_bike], axis="columns")


Characteristic 1 & 2.

In [4]:
# DataFrame.shape is (rows, columns); unpack both counts in a single step.
num_records, num_variables = bike_df.shape
print(f"Number of variables (columns): {num_variables}")
print(f"Number of records (rows): {num_records}\n")

Number of variables (columns): 14
Number of records (rows): 17379



Characteristic 3.

In [5]:
# Preview the first ten records of the bike data.
bike_df.head(n=10)

Unnamed: 0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,16
1,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,40
2,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,32
3,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,13
4,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,1
5,2011-01-01,1,0,1,5,0,6,0,2,0.24,0.2576,0.75,0.0896,1
6,2011-01-01,1,0,1,6,0,6,0,1,0.22,0.2727,0.8,0.0,2
7,2011-01-01,1,0,1,7,0,6,0,1,0.2,0.2576,0.86,0.0,3
8,2011-01-01,1,0,1,8,0,6,0,1,0.24,0.2879,0.75,0.0,8
9,2011-01-01,1,0,1,9,0,6,0,1,0.32,0.3485,0.76,0.0,14


In [6]:
# Show the dtype pandas inferred for every column; the trailing "\n" argument
# reproduces the blank spacer lines after the listing.
print("Data types of each column:", bike_df.dtypes, "\n", sep="\n")

Data types of each column:
dteday         object
season          int64
yr              int64
mnth            int64
hr              int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
cnt             int64
dtype: object




| Column      | Type         | Subtype            | Explanation |
|-------------|--------------|--------------------|-------------|
| index       | Categorical  | Nominal            | Row identifier with no numeric meaning |
| dteday      | Categorical  | Nominal (date)     | Treated as a label unless decomposed |
| season      | Categorical  | Ordinal            | Encoded seasons with natural order (1–4) |
| yr          | Categorical  | Ordinal            | 0 = 2011, 1 = 2012 (progression in time) |
| mnth        | Categorical  | Ordinal            | Months have natural order (1–12) |
| hr          | Categorical  | Ordinal            | Hours of day (0–23) with natural order |
| holiday     | Categorical  | Binary Nominal     | Holiday flag (0/1) |
| weekday     | Categorical  | Nominal            | Days of week (0–6), no natural order |
| workingday  | Categorical  | Binary Nominal     | Working day flag (0/1) |
| weathersit  | Categorical  | Ordinal            | Weather categories ranked by severity (1–4) |
| temp        | Numerical    | Continuous         | Normalized temperature |
| atemp       | Numerical    | Continuous         | Normalized “feels‑like” temperature |
| hum         | Numerical    | Continuous         | Normalized humidity |
| windspeed   | Numerical    | Continuous         | Normalized windspeed |
| cnt         | Numerical    | Discrete           | Count of total bike rentals |


Characteristic 4.

In [7]:

# Summary statistics for bike_df, printed as plain text and followed by
# blank spacer lines.
print("Descriptive (summary) statistics:", bike_df.describe(), "\n", sep="\n")

Descriptive (summary) statistics:
             season            yr          mnth            hr       holiday  \
count  17379.000000  17379.000000  17379.000000  17379.000000  17379.000000   
mean       2.501640      0.502561      6.537775     11.546752      0.028770   
std        1.106918      0.500008      3.438776      6.914405      0.167165   
min        1.000000      0.000000      1.000000      0.000000      0.000000   
25%        2.000000      0.000000      4.000000      6.000000      0.000000   
50%        3.000000      1.000000      7.000000     12.000000      0.000000   
75%        3.000000      1.000000     10.000000     18.000000      0.000000   
max        4.000000      1.000000     12.000000     23.000000      1.000000   

            weekday    workingday    weathersit          temp         atemp  \
count  17379.000000  17379.000000  17379.000000  17379.000000  17379.000000   
mean       3.003683      0.682721      1.425283      0.496987      0.475775   
std        2.0057

Characteristic 5.

In [8]:
# Count nulls per column and list only the columns that actually contain any.
missing_values = bike_df.isnull().sum()
print("Missing values per column:")
print(missing_values.loc[missing_values.gt(0)])
# Either confirm that the frame is complete or emit a blank spacer line.
print("No missing values found.\n" if missing_values.sum() == 0 else "\n")

# Count the rows flagged as duplicates by DataFrame.duplicated().
duplicate_rows = bike_df.duplicated().sum()
print(f"Total count of duplicate rows: {duplicate_rows}\n")

# List the column names so potentially irrelevant columns can be spotted.
print("Column names in bike_df:", bike_df.columns.tolist(), "\n", sep="\n")


Missing values per column:
Series([], dtype: int64)
No missing values found.

Total count of duplicate rows: 0

Column names in bike_df:
['dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'cnt']




Characteristic 6.

Balancing assessment: The target variable 'cnt' is a numerical count variable (discrete, but spanning a wide range of values) representing bike rental counts. This makes the task a regression problem, not a classification problem. Therefore, dataset balancing (e.g., handling class imbalance) is not relevant for this task.

In [9]:
# These column groups were identified from the summary statistics.
numerical_features = ['temp', 'atemp', 'hum', 'windspeed']
ordinal_features = ['hr', 'mnth', 'season', 'weathersit']
binary_features = ['holiday', 'workingday']  # already 0/1 -> do NOT scale
target = ['cnt']  # regression target, kept in its original units

# NOTE(review): 'yr' (0/1) and 'weekday' (0-6) appear in none of the lists above,
# so they pass through unscaled -- confirm whether 'weekday' should be scaled or
# one-hot encoded before modelling.

# Only these columns are rescaled to the [0, 1] range.
features_to_scale = numerical_features + ordinal_features

# Learn the min/max from the training portion only. Fitting on the full frame
# would leak test-set statistics into preprocessing. This assumes the 70/30
# split is chronological (first 70% of the rows) -- TODO confirm.
bike_train_rows = int(0.70 * len(bike_df))

scaler = MinMaxScaler()
scaler.fit(bike_df[features_to_scale].iloc[:bike_train_rows])

# .copy() already carries 'cnt' and every other untouched column over as-is,
# so no separate re-assignment of the target is needed.
bike_scaled_df = bike_df.copy()
bike_scaled_df[features_to_scale] = scaler.transform(bike_df[features_to_scale])

bike_scaled_df.head()


Unnamed: 0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,2011-01-01,0.0,0,0.0,0.0,0,6,0,0.0,0.22449,0.2879,0.81,0.0,16
1,2011-01-01,0.0,0,0.0,0.043478,0,6,0,0.0,0.204082,0.2727,0.8,0.0,40
2,2011-01-01,0.0,0,0.0,0.086957,0,6,0,0.0,0.204082,0.2727,0.8,0.0,32
3,2011-01-01,0.0,0,0.0,0.130435,0,6,0,0.0,0.22449,0.2879,0.75,0.0,13
4,2011-01-01,0.0,0,0.0,0.173913,0,6,0,0.0,0.22449,0.2879,0.75,0.0,1


Characteristic 7.

In [10]:
# Size of a hypothetical 70/30 split of the bike data.
# The row count is read from bike_df directly instead of the shared `num_records`
# variable, which the second dataset's section reassigns; re-running this cell
# after that section would otherwise report the wrong dataset's sizes.
bike_record_count = len(bike_df)
train_size = int(0.70 * bike_record_count)
test_size = bike_record_count - train_size
print("If a 70/30 split were performed:")
print(f"  Training set records (70%): {train_size}")
print(f"  Test set records (30%): {test_size}")

If a 70/30 split were performed:
  Training set records (70%): 12165
  Test set records (30%): 5214


Dataset preprocessing:
* Convert to supervised learning (sliding window)
* Create sequences (e.g., 24‑hour window → next hour prediction)

In [None]:
def make_sequences(features, targets, lookback):
    """Build sliding-window samples for next-step prediction.

    Sample ``i`` pairs the window ``features[i : i + lookback]`` with the target
    that immediately follows it, ``targets[i + lookback]``.

    Parameters
    ----------
    features : array-like, shape (n_steps, n_features)
        Time-ordered input features.
    targets : array-like, shape (n_steps,)
        Time-ordered target values aligned with ``features``.
    lookback : int
        Number of consecutive time steps in each input window.

    Returns
    -------
    X : numpy.ndarray, shape (n_steps - lookback, lookback, n_features)
    y : numpy.ndarray, shape (n_steps - lookback,)

    Raises
    ------
    ValueError
        If ``lookback`` is not positive, the inputs differ in length, or there
        are too few time steps to build a single window.
    """
    features = np.asarray(features)
    targets = np.asarray(targets)
    if lookback <= 0:
        raise ValueError("lookback must be a positive integer")
    if len(features) != len(targets):
        raise ValueError("features and targets must have the same length")
    n_windows = len(features) - lookback
    if n_windows <= 0:
        raise ValueError("need more time steps than the lookback window")
    X = np.stack([features[start:start + lookback] for start in range(n_windows)])
    y = targets[lookback:]
    return X, y


# Define lookback window (24 hours for hourly data)
lookback_window = 24

# Keep numeric columns only: 'dteday' is a string column, and including it (as
# `bike_scaled_df.values` did) yields an object-dtype array with date strings
# in the model inputs. The target is selected by name instead of by position.
bike_feature_columns = [
    column
    for column in bike_scaled_df.select_dtypes(include="number").columns
    if column != "cnt"
]

# Numeric matrix with the target ('cnt') as the last column; float32 is the
# usual input dtype for TensorFlow / PyTorch models.
bike_data = bike_scaled_df[bike_feature_columns + ["cnt"]].to_numpy(dtype=np.float32)

# Input: 24-hour window of all features except the target.
# Target: the bike count (cnt) of the hour right after the window.
X_bike_seq, y_bike_seq = make_sequences(
    bike_data[:, :-1], bike_data[:, -1], lookback_window
)

print(f"Sequence shape - X: {X_bike_seq.shape}, y: {y_bike_seq.shape}")
print(f"Number of sequences: {X_bike_seq.shape[0]}")
print(f"Time steps per sequence: {X_bike_seq.shape[1]}")
print(f"Number of features: {X_bike_seq.shape[2]}\n")


## Technique 1: Vanilla RNN

### TensorFlow

### PyTorch

## Technique 2: ResNet

### TensorFlow

### PyTorch

# Dataset 2: Individual Household Electric Power Consumption

In [12]:
# fetch dataset
individual_household_electric_power_consumption = fetch_ucirepo(id=235)

# data (as pandas dataframes)
electric_X = individual_household_electric_power_consumption.data.features
electric_y = individual_household_electric_power_consumption.data.targets
electric_df = pd.concat([electric_X, electric_y], axis=1)

  df = pd.read_csv(data_url)


Characteristic 1 & 2.

In [13]:
# DataFrame.shape is (rows, columns); unpack both counts in a single step.
num_records, num_variables = electric_df.shape
print(f"Number of variables (columns): {num_variables}")
print(f"Number of records (rows): {num_records}\n")

Number of variables (columns): 9
Number of records (rows): 2075259



Characteristic 3.

In [14]:
# Preview the first ten records of the household power data.
electric_df.head(n=10)

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,16/12/2006,17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0
5,16/12/2006,17:29:00,3.52,0.522,235.02,15.0,0.0,2.0,17.0
6,16/12/2006,17:30:00,3.702,0.52,235.09,15.8,0.0,1.0,17.0
7,16/12/2006,17:31:00,3.7,0.52,235.22,15.8,0.0,1.0,17.0
8,16/12/2006,17:32:00,3.668,0.51,233.99,15.8,0.0,1.0,17.0
9,16/12/2006,17:33:00,3.662,0.51,233.86,15.8,0.0,2.0,16.0


In [16]:
# Show the dtype pandas inferred for every column; the trailing "\n" argument
# reproduces the blank spacer lines after the listing.
print("Data types of each column:", electric_df.dtypes, "\n", sep="\n")

Data types of each column:
Date                      object
Time                      object
Global_active_power       object
Global_reactive_power     object
Voltage                   object
Global_intensity          object
Sub_metering_1            object
Sub_metering_2            object
Sub_metering_3           float64
dtype: object




Some columns are stored as `object` (string) dtype rather than a numerical type, so they have to be parsed into numbers before they can be treated as the kind of data they actually are.

In [18]:
# Measurement columns that must be numeric for the analysis.
numeric_cols = [
    'Global_active_power', 'Global_reactive_power', 'Voltage', 'Global_intensity',
    'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3',
]

# Convert one column at a time; errors='coerce' turns entries that cannot be
# parsed as numbers (e.g. '?') into NaN instead of raising.
for column_name in numeric_cols:
    electric_df[column_name] = pd.to_numeric(electric_df[column_name], errors='coerce')

| Column                | Type        | Subtype        | Explanation |
|-----------------------|-------------|----------------|-------------|
| Date                  | Categorical | Nominal        | Calendar date label, no numeric meaning |
| Time                  | Categorical | Nominal        | Time-of-day label, no numeric meaning |
| Global_active_power   | Numerical   | Continuous     | Real-valued household active power consumption (kW) |
| Global_reactive_power | Numerical   | Continuous     | Real-valued reactive power consumption (kW) |
| Voltage               | Numerical   | Continuous     | Real-valued voltage measurement (volts) |
| Global_intensity      | Numerical   | Continuous     | Real-valued current intensity (amps) |
| Sub_metering_1        | Numerical   | Discrete       | Integer energy consumption count for kitchen appliances |
| Sub_metering_2        | Numerical   | Discrete       | Integer energy consumption count for laundry appliances |
| Sub_metering_3        | Numerical   | Discrete       | Integer energy consumption count for water heater & AC |


Characteristic 4.

In [19]:

# Summary statistics for electric_df, printed as plain text and followed by
# blank spacer lines.
print("Descriptive (summary) statistics:", electric_df.describe(), "\n", sep="\n")

Descriptive (summary) statistics:
       Global_active_power  Global_reactive_power       Voltage  \
count         2.049280e+06           2.049280e+06  2.049280e+06   
mean          1.091615e+00           1.237145e-01  2.408399e+02   
std           1.057294e+00           1.127220e-01  3.239987e+00   
min           7.600000e-02           0.000000e+00  2.232000e+02   
25%           3.080000e-01           4.800000e-02  2.389900e+02   
50%           6.020000e-01           1.000000e-01  2.410100e+02   
75%           1.528000e+00           1.940000e-01  2.428900e+02   
max           1.112200e+01           1.390000e+00  2.541500e+02   

       Global_intensity  Sub_metering_1  Sub_metering_2  Sub_metering_3  
count      2.049280e+06    2.049280e+06    2.049280e+06    2.049280e+06  
mean       4.627759e+00    1.121923e+00    1.298520e+00    6.458447e+00  
std        4.444396e+00    6.153031e+00    5.822026e+00    8.437154e+00  
min        2.000000e-01    0.000000e+00    0.000000e+00    0.00000

Characteristic 5.

In [20]:
# Count nulls per column and list only the columns that actually contain any.
missing_values = electric_df.isnull().sum()
print("Missing values per column:")
print(missing_values.loc[missing_values.gt(0)])
# Either confirm that the frame is complete or emit a blank spacer line.
print("No missing values found.\n" if missing_values.sum() == 0 else "\n")

# Count the rows flagged as duplicates by DataFrame.duplicated().
duplicate_rows = electric_df.duplicated().sum()
print(f"Total count of duplicate rows: {duplicate_rows}\n")

# List the column names so potentially irrelevant columns can be spotted.
print("Column names in electric_df:", electric_df.columns.tolist(), "\n", sep="\n")


Missing values per column:
Global_active_power      25979
Global_reactive_power    25979
Voltage                  25979
Global_intensity         25979
Sub_metering_1           25979
Sub_metering_2           25979
Sub_metering_3           25979
dtype: int64


Total count of duplicate rows: 0

Column names in electric_df:
['Date', 'Time', 'Global_active_power', 'Global_reactive_power', 'Voltage', 'Global_intensity', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']




Characteristic 6.

Balancing assessment: The target variable 'Global_active_power' is a continuous numerical variable representing household power consumption in kilowatts. This indicates a regression problem, not a classification problem. Therefore, dataset balancing (e.g., handling class imbalance) is not relevant for this task.

In [21]:
# These columns were identified from the summary statistics. The target,
# 'Global_active_power', is deliberately excluded and stays in original units.
numeric_features = [
    'Global_reactive_power', 'Voltage', 'Global_intensity',
    'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3',
]

# Learn the min/max from the training portion only. Fitting on the full frame
# would leak test-set statistics into preprocessing. This assumes the 70/30
# split is chronological (first 70% of the rows) -- TODO confirm.
electric_train_rows = int(0.70 * len(electric_df))

scaler = MinMaxScaler()
scaler.fit(electric_df[numeric_features].iloc[:electric_train_rows])

# .copy() already carries 'Global_active_power', 'Date' and 'Time' over as-is,
# so no separate re-assignment of the target is needed.
# MinMaxScaler disregards NaN when fitting and keeps it when transforming, so
# the rows with missing measurements stay NaN here and still need handling
# (drop / impute) before modelling.
electric_scaled_df = electric_df.copy()
electric_scaled_df[numeric_features] = scaler.transform(electric_df[numeric_features])

electric_scaled_df.head()


Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,16/12/2006,17:24:00,4.216,0.300719,0.37609,0.377593,0.0,0.0125,0.548387
1,16/12/2006,17:25:00,5.36,0.313669,0.336995,0.473029,0.0,0.0125,0.516129
2,16/12/2006,17:26:00,5.374,0.358273,0.32601,0.473029,0.0,0.025,0.548387
3,16/12/2006,17:27:00,5.388,0.361151,0.340549,0.473029,0.0,0.0125,0.548387
4,16/12/2006,17:28:00,3.666,0.379856,0.403231,0.323651,0.0,0.0125,0.548387


Characteristic 7.

In [22]:
# Size of a hypothetical 70/30 split of the household power data.
# The row count is read from electric_df directly instead of the shared
# `num_records` variable, so this cell does not depend on which dataset's
# shape cell was executed last.
electric_record_count = len(electric_df)
train_size = int(0.70 * electric_record_count)
test_size = electric_record_count - train_size
print("If a 70/30 split were performed:")
print(f"  Training set records (70%): {train_size}")
print(f"  Test set records (30%): {test_size}")

If a 70/30 split were performed:
  Training set records (70%): 1452681
  Test set records (30%): 622578


## Technique 1: Vanilla RNN

### TensorFlow

### PyTorch

## Technique 2: ResNet

### TensorFlow

### PyTorch