# Preprocessing the datasets

In [9]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from ucimlrepo import fetch_ucirepo

seed = 42
np.random.seed(seed)
base_dir = 'c:/Users/deepa/Documents/Deeplearning CA/deep-learning-techniques-comparison-rnn-tcn'

In [10]:
bike_sharing = fetch_ucirepo(id=275)
X_bike = bike_sharing.data.features
y_bike = bike_sharing.data.targets
bike_df = pd.concat([X_bike, y_bike], axis=1)
bike_df = bike_df.drop(columns=['dteday'], errors='ignore')

har_dir = os.path.join(base_dir, 'human activity recognition using smartphones')
features_raw = []
with open(os.path.join(har_dir, 'features.txt'), 'r') as f:
    for line in f:
        features_raw.append(line.strip().split()[1])
features = [f"{f}_{i}" for i, f in enumerate(features_raw)]

X_train_har = pd.read_csv(os.path.join(har_dir, 'train/X_train.txt'), sep='\s+', header=None, names=features)
y_train_har = pd.read_csv(os.path.join(har_dir, 'train/y_train.txt'), sep='\s+', header=None, names=['activity'])
X_test_har = pd.read_csv(os.path.join(har_dir, 'test/X_test.txt'), sep='\s+', header=None, names=features)
y_test_har = pd.read_csv(os.path.join(har_dir, 'test/y_test.txt'), sep='\s+', header=None, names=['activity'])

X_har = pd.concat([X_train_har, X_test_har], ignore_index=True)
y_har = pd.concat([y_train_har, y_test_har], ignore_index=True)
har_df = pd.concat([X_har, y_har], axis=1)

power_dir = os.path.join(base_dir, 'individual household electric power consumption')
power_df = pd.read_csv(os.path.join(power_dir, 'household_power_consumption.csv'), sep=';')
power_df = power_df.drop(columns=['Date', 'Time'])
power_df.replace('?', np.nan, inplace=True)
power_df = power_df.ffill().bfill()
power_df = power_df.astype(float)

  power_df = pd.read_csv(os.path.join(power_dir, 'household_power_consumption.csv'), sep=';')


In [11]:
scaler_bike = MinMaxScaler()
bike_df_scaled = scaler_bike.fit_transform(bike_df.values)
bike_df = pd.DataFrame(bike_df_scaled, columns=bike_df.columns)

har_features = har_df.drop(columns=['activity']).values
scaler_har = StandardScaler()
har_features_scaled = scaler_har.fit_transform(har_features)
har_df = pd.concat([pd.DataFrame(har_features_scaled, columns=har_df.drop(columns=['activity']).columns), har_df[['activity']].reset_index(drop=True)], axis=1)

scaler_power = MinMaxScaler()
power_df_scaled = scaler_power.fit_transform(power_df.values)
power_df = pd.DataFrame(power_df_scaled, columns=power_df.columns)

In [12]:
X_bike = bike_df.iloc[:, :-1].values
y_bike = bike_df.iloc[:, -1].values
X_bike_temp, X_bike_test, y_bike_temp, y_bike_test = train_test_split(X_bike, y_bike, test_size=0.2, random_state=seed)
X_bike_train, X_bike_val, y_bike_train, y_bike_val = train_test_split(X_bike_temp, y_bike_temp, test_size=0.25, random_state=seed)

X_har = har_df.drop(columns=['activity']).values
y_har = (har_df['activity'].values - 1).astype(int)
X_har_temp, X_har_test, y_har_temp, y_har_test = train_test_split(X_har, y_har, test_size=0.2, random_state=seed)
X_har_train, X_har_val, y_har_train, y_har_val = train_test_split(X_har_temp, y_har_temp, test_size=0.25, random_state=seed)

n_samples = len(power_df)
train_idx = int(0.6 * n_samples)
val_idx = int(0.8 * n_samples)
power_data = power_df.values
X_power_train, X_power_val, X_power_test = power_data[:train_idx], power_data[train_idx:val_idx], power_data[val_idx:]

In [13]:
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i + seq_length])
        y.append(data[i + seq_length])
    return np.array(X), np.array(y)

bike_combined = np.column_stack([np.vstack([X_bike_train, X_bike_val]), np.hstack([y_bike_train, y_bike_val])])
X_bike_seq, y_bike_seq = create_sequences(bike_combined, 10)
split_idx = int(0.75 * len(X_bike_seq))
X_bike_train_seq, X_bike_val_seq = X_bike_seq[:split_idx], X_bike_seq[split_idx:]
y_bike_train_seq, y_bike_val_seq = y_bike_seq[:split_idx, -1], y_bike_seq[split_idx:, -1]
X_bike_test_seq, y_bike_test_seq = create_sequences(np.column_stack([X_bike_test, y_bike_test]), 10)
y_bike_test_seq = y_bike_test_seq[:, -1]

har_train_val = np.column_stack([np.vstack([X_har_train, X_har_val]), np.hstack([y_har_train, y_har_val])])
X_har_seq, y_har_seq = create_sequences(har_train_val, 128)
y_har_seq = y_har_seq[:, -1].astype(int)
split_idx = int(0.75 * len(X_har_seq))
X_har_train_seq, X_har_val_seq = X_har_seq[:split_idx], X_har_seq[split_idx:]
y_har_train_seq, y_har_val_seq = y_har_seq[:split_idx], y_har_seq[split_idx:]
har_test = np.column_stack([X_har_test, y_har_test])
X_har_test_seq, y_har_test_seq = create_sequences(har_test, 128)
y_har_test_seq = y_har_test_seq[:, -1].astype(int)

power_combined = np.vstack([X_power_train, X_power_val])
X_power_seq, y_power_seq = create_sequences(power_combined, 24)
split_idx = int(0.75 * len(X_power_seq))
X_power_train_seq, X_power_val_seq = X_power_seq[:split_idx], X_power_seq[split_idx:]
y_power_train_seq, y_power_val_seq = y_power_seq[:split_idx], y_power_seq[split_idx:]
X_power_test_seq, y_power_test_seq = create_sequences(X_power_test, 24)

In [14]:
preprocessed_dir = os.path.join(base_dir, 'preprocessed_data')
os.makedirs(preprocessed_dir, exist_ok=True)

bike_data = {
    'X_train': X_bike_train, 'X_val': X_bike_val, 'X_test': X_bike_test,
    'y_train': y_bike_train, 'y_val': y_bike_val, 'y_test': y_bike_test,
    'X_train_seq': X_bike_train_seq, 'X_val_seq': X_bike_val_seq, 'X_test_seq': X_bike_test_seq,
    'y_train_seq': y_bike_train_seq, 'y_val_seq': y_bike_val_seq, 'y_test_seq': y_bike_test_seq,
    'scaler': scaler_bike
}
with open(os.path.join(preprocessed_dir, 'bike_sharing.pkl'), 'wb') as f:
    pickle.dump(bike_data, f)

har_data = {
    'X_train': X_har_train, 'X_val': X_har_val, 'X_test': X_har_test,
    'y_train': y_har_train, 'y_val': y_har_val, 'y_test': y_har_test,
    'X_train_seq': X_har_train_seq, 'X_val_seq': X_har_val_seq, 'X_test_seq': X_har_test_seq,
    'y_train_seq': y_har_train_seq, 'y_val_seq': y_har_val_seq, 'y_test_seq': y_har_test_seq,
    'scaler': scaler_har
}
with open(os.path.join(preprocessed_dir, 'har.pkl'), 'wb') as f:
    pickle.dump(har_data, f)

power_data_pkl = {
    'X_train': X_power_train, 'X_val': X_power_val, 'X_test': X_power_test,
    'X_train_seq': X_power_train_seq, 'X_val_seq': X_power_val_seq, 'X_test_seq': X_power_test_seq,
    'y_train_seq': y_power_train_seq, 'y_val_seq': y_power_val_seq, 'y_test_seq': y_power_test_seq,
    'scaler': scaler_power
}
with open(os.path.join(preprocessed_dir, 'power_consumption.pkl'), 'wb') as f:
    pickle.dump(power_data_pkl, f)

print("Preprocessing complete. Data saved to preprocessed_data/")

Preprocessing complete. Data saved to preprocessed_data/
