# CUBEM 2019 Part 2 Preprocessing



In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
from scipy import stats
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
warnings.filterwarnings('ignore')


plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


## Load and Combine Floor Data

We'll load floors 3, 4, and 5 and combine them into a single dataframe for efficient preprocessing.


In [2]:

print("Loading floor data...")
df_floor3 = pd.read_csv('../data/cu-bem/2019Floor3.csv/2019Floor3.csv')
df_floor4 = pd.read_csv('../data/cu-bem/2019Floor4.csv/2019Floor4.csv')
df_floor5 = pd.read_csv('../data/cu-bem/2019Floor5.csv/2019Floor5.csv')

print(f"Floor 3 shape: {df_floor3.shape}")
print(f"Floor 4 shape: {df_floor4.shape}")
print(f"Floor 5 shape: {df_floor5.shape}")


df_floor3['floor'] = 3
df_floor4['floor'] = 4
df_floor5['floor'] = 5

# Combine all floors into one dataframe
df_combined = pd.concat([df_floor3, df_floor4, df_floor5], ignore_index=True)

print(f"\nCombined dataset shape: {df_combined.shape}")
print(f"Floor distribution:")
print(df_combined['floor'].value_counts().sort_index())


Loading floor data...
Floor 3 shape: (525600, 30)
Floor 4 shape: (525600, 30)
Floor 5 shape: (525600, 30)

Combined dataset shape: (1576800, 31)
Floor distribution:
floor
3    525600
4    525600
5    525600
Name: count, dtype: int64


## Step 1: Missing Values Handling


In [3]:
# Create a copy for preprocessing
df_processed = df_combined.copy()

print("Step 1: Missing Values Analysis")
print("="*50)
print(f"Initial shape: {df_processed.shape}")


print("\nMissing values per column:")
missing_per_column = df_processed.isnull().sum()
print(missing_per_column[missing_per_column > 0])

print(f"\nTotal missing values: {df_processed.isnull().sum().sum()}")
print(f"Percentage of missing values: {(df_processed.isnull().sum().sum() / (df_processed.shape[0] * df_processed.shape[1])) * 100:.2f}%")

# Fill missing values
print("\nFilling missing values...")

# For numeric columns, fill with median
numeric_columns = df_processed.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
    if col != 'floor':
        missing_count = df_processed[col].isnull().sum()
        if missing_count > 0:
            median_value = df_processed[col].median()
            df_processed[col] = df_processed[col].fillna(median_value)
            print(f"Filled {missing_count} missing values in '{col}' with median: {median_value:.2f}")

# For categorical columns, fill with mode
categorical_columns = df_processed.select_dtypes(include=['object']).columns
for col in categorical_columns:
    missing_count = df_processed[col].isnull().sum()
    if missing_count > 0:
        mode_value = df_processed[col].mode()[0] if not df_processed[col].mode().empty else 'Unknown'
        df_processed[col] = df_processed[col].fillna(mode_value)
        print(f"Filled {missing_count} missing values in '{col}' with mode: '{mode_value}'")

print(f"\nMissing values after filling: {df_processed.isnull().sum().sum()}")


Step 1: Missing Values Analysis
Initial shape: (1576800, 31)

Missing values per column:
z1_AC1(kW)      144704
z1_AC2(kW)      144704
z1_AC3(kW)      144704
z1_AC4(kW)       78100
z1_Light(kW)     34230
z1_Plug(kW)       1789
z1_S1(degC)     394436
z1_S1(RH%)      394435
z1_S1(lux)      394439
z2_AC1(kW)       78035
z2_Light(kW)     55488
z2_Plug(kW)      54799
z2_S1(degC)     442499
z2_S1(RH%)      442503
z2_S1(lux)      442499
z3_Light(kW)      3347
z3_Plug(kW)       2050
z4_AC1(kW)       77884
z4_Light(kW)     13341
z4_Plug(kW)       3440
z4_S1(degC)     421892
z4_S1(RH%)      421893
z4_S1(lux)      421890
z5_AC1(kW)       77974
z5_Light(kW)     13349
z5_Plug(kW)       3787
z5_S1(degC)     358920
z5_S1(RH%)      358920
z5_S1(lux)      358919
dtype: int64

Total missing values: 5784970
Percentage of missing values: 11.83%

Filling missing values...
Filled 144704 missing values in 'z1_AC1(kW)' with median: 0.00
Filled 144704 missing values in 'z1_AC2(kW)' with median: 0.00
Filled 144

## Step 2: Outlier Detection and Treatment


In [4]:
# Outlier Detection using Multiple Methods
print("Step 2: Outlier Detection and Treatment")
print("="*50)

# Get numeric columns (excluding floor)
numeric_cols = df_processed.select_dtypes(include=[np.number]).columns
numeric_cols = [col for col in numeric_cols if col != 'floor']

print(f"Numeric columns for outlier analysis: {list(numeric_cols)}")

# Method 1: IQR Method
def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    return outliers, lower_bound, upper_bound

# Method 2: Z-Score Method
def detect_outliers_zscore(data, column, threshold=3):
    z_scores = np.abs(stats.zscore(data[column].dropna()))
    outliers = data[column][z_scores > threshold]
    return outliers

# Method 3: Isolation Forest
def detect_outliers_isolation_forest(data, columns, contamination=0.1):
    iso_forest = IsolationForest(contamination=contamination, random_state=42)
    outlier_labels = iso_forest.fit_predict(data[columns])
    outliers = data[outlier_labels == -1]
    return outliers

print("\nOutlier Detection Results:")
print("-" * 30)

outlier_summary = {}

for col in numeric_cols:
    print(f"\nColumn: {col}")
    
    # IQR Method
    outliers_iqr, lower, upper = detect_outliers_iqr(df_processed, col)
    print(f"  IQR Method: {len(outliers_iqr)} outliers (bounds: {lower:.2f} to {upper:.2f})")
    
    # Z-Score Method
    outliers_zscore = detect_outliers_zscore(df_processed, col)
    print(f"  Z-Score Method: {len(outliers_zscore)} outliers")
    
    outlier_summary[col] = {
        'iqr_count': len(outliers_iqr),
        'zscore_count': len(outliers_zscore),
        'iqr_bounds': (lower, upper)
    }

# Isolation Forest for all numeric columns
print(f"\nIsolation Forest (all numeric columns):")
outliers_iso = detect_outliers_isolation_forest(df_processed, numeric_cols)
print(f"  Total outliers detected: {len(outliers_iso)}")


Step 2: Outlier Detection and Treatment
Numeric columns for outlier analysis: ['z1_AC1(kW)', 'z1_AC2(kW)', 'z1_AC3(kW)', 'z1_AC4(kW)', 'z1_Light(kW)', 'z1_Plug(kW)', 'z1_S1(degC)', 'z1_S1(RH%)', 'z1_S1(lux)', 'z2_AC1(kW)', 'z2_Light(kW)', 'z2_Plug(kW)', 'z2_S1(degC)', 'z2_S1(RH%)', 'z2_S1(lux)', 'z3_Light(kW)', 'z3_Plug(kW)', 'z4_AC1(kW)', 'z4_Light(kW)', 'z4_Plug(kW)', 'z4_S1(degC)', 'z4_S1(RH%)', 'z4_S1(lux)', 'z5_AC1(kW)', 'z5_Light(kW)', 'z5_Plug(kW)', 'z5_S1(degC)', 'z5_S1(RH%)', 'z5_S1(lux)']

Outlier Detection Results:
------------------------------

Column: z1_AC1(kW)
  IQR Method: 1934 outliers (bounds: 0.00 to 0.00)
  Z-Score Method: 1162 outliers

Column: z1_AC2(kW)
  IQR Method: 17532 outliers (bounds: 0.00 to 0.00)
  Z-Score Method: 4440 outliers

Column: z1_AC3(kW)
  IQR Method: 11901 outliers (bounds: 0.00 to 0.00)
  Z-Score Method: 5572 outliers

Column: z1_AC4(kW)
  IQR Method: 385356 outliers (bounds: 0.00 to 0.00)
  Z-Score Method: 21997 outliers

Column: z1_Light(kW

In [5]:



print("Option 1: Capping outliers using IQR method...")
df_capped = df_processed.copy()

for col in numeric_cols:
    Q1 = df_capped[col].quantile(0.25)
    Q3 = df_capped[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Cap outliers
    df_capped[col] = df_capped[col].clip(lower=lower_bound, upper=upper_bound)
    
    original_outliers = outlier_summary[col]['iqr_count']
    print(f"  {col}: Capped {original_outliers} outliers")

print(f"\nShape after capping: {df_capped.shape}")



# Use capped data for further processing
df_processed = df_capped
print(f"\nUsing capped data for further processing. Final shape: {df_processed.shape}")


Option 1: Capping outliers using IQR method...
  z1_AC1(kW): Capped 1934 outliers
  z1_AC2(kW): Capped 17532 outliers
  z1_AC3(kW): Capped 11901 outliers
  z1_AC4(kW): Capped 385356 outliers
  z1_Light(kW): Capped 243 outliers
  z1_Plug(kW): Capped 206614 outliers
  z1_S1(degC): Capped 286812 outliers
  z1_S1(RH%): Capped 99159 outliers
  z1_S1(lux): Capped 463 outliers
  z2_AC1(kW): Capped 308116 outliers
  z2_Light(kW): Capped 149373 outliers
  z2_Plug(kW): Capped 153543 outliers
  z2_S1(degC): Capped 297581 outliers
  z2_S1(RH%): Capped 108244 outliers
  z2_S1(lux): Capped 166931 outliers
  z3_Light(kW): Capped 499 outliers
  z3_Plug(kW): Capped 14977 outliers
  z4_AC1(kW): Capped 311401 outliers
  z4_Light(kW): Capped 148 outliers
  z4_Plug(kW): Capped 92394 outliers
  z4_S1(degC): Capped 310858 outliers
  z4_S1(RH%): Capped 142256 outliers
  z4_S1(lux): Capped 318653 outliers
  z5_AC1(kW): Capped 312571 outliers
  z5_Light(kW): Capped 291 outliers
  z5_Plug(kW): Capped 27669 outli

## Step 3: Feature Engineering
