In [None]:
# Emit a progress message marking the beginning of the preprocessing steps.
print('Data preprocessing started')

In [None]:
# Load the dataset from Google Drive (requires the Colab runtime).
import numpy as np
import pandas as pd
from google.colab import drive

# Attach Google Drive to the runtime's filesystem so the CSV can be
# addressed through an ordinary path.
drive.mount('/content/drive')

# Path of the CSV inside the mounted drive, and the frame read from it.
file_path = '/content/drive/MyDrive/other/data/Data for MSC Thesis/eth_householdgeovariables_y5.csv'
df = pd.read_csv(file_path)

# Sanity check: show the first five rows of what was loaded.
print(df.head(5))

In [None]:
# ------------------------------
# DATA EXPLORATION: NULL AND OUTLIER PERCENTAGE
# ------------------------------

# ------------------------------
# Step 1: Dataset Overview
# ------------------------------
print("----- Dataset Info -----")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()

print("\n----- Dataset Shape -----")
print(df.shape)

print("\n----- Statistical Summary -----")
# describe() returns a frame of summary statistics, so printing it is correct.
print(df.describe())

# ------------------------------
# Step 2: Missing Values by Percentage
# ------------------------------
# Per-column count of null cells, and the same count as a share of all rows.
missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

# One row per column of `df`, holding both the count and the percentage.
missing_df = missing_values.to_frame('Missing Values').assign(Percentage=missing_percentage)

print("\n----- Missing Values Summary -----")
# Columns with the highest share of missing cells are listed first.
print(missing_df.sort_values('Percentage', ascending=False))

# ------------------------------
# Step 3: Outlier Detection by Percentage (Numeric Columns Only)
# ------------------------------
def _count_iqr_outliers(values):
    """Return how many entries of `values` fall outside the 1.5 * IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, where Q1 and Q3 are the
    25th and 75th percentiles of `values`. Missing entries compare as False
    against both fences and are therefore never counted.
    """
    q_low = values.quantile(0.25)
    q_high = values.quantile(0.75)
    spread = q_high - q_low
    too_small = values < q_low - 1.5 * spread
    too_large = values > q_high + 1.5 * spread
    return int((too_small | too_large).sum())


numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
n_rows = len(df)

# One [column, count, percent] record per numeric column; the percentage is
# taken over all rows of the frame, including rows where the column is null.
outlier_summary = []
for name in numeric_cols:
    n_outliers = _count_iqr_outliers(df[name])
    outlier_summary.append([name, n_outliers, (n_outliers / n_rows) * 100])

outlier_df = pd.DataFrame(outlier_summary, columns=['Column', 'Outlier Count', 'Outlier %'])
print("\n----- Outlier Summary by Percentage -----")
# Most outlier-heavy columns first.
print(outlier_df.sort_values('Outlier %', ascending=False))

In [None]:
# ------------------------------
# FULL PREPROCESSING PIPELINE WITH SMOTE
# ------------------------------

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Every data-dependent statistic below (imputation medians, IQR caps, scaler
# mean/std) is fitted on the training partition only and then applied to the
# test partition. Fitting them on the full dataset before splitting would leak
# information from the test rows into the training features.
#
# After this cell:
#   df                       -> cleaned + one-hot encoded frame (NOT imputed,
#                               capped or scaled)
#   X, y                     -> features / target taken from `df` as above
#   X_train, X_test          -> imputed, capped and scaled features
#   X_train_res, y_train_res -> SMOTE-resampled training data


def _iqr_fences(values, k=1.5):
    """Return (lower, upper) fences: Q1 - k * IQR and Q3 + k * IQR of `values`."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr


def _cap(values, lower, upper):
    """Replace entries above `upper` / below `lower` with the fence value."""
    return np.where(values > upper, upper,
                    np.where(values < lower, lower, values))


# ------------------------------
# Step 1: Drop irrelevant/empty columns
# ------------------------------
# NOTE: `df` is rebound here, so this cell cannot be re-run on its own;
# re-run the cell that loads `df` first.
drop_cols = ['household_id'] + [col for col in df.columns if col.startswith('c2_')]
df = df.drop(columns=drop_cols)

# ------------------------------
# Step 2: Encode categorical variables
# ------------------------------
# One-hot encoding uses no statistics of the data, so it is done before the
# split; this also guarantees train and test share the same dummy columns.
categorical_cols = ['ssa_aez09', 'landcov']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# ------------------------------
# Step 3: Split features and target (stratified train-test split)
# ------------------------------
X = df.drop('suppress', axis=1)
y = df['suppress']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Work on explicit copies so the column assignments below never write through
# to `X` / `df` and do not trigger SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# ------------------------------
# Step 4: Impute missing values (medians from the training set)
# ------------------------------
# Assign the result back instead of calling fillna(..., inplace=True) on a
# column selection: the chained in-place form is deprecated and does not
# modify the parent frame under pandas Copy-on-Write.
impute_cols = ['lat_dd_mod', 'lon_dd_mod']
for col in impute_cols:
    train_median = X_train[col].median()
    X_train[col] = X_train[col].fillna(train_median)
    X_test[col] = X_test[col].fillna(train_median)

# ------------------------------
# Step 5: Handle outliers (>5% outliers) by capping at training-set IQR fences
# ------------------------------
outlier_cols = ['sq4', 'h2021_wetQ', 'dist_road', 'h2021_sen', 'sen_avg']

for col in outlier_cols:
    lower, upper = _iqr_fences(X_train[col])
    X_train[col] = _cap(X_train[col], lower, upper)
    X_test[col] = _cap(X_test[col], lower, upper)

# ------------------------------
# Step 6: Scale numeric features (scaler fitted on the training set)
# ------------------------------
# The target is not part of X_train, so it needs no explicit exclusion here.
numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# ------------------------------
# Step 7: Handle class imbalance using SMOTE (training data only)
# ------------------------------
# NOTE(review): SMOTE interpolates every column, including the one-hot dummy
# columns; consider SMOTENC if the dummies must stay binary -- TODO confirm.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Preprocessing complete with SMOTE!")
print(f"Original training set shape: {X_train.shape}, positives: {y_train.sum()}")
print(f"Resampled training set shape: {X_train_res.shape}, positives: {y_train_res.sum()}")
print(f"Test set shape: {X_test.shape}, positives: {y_test.sum()}")
