In [8]:
# ================================================================
# DATA CLEANING PIPELINE (Tailored for Investigation A Hypothesis Testing)
# Author: HIT140 Team
# ================================================================
import pandas as pd
import numpy as np
import os

# Make sure the output directory exists up front; exist_ok avoids an error on re-runs.
os.makedirs("cleaned_dataset", exist_ok=True)

# ================================================================
# Load raw datasets
# ================================================================
df1 = pd.read_csv("raw_data/dataset1.csv")
df2 = pd.read_csv("raw_data/dataset2.csv")

# ================================================================
# CLEAN DATASET 1 (Bat Landing-level Data)
# ================================================================
print("\n=== Cleaning Dataset 1 ===")
# Clean a copy so the raw frame `df1` stays as loaded.
df1_clean = df1.copy()

# Parse whichever of the timestamp columns are present. Values are read
# day-first, and anything that cannot be parsed becomes NaT instead of raising.
date_cols = ["start_time", "rat_period_start", "rat_period_end", "sunset_time"]
for col in date_cols:
    if col not in df1_clean.columns:
        continue
    df1_clean[col] = pd.to_datetime(df1_clean[col], dayfirst=True, errors="coerce")

# Keep only rows with a non-negative landing-to-food time.
# NOTE: a missing (NaN) value also fails this comparison, so those rows are
# dropped along with the negative ones.
df1_clean = df1_clean.loc[df1_clean["bat_landing_to_food"] >= 0]

# Normalise the habit text in one pass: missing -> "unknown", cast to str,
# trim surrounding whitespace, lower-case, then map leftover blanks and the
# literal text "nan" to "unknown" as well.
df1_clean['habit'] = (
    df1_clean['habit']
    .fillna("unknown")
    .astype(str)
    .str.strip()
    .str.lower()
    .replace({'': 'unknown', 'nan': 'unknown'})
)

# Force risk/reward into 0/1 integers: missing counts as 0, and anything
# outside the range is clipped back into [0, 1].
for flag_col in ('risk', 'reward'):
    df1_clean[flag_col] = df1_clean[flag_col].fillna(0).astype(int).clip(0, 1)

# FEATURE 1: Avoidance behaviour (for H1)
# Flag is 1 when the landing-to-food time is strictly greater than 5
# (unit is whatever the raw column uses -- TODO confirm), else 0.
slow_approach = df1_clean['bat_landing_to_food'] > 5
df1_clean['avoidance_behavior'] = slow_approach.astype(int)

# FEATURE 2: Interaction type (for H2)
def classify_interaction(row):
    """Return the interaction label for one landing record.

    ``row`` is any mapping-like object indexable by ``'habit'``, ``'reward'``,
    ``'risk'`` and ``'avoidance_behavior'`` (e.g. a DataFrame row).

    Labels, checked in this order:
      * ``"no_rat"``         - the text form of ``habit`` does not contain "rat"
                               (case-sensitive substring test); no other field
                               is read in this case.
      * ``"competition"``    - reward is 1 and risk is 0.
      * ``"predation_fear"`` - avoidance_behavior is 1 or risk is 1.
      * ``"uncertain"``      - none of the above.
    """
    if "rat" in str(row['habit']):
        if row['reward'] == 1 and row['risk'] == 0:
            label = "competition"
        elif row['avoidance_behavior'] == 1 or row['risk'] == 1:
            label = "predation_fear"
        else:
            label = "uncertain"
    else:
        label = "no_rat"
    return label

# Label every landing record by running it through classify_interaction row by row.
interaction_labels = df1_clean.apply(classify_interaction, axis=1)
df1_clean['interaction_type'] = interaction_labels

# Rat presence flag (needed for splitting groups in H1 & H2)
# An existing 'rat_present' column is kept as it is. Otherwise the flag is
# derived from the label: 1 for anything other than 'no_rat', 0 for 'no_rat'.
if 'rat_present' not in df1_clean.columns:
    df1_clean['rat_present'] = df1_clean['interaction_type'].apply(
        lambda label: int(label != 'no_rat')
    )

print("Dataset 1 cleaned. Shape:", df1_clean.shape)

# ================================================================
# CLEAN DATASET 2 (Observation-level Data)
# ================================================================
print("\n=== Cleaning Dataset 2 ===")
# Clean a copy so the raw frame `df2` stays as loaded.
df2_clean = df2.copy()

# Parse the 'time' column day-first; entries that cannot be parsed become NaT.
raw_time = df2_clean['time']
df2_clean['time'] = pd.to_datetime(raw_time, dayfirst=True, errors='coerce')

# For each listed column: raise negative values to zero first, then replace
# missing values with zero.
for col in ('bat_landing_number', 'food_availability', 'rat_minutes', 'rat_arrival_number'):
    floored = df2_clean[col].clip(lower=0)
    df2_clean[col] = floored.fillna(0)

# FEATURE: Rat presence flag (for H3)
# 1 when rat_minutes is strictly positive, 0 otherwise.
rats_seen = df2_clean['rat_minutes'] > 0
df2_clean['rat_present'] = rats_seen.astype(int)

print("Dataset 2 cleaned. Shape:", df2_clean.shape)

# ================================================================
# Save Cleaned Datasets
# ================================================================
# Pair each cleaned frame with its destination so that writing and reporting
# are driven by the same list.
outputs = (
    (df1_clean, "cleaned_dataset/dataset1_cleaned.csv"),
    (df2_clean, "cleaned_dataset/dataset2_cleaned.csv"),
)

# index=False keeps the DataFrame row index out of the CSV files.
for frame, path in outputs:
    frame.to_csv(path, index=False)

print("\n=== Cleaning Complete ===")
print("Cleaned datasets saved to 'cleaned_dataset/' folder:")
for frame, path in outputs:
    print(f" - {path}")


=== Cleaning Dataset 1 ===
Dataset 1 cleaned. Shape: (907, 15)

=== Cleaning Dataset 2 ===
Dataset 2 cleaned. Shape: (2123, 8)

=== Cleaning Complete ===
Cleaned datasets saved to 'cleaned_dataset/' folder:
 - cleaned_dataset/dataset1_cleaned.csv
 - cleaned_dataset/dataset2_cleaned.csv
