In [None]:
import numpy as np
import pandas as pd

In [None]:
def build_dim_users(df_users, id_offset=200):
    """Build the user dimension table from the users frame.

    Parameters
    ----------
    df_users : pandas.DataFrame
        Must contain an ``id`` column. The columns ``name``, ``username``,
        ``email``, ``phone``, ``website``, ``address.city`` and
        ``company.name`` are copied when present; a missing one produces a
        column filled with ``None`` instead of raising.
    id_offset : int, default 200
        Value added to ``id`` to produce ``user_id``. The default aligns the
        IDs with bookings (201.. instead of 1..).

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id, name, username, email, phone, website, city,
        company`` with one row per distinct ``user_id`` (first occurrence
        kept). The input frame is not modified.

    Raises
    ------
    KeyError
        If ``df_users`` has no ``id`` column.
    """
    # target column in the dimension -> source column in df_users
    optional_sources = {
        "name": "name",
        "username": "username",
        "email": "email",
        "phone": "phone",
        "website": "website",
        "city": "address.city",
        "company": "company.name",
    }

    # `id + offset` yields a new Series, so no working copy of the input is needed.
    columns = {"user_id": df_users["id"] + id_offset}
    for target, source in optional_sources.items():
        # DataFrame.get returns None for an absent column rather than raising.
        columns[target] = df_users.get(source)

    dim_users = pd.DataFrame(columns)

    # Drop potential duplicates on user_id to be safe.
    return dim_users.drop_duplicates(subset=["user_id"])

In [None]:
def build_dim_grounds(df_grounds):
    """Build the ground dimension table.

    Parameters
    ----------
    df_grounds : pandas.DataFrame
        Must contain ``ground_id``, ``ground_name``, ``city``,
        ``location_type`` and ``price_per_hour``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ground_id, ground_name, location, ground_type,
        price_per_hour, is_active`` with one row per distinct ``ground_id``
        (first occurrence kept). ``is_active`` is ``True`` on every row.
    """
    source = df_grounds.copy()

    # (dimension column, source column) pairs, in output order.
    column_pairs = (
        ("ground_id", "ground_id"),
        ("ground_name", "ground_name"),
        ("location", "city"),
        ("ground_type", "location_type"),
        ("price_per_hour", "price_per_hour"),
    )

    payload = {target: source[origin] for target, origin in column_pairs}
    payload["is_active"] = True

    return pd.DataFrame(payload).drop_duplicates(subset=["ground_id"])

In [None]:
def build_dim_date(df_bookings):
    """Build the date dimension from the distinct booking dates.

    Parameters
    ----------
    df_bookings : pandas.DataFrame
        Must contain a ``booking_date`` column. Values that cannot be parsed
        as dates are discarded.

    Returns
    -------
    pandas.DataFrame
        One row per distinct parsed date, sorted ascending, with columns
        ``date_id`` (1-based, in date order), ``full_date``, ``day``,
        ``month``, ``year``, ``quarter``, ``weekday`` (1=Monday) and
        ``weekday_name``. Row labels are those of the first booking carrying
        each date; the index is not reset.
    """
    parsed = pd.to_datetime(df_bookings["booking_date"], errors="coerce")
    calendar = parsed.dropna().drop_duplicates().sort_values()
    parts = calendar.dt

    dim_date = calendar.to_frame(name="full_date").assign(
        date_id=range(1, len(calendar) + 1),
        day=parts.day,
        month=parts.month,
        year=parts.year,
        quarter=parts.quarter,
        weekday=parts.weekday + 1,  # 1=Monday
        weekday_name=parts.day_name(),
    )

    # Put the surrogate key first to match the schema more closely.
    column_order = [
        "date_id",
        "full_date",
        "day",
        "month",
        "year",
        "quarter",
        "weekday",
        "weekday_name",
    ]
    return dim_date[column_order]

In [None]:
def build_fact_bookings(df_bookings,dim_date):
    """Build the bookings fact table and attach the ``date_id`` surrogate key.

    Parameters
    ----------
    df_bookings : pandas.DataFrame
        Must contain ``booking_id``, ``user_id``, ``ground_id``,
        ``booking_date``, ``hours`` and ``total_price``.
    dim_date : pandas.DataFrame
        Must contain ``date_id`` and ``full_date``; used as the lookup for
        the date key.

    Returns
    -------
    pandas.DataFrame
        Columns ``booking_id, user_id, ground_id, date_id, booking_date,
        slot_time, duration_hours, total_price, booking_status``.
        ``slot_time`` is always ``"Unknown"`` and ``booking_status`` is always
        ``"Completed"``. Bookings whose date has no match in ``dim_date``
        keep a missing ``date_id`` (left join).
    """
    bookings = df_bookings.copy()

    # The join key must be datetime on both sides; unparseable values become NaT.
    bookings["booking_date"] = pd.to_datetime(bookings["booking_date"], errors="coerce")

    date_lookup = dim_date[["date_id", "full_date"]]
    joined = pd.merge(
        bookings,
        date_lookup,
        how="left",
        left_on="booking_date",
        right_on="full_date",
    )

    # Keys carried over under the same name, in output order.
    fact = {key: joined[key] for key in ("booking_id", "user_id", "ground_id", "date_id")}
    fact["booking_date"] = joined["booking_date"].dt.date
    fact["slot_time"] = "Unknown"
    fact["duration_hours"] = joined["hours"]
    fact["total_price"] = joined["total_price"]
    fact["booking_status"] = "Completed"

    return pd.DataFrame(fact)

In [None]:
#fill numerical data
def fill_numeric(df):
    """Fill missing values in every numeric column with that column's median.

    The frame is modified in place and also returned. Non-numeric columns are
    left untouched.
    """
    for column in df.select_dtypes(include=np.number):
        series = df[column]
        df[column] = series.fillna(series.median())
    return df

In [None]:
#fill categorical data
def fill_categorical(df):
    """Fill missing values in object columns with the column's mode.

    When several values tie, the first entry returned by ``Series.mode`` is
    used. A column with no mode at all (for example, entirely missing) is
    filled with ``'Unknown'``. The frame is modified in place and also
    returned.
    """
    for column in df.select_dtypes(include='object').columns:
        modes = df[column].mode()
        if modes.empty:
            replacement = 'Unknown'
        else:
            replacement = modes.iloc[0]
        df[column] = df[column].fillna(replacement)
    return df

In [None]:
#remove duplicates
def remove_duplicates(df):
    """Return a new frame without fully duplicated rows (first occurrence kept).

    The input frame is not modified and the surviving rows keep their
    original index labels.
    """
    return df.drop_duplicates()

In [None]:
# standardize column names
def standarize_names(df):
    """Normalize column names: trim, lower-case, and turn spaces into underscores.

    The frame's columns are replaced in place and the same frame is returned.
    Every column label is expected to be a string.
    """
    def _normalize(label):
        return label.strip().lower().replace(" ", "_")

    df.columns = list(map(_normalize, df.columns))
    return df

In [None]:
# fix date format
def fix_date_format(df):
    """Convert date-like object columns to datetime.

    A column is converted when its dtype is ``object`` and its name contains
    ``date``, ``Date`` or ``DATE``. Values that cannot be parsed become
    ``NaT``. The frame is modified in place and also returned.
    """
    object_columns = df.select_dtypes(include=['object']).columns

    # Filter the object-column labels themselves. A mask built from all of
    # df.columns has the wrong length for this index whenever the frame has a
    # non-object column, so it raises IndexError or picks the wrong columns.
    date_columns = [
        col
        for col in object_columns
        if isinstance(col, str) and any(tag in col for tag in ('date', 'Date', 'DATE'))
    ]

    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
    return df

In [None]:
# outlier detection and treatment
def detect_remove_outliers(df, lower_q=0.20, upper_q=0.80, whisker=1.5):
    """Report and remove outlier rows, one numeric column at a time.

    For each numeric column, the lower and upper quantiles are computed and a
    value is an outlier when it lies more than ``whisker`` times the
    inter-quantile spread below the lower or above the upper quantile. The
    number of outliers per column is printed. Columns are processed in order,
    so later columns are evaluated on the rows that survived earlier ones.

    Rows with a missing value in a column are NOT treated as outliers for
    that column. The printed count therefore always equals the number of rows
    removed, and an entirely missing column cannot empty the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    lower_q, upper_q : float, default 0.20 and 0.80
        Quantiles that delimit the spread. Note the defaults are the 20th and
        80th percentiles, not the classic 25th/75th quartiles.
    whisker : float, default 1.5
        Multiplier applied to the spread to obtain the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with no outlier in any numeric column.
    """
    num_cols = df.select_dtypes(include=np.number).columns
    for col in num_cols:
        values = df[col]
        low = values.quantile(lower_q)
        high = values.quantile(upper_q)
        spread = high - low
        lower_bound = low - whisker * spread
        upper_bound = high + whisker * spread

        # Comparisons with NaN are False, so missing values are never flagged.
        is_outlier = (values < lower_bound) | (values > upper_bound)
        print(f"Outliers in {col}: {int(is_outlier.sum())}")

        # Drop exactly the flagged rows rather than keeping rows that pass a
        # >=/<= test, which would silently discard NaN rows as well.
        df = df[~is_outlier]
    return df

In [None]:
# noise reduction
def noise_reduction(df):
    """Placeholder for a noise-reduction step; returns ``df`` unchanged."""
    return df

In [None]:
def clean_grounds(df_grounds):
    """Run the standard cleaning pipeline on the grounds frame.

    Steps, in order: fill numeric gaps, fill categorical gaps, drop duplicate
    rows, normalize column names, parse date columns, remove outliers. The
    first two steps modify the frame they receive in place, so pass a copy if
    the original must be preserved.
    """
    steps = (
        fill_numeric,
        fill_categorical,
        remove_duplicates,
        standarize_names,
        fix_date_format,
        detect_remove_outliers,
    )
    for step in steps:
        df_grounds = step(df_grounds)
    return df_grounds

In [None]:
def clean_bookings(df_bookings):
    """Run the standard cleaning pipeline on the bookings frame.

    Steps, in order: fill numeric gaps, fill categorical gaps, drop duplicate
    rows, normalize column names, parse date columns, remove outliers. The
    first two steps modify the frame they receive in place, so pass a copy if
    the original must be preserved.
    """
    steps = (
        fill_numeric,
        fill_categorical,
        remove_duplicates,
        standarize_names,
        fix_date_format,
        detect_remove_outliers,
    )
    for step in steps:
        df_bookings = step(df_bookings)
    return df_bookings

In [None]:
def clean_users(df_users):
    """Run the standard cleaning pipeline on the users frame.

    Steps, in order: fill numeric gaps, fill categorical gaps, drop duplicate
    rows, normalize column names, parse date columns, remove outliers. The
    first two steps modify the frame they receive in place, so pass a copy if
    the original must be preserved.
    """
    steps = (
        fill_numeric,
        fill_categorical,
        remove_duplicates,
        standarize_names,
        fix_date_format,
        detect_remove_outliers,
    )
    for step in steps:
        df_users = step(df_users)
    return df_users

In [None]:
def transform_data(df_bookings,df_grounds,df_users):
    """Clean the three source frames and build the dimension and fact tables.

    Each input is copied before cleaning, so the caller's frames are left
    untouched.

    Returns
    -------
    tuple
        ``(dim_users, dim_grounds, dim_date, fact_bookings)``.
    """
    # Clean private copies: bookings first, then grounds, then users.
    bookings_clean = clean_bookings(df_bookings.copy())
    grounds_clean = clean_grounds(df_grounds.copy())
    users_clean = clean_users(df_users.copy())

    # Dimension tables.
    dim_users = build_dim_users(users_clean)
    dim_grounds = build_dim_grounds(grounds_clean)
    dim_date = build_dim_date(bookings_clean)

    # The fact table needs dim_date to resolve each booking's date_id.
    fact_bookings = build_fact_bookings(bookings_clean, dim_date)

    return dim_users, dim_grounds, dim_date, fact_bookings

In [None]:
# Smoke test: run extract -> transform end to end and print the shape of each
# resulting table.
from etl.extract import extract_data

# The three extracted frames are passed positionally to transform_data, whose
# parameters are (df_bookings, df_grounds, df_users).
b, g, u = extract_data()
d_users, d_grounds, d_date, f_bookings = transform_data(b, g, u)
print("dim_users:", d_users.shape)
print("dim_grounds:", d_grounds.shape)
print("dim_date:", d_date.shape)
print("fact_bookings:", f_bookings.shape)