# Data audit & cleaning
This notebook performs the data audit and initial cleaning steps for the raw dataset located in `../data/raw.csv`.


In [None]:
# Imports and display settings
import pandas as pd
from pathlib import Path
# Show floats with thousands separators and two decimal places in rendered tables.
pd.set_option('display.float_format', '{:,.2f}'.format)

# Location of the raw input, relative to the notebook's working directory.
data_path = Path('..', 'data', 'raw.csv')

# Read the raw dataset, report its (rows, columns) shape and preview the first rows.
df_raw = pd.read_csv(data_path)
print('Loaded data shape:', df_raw.shape)
df_raw.head()


In [None]:
# Basic structure and completeness checks
# Label and object are printed on separate lines via sep='\n'. Passing 'label:\n'
# as its own argument makes print insert a space after the newline, which shifts
# the first row of the pandas output out of alignment with the rows below it.
print('Data types:', df_raw.dtypes, sep='\n', end='\n\n')
print('Row/column counts:', df_raw.shape)
print('Missing values per column:', df_raw.isna().sum(), sep='\n', end='\n\n')
# include='all' summarises non-numeric columns as well (count/unique/top/freq),
# so the label no longer claims the table is numeric-only.
print('Summary statistics (all columns):', df_raw.describe(include='all'), sep='\n')

In [None]:
# Duplicate check
# Boolean mask over df_raw: True where a row is reported as a duplicate.
duplicate_rows = df_raw.duplicated()
n_duplicates = duplicate_rows.sum()
print('Number of duplicate rows:', n_duplicates)
# Only preview duplicated rows when at least one was found.
if n_duplicates > 0:
    display(df_raw.loc[duplicate_rows].head())


In [None]:
# Quick look at string-based columns to understand formatting quirks
# Columns stored with the generic 'object' dtype are the ones inspected here.
string_cols = [name for name, dtype in df_raw.dtypes.items() if dtype == 'object']
for col in string_cols:
    print(f'\nSample values for {col}:')
    # Skip missing entries so the preview shows actual values.
    non_missing = df_raw[col].dropna()
    display(non_missing.head())

In [None]:
# Cleaning steps
# Work on a copy so df_raw stays untouched and this cell can be re-run on its own.
df_clean = df_raw.copy()

# 1) Standardize column names (strip spaces, fix typos, snake_case for consistency)
df_clean.columns = [c.strip() for c in df_clean.columns]
rename_map = {
    'Date': 'date',
    'ffpi_Energy_Consumption': 'ffpi_energy_consumption',
    'Engergy Imported': 'energy_imported',
    'ffpi_USD/HKD_Rate': 'ffpi_usd_hkd_rate',
    'USD/HKD Rate': 'usd_hkd_rate',
    'rs_Dairy_Products': 'rs_dairy_products',
    'rs_Fresh': 'rs_fresh',
}
# rename() ignores keys that are not present, so absent columns are skipped silently.
df_clean = df_clean.rename(columns=rename_map)

# 2) Trim whitespace inside string fields to remove inconsistent labels.
#    Blank strings and the literal text 'nan' are treated as missing. Values that
#    were already missing are masked explicitly, so they cannot survive the
#    astype(str) conversion as text such as 'None' or '<NA>'.
for col in df_clean.columns:
    if df_clean[col].dtype == 'object':
        original_values = df_clean[col]
        stripped = original_values.astype(str).str.strip()
        treat_as_missing = original_values.isna() | stripped.isin(['', 'nan'])
        df_clean[col] = stripped.mask(treat_as_missing, pd.NA)

# 3) Convert date column to datetime (month/day/year observed). Values that do not
#    match the format are coerced to NaT; the number of values lost that way is
#    printed so the coercion is visible rather than silent.
if 'date' in df_clean.columns:
    parsed_dates = pd.to_datetime(df_clean['date'], format='%m/%d/%Y', errors='coerce')
    n_unparsed_dates = int((df_clean['date'].notna() & parsed_dates.isna()).sum())
    if n_unparsed_dates:
        print(f"date: {n_unparsed_dates} non-missing value(s) did not match %m/%d/%Y and were set to NaT")
    df_clean['date'] = parsed_dates

# 4) Convert numeric-looking object columns by removing commas. Anything that is
#    still not a number afterwards becomes NaN; the count of such values is printed.
for col in ['bdi_price', 'ffpi_energy_consumption', 'energy_imported']:
    if col in df_clean.columns:
        without_commas = df_clean[col].astype(str).str.replace(',', '', regex=False)
        parsed_numbers = pd.to_numeric(without_commas, errors='coerce')
        n_unparsed_numbers = int((df_clean[col].notna() & parsed_numbers.isna()).sum())
        if n_unparsed_numbers:
            print(f"{col}: {n_unparsed_numbers} non-missing value(s) could not be parsed as numbers and were set to NaN")
        df_clean[col] = parsed_numbers

# 5) Flag potential outliers using IQR (flag only; do not drop).
#    A row is flagged when any numeric column falls outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
numeric_cols = df_clean.select_dtypes(include='number').columns
iqr_flags = pd.Series(False, index=df_clean.index)
for col in numeric_cols:
    q1 = df_clean[col].quantile(0.25)
    q3 = df_clean[col].quantile(0.75)
    iqr = q3 - q1
    # Skip columns whose quartiles are undefined (IQR evaluates to NaN).
    if pd.isna(iqr):
        continue
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    iqr_flags |= (df_clean[col] < lower) | (df_clean[col] > upper)
df_clean['flag_iqr_outlier'] = iqr_flags

# 6) Flag rows with any negative numeric values (indices typically non-negative; keeping rows for review).
#    numeric_cols was captured before the flag columns were added, so the flags are not tested.
negative_flags = (df_clean[numeric_cols] < 0).any(axis=1)
df_clean['flag_negative_values'] = negative_flags

df_clean.head()


In [None]:
# Post-cleaning sanity checks
cleaned_dtypes = df_clean.dtypes
missing_after = df_clean.isna().sum()
print('Cleaned data types:\n', cleaned_dtypes, '\n')
print('Missing values after cleaning:\n', missing_after, '\n')
print('Duplicate rows after cleaning:', df_clean.duplicated().sum())

# Category level counts (using the date period to understand coverage)
if 'date' in df_clean.columns:
    # Collapse each date to its calendar month, then count rows per month in time order.
    month_of_record = df_clean['date'].dt.to_period('M')
    monthly_counts = month_of_record.value_counts().sort_index()
    print('\nRecords per month (after parsing dates):')
    display(monthly_counts)

# Surface any rows flagged for outliers or negatives without dropping them
needs_review = df_clean['flag_iqr_outlier'] | df_clean['flag_negative_values']
flagged = df_clean.loc[needs_review]
print('\nRows flagged for potential review (kept in dataset):', len(flagged))
display(flagged.head())

In [None]:
# Optional: save cleaned data for downstream steps
# Written as CSV without the row index column.
clean_path = Path('..', 'data', 'cleaned.csv')
df_clean.to_csv(clean_path, index=False)
print('Cleaned dataset saved to', clean_path)
