#### Imports and Data Loading

In [1]:
import pandas as pd
import numpy as np

# Load the dataset generated in Phase 1
df = pd.read_csv('raw_retail_data.csv')

print("Data Loaded Successfully.")
print(f"Initial Row Count: {len(df)}")
# Quick check for the irregularities we injected
print(f"Missing Values:\n{df.isnull().sum()}")

Data Loaded Successfully.
Initial Row Count: 5100
Missing Values:
Transaction_ID       0
Date                 0
Product_Name         0
Stock_On_Hand        0
Sales_Qty            0
Unit_Price_AED       0
Event             1092
Store_Location       0
dtype: int64


#### Forensic Audit Initialization

In [2]:
audit_log = {}

def log_issue(description, count):
    audit_log[description] = count
    print(f"CHECK: Found {count} {description}")

print("Audit engine initialized.")

Audit engine initialized.


#### Independent Identification of Discrepancies

In [3]:
# 1. Check for Duplicate Transactions
dupes = df.duplicated(subset=['Transaction_ID']).sum()
log_issue("Duplicate Transactions", dupes)

# 2. Check for Negative Stock (The 'Impossible' Data)
neg_stock = len(df[df['Stock_On_Hand'] < 0])
log_issue("Negative Stock Records", neg_stock)

# 3. Check for Null Events
null_ev = df['Event'].isnull().sum()
log_issue("Records with Missing Event Labels", null_ev)

# 4. Check for Sales Outliers (Statistical Discrepancy)
# We use the Interquartile Range (IQR) to find anomalies
Q3 = df['Sales_Qty'].quantile(0.75)
Q1 = df['Sales_Qty'].quantile(0.25)
IQR = Q3 - Q1
upper_bound = Q3 + (3 * IQR) # Strict outlier threshold
outliers = len(df[df['Sales_Qty'] > upper_bound])
log_issue("Extreme Sales Outliers", outliers)

CHECK: Found 100 Duplicate Transactions
CHECK: Found 51 Negative Stock Records
CHECK: Found 1092 Records with Missing Event Labels
CHECK: Found 5 Extreme Sales Outliers


#### Executing the Clean

In [4]:
# Fix Duplicates
df = df.drop_duplicates(subset=['Transaction_ID'])

# Fix Negative Stock (Assume 0 for safety)
df.loc[df['Stock_On_Hand'] < 0, 'Stock_On_Hand'] = 0

# Fix Missing Events
df['Event'] = df['Event'].fillna('None')

# Cap Outliers (To prevent skewing the KPI dashboards)
df.loc[df['Sales_Qty'] > upper_bound, 'Sales_Qty'] = upper_bound

print("Data remediation complete. Dataset is now structurally sound.")

Data remediation complete. Dataset is now structurally sound.


  df.loc[df['Sales_Qty'] > upper_bound, 'Sales_Qty'] = upper_bound


#### Export Clean Data

In [5]:
df.to_csv('cleaned_retail_data.csv', index=False)
print("Cleaned data saved as 'cleaned_retail_data.csv'.")

Cleaned data saved as 'cleaned_retail_data.csv'.
