In [None]:
# --- Milestone 3 Final: Data Cleaning Notebook (Excel Version) ---

# Step 1: Upload Excel (for Colab)
from google.colab import files
import pandas as pd

print("Please upload your Excel file...")
uploaded = files.upload()

# Discard any df_raw left over from a previous run of this cell. Otherwise a
# failed load below would leave the old frame defined and the cleaning step
# (which only checks that df_raw exists) would silently process stale data.
globals().pop('df_raw', None)

# Get uploaded filename. The upload result is a mapping keyed by filename; it
# can be empty (e.g. if the upload dialog is dismissed), so avoid indexing
# blindly into it.
filename = next(iter(uploaded), None)

# Load dataset into df_raw
if filename is None:
    print("No file was uploaded. Please re-run this cell and choose an Excel file.")
else:
    try:
        df_raw = pd.read_excel(filename)
    except FileNotFoundError:
        print(f"Error: File {filename} not found. Please ensure the file is uploaded correctly.")
    except Exception as e:
        # Top-level notebook boundary: report the problem instead of a traceback.
        print(f"An unexpected error occurred during file loading: {e}")
    else:
        # Only reached when read_excel succeeded, so df_raw is guaranteed bound.
        print("Dataset loaded successfully!")
        print("Shape of raw data:", df_raw.shape)

# -----------------------------
# Step 2: Cleaning Code (unchanged)
# -----------------------------
if 'df_raw' in locals() and not df_raw.empty:
    # Work on a copy so the raw frame stays available for re-runs.
    df = df_raw.copy()

    def _find_col(df, options):
        """Return the first column of ``df`` matching one of ``options``.

        Matching is case-insensitive and columns are scanned in frame order.
        Labels are compared through ``str()`` so non-string headers do not
        raise. Returns ``None`` when nothing matches.
        """
        # Build the lookup once instead of once per column.
        wanted = {str(o).lower() for o in options}
        for c in df.columns:
            if str(c).lower() in wanted:
                return c
        return None

    sku_col = _find_col(df, ['StockCode', 'SKU'])
    if sku_col is None:
        raise ValueError('Could not find a SKU column. Expected one of: StockCode, SKU')

    invoice_col = _find_col(df, ['InvoiceNo', 'Invoice'])
    qty_col = _find_col(df, ['Quantity', 'Qty'])
    price_col = _find_col(df, ['UnitPrice', 'Price'])

    if qty_col is None or price_col is None:
        raise ValueError("Could not find quantity/price columns. Need 'Quantity' & 'UnitPrice' (or 'Qty'/'Price').")

    # Remove credit notes (invoice numbers starting with 'C').
    # astype(str) is needed because the invoice column can hold mixed
    # numeric/text values, which the .str accessor cannot handle directly.
    if invoice_col is not None:
        df[invoice_col] = df[invoice_col].astype(str)
        df = df[~df[invoice_col].str.startswith('C', na=False)]

    # Keep only positive quantity & price
    df = df[(df[qty_col] > 0) & (df[price_col] > 0)]

    # Drop rows missing SKU; copy so the column assignments below operate on
    # an independent frame rather than a filtered view.
    df = df.dropna(subset=[sku_col]).copy()

    # Compute Revenue
    df['Revenue'] = df[qty_col] * df[price_col]

    # Fill missing description. A SKU column is guaranteed at this point
    # (checked above, and rows without a SKU were dropped), whatever its name.
    if 'Description' in df.columns:
        df['Description'] = df['Description'].fillna('No description')
        print("Remaining NaN in Description:", df['Description'].isna().sum())

    # Drop duplicates
    dup_count = df.duplicated().sum()
    print('Duplicate rows:', dup_count)
    df = df.drop_duplicates()
    print('After removing duplicates:', df.shape)

    # Stats -- use the detected column names so 'Qty'/'Price' style files
    # work here as well instead of raising KeyError.
    print(df[[qty_col, price_col]].describe())

    # Ensure only positive values remain (sanity re-check on detected columns)
    df = df[(df[qty_col] > 0) & (df[price_col] > 0)]
    print('After removing non-positive Quantity/UnitPrice:', df.shape)

    # Standardize column names: trimmed, lower-case, spaces -> underscores.
    # str() guards against non-string headers.
    df.columns = [str(c).strip().lower().replace(' ', '_') for c in df.columns]
    print("Cleaned column names:", df.columns)

    # Drop missing customer IDs. After normalisation the header is either
    # 'customerid' (from 'CustomerID') or 'customer_id' (from 'Customer ID').
    customer_col = next(
        (name for name in ('customerid', 'customer_id') if name in df.columns),
        None,
    )
    if customer_col is not None:
        df = df.dropna(subset=[customer_col])
        print('After dropping missing CustomerID:', df.shape)

    # Save cleaned data (Excel instead of CSV)
    cleaned_path = 'online_retail_cleaned.xlsx'
    df.to_excel(cleaned_path, index=False)
    print("Saved cleaned dataset to:", cleaned_path)

    # Trigger a browser download of the cleaned workbook (Colab helper).
    files.download(cleaned_path)
else:
    print("df_raw was not loaded successfully. Cleaning steps skipped.")


Please upload your Excel file...


Saving Online Retail (2).xlsx to Online Retail (2).xlsx
Dataset loaded successfully!
Shape of raw data: (541909, 8)
Remaining NaN in Description: 0
Duplicate rows: 5226
After removing duplicates: (524878, 9)
            Quantity      UnitPrice
count  524878.000000  524878.000000
mean       10.616600       3.922573
std       156.280031      36.093028
min         1.000000       0.001000
25%         1.000000       1.250000
50%         4.000000       2.080000
75%        11.000000       4.130000
max     80995.000000   13541.330000
After removing non-positive Quantity/UnitPrice: (524878, 9)
Cleaned column names: Index(['invoiceno', 'stockcode', 'description', 'quantity', 'invoicedate',
       'unitprice', 'customerid', 'country', 'revenue'],
      dtype='object')
After dropping missing CustomerID: (392692, 9)
Saved cleaned dataset to: online_retail_cleaned.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>