In [24]:
# --- Milestone 3 Final: Data Cleaning Notebook ---

# Step 1: Upload CSV (for Colab)
from google.colab import files
import pandas as pd

print("Please upload your CSV file...")
uploaded = files.upload()

# Get uploaded filename
filename = list(uploaded.keys())[0]

# Load dataset into df_raw
try:
    df_raw = pd.read_csv(filename, encoding='ISO-8859-1')
    print("Dataset loaded successfully!")
    print("Shape of raw data:", df_raw.shape)
except UnicodeDecodeError:
    print(f"Error: Could not decode the file {filename} with 'utf-8'. Trying 'ISO-8859-1' encoding.")
    try:
        df_raw = pd.read_csv(filename, encoding='ISO-8859-1')
        print("Dataset loaded successfully with 'ISO-8859-1' encoding!")
        print("Shape of raw data:", df_raw.shape)
    except Exception as e:
        print(f"Error: Could not load the file with 'ISO-8859-1' encoding either. Please check the file format and encoding. Original error: {e}")
except FileNotFoundError:
    print(f"Error: File {filename} not found. Please ensure the file is uploaded correctly.")
except Exception as e:
    print(f"An unexpected error occurred during file loading: {e}")


# -----------------------------
# Step 2: Cleaning Code (unchanged)
# -----------------------------

# Check if df_raw was successfully loaded before proceeding with cleaning
if 'df_raw' in locals() and not df_raw.empty:
    df = df_raw.copy()

    def _find_col(df, options):
        for c in df.columns:
            if c.lower() in [o.lower() for o in options]:
                return c
        return None

    sku_col = _find_col(df, ['StockCode', 'SKU', 'stockcode', 'sku'])
    if sku_col is None:
        raise ValueError('Could not find a SKU column. Expected one of: StockCode, SKU')

    invoice_col = _find_col(df, ['InvoiceNo', 'Invoice'])
    qty_col = _find_col(df, ['Quantity', 'Qty'])
    price_col = _find_col(df, ['UnitPrice', 'Price'])

    if qty_col is None or price_col is None:
        raise ValueError("Could not find quantity/price columns. Need 'Quantity' & 'UnitPrice' (or 'Qty'/'Price').")

    if invoice_col is not None:
        df[invoice_col] = df[invoice_col].astype(str)
        df = df[~df[invoice_col].str.startswith('C', na=False)]

    df = df[(df[qty_col] > 0) & (df[price_col] > 0)]
    df = df.dropna(subset=[sku_col]).copy()
    df['Revenue'] = df[qty_col] * df[price_col]
    print(df.shape)
    df.head(3)

    # Check missing counts
    missing_counts = df.isna().sum().sort_values(ascending=False)
    print(missing_counts[missing_counts > 0])

    # Drop rows missing CustomerID
    if 'CustomerID' in df.columns:
        # Fill missing descriptions if StockCode present but Description is NaN
        if 'Description' in df.columns and 'StockCode' in df.columns:
            df['Description'] = df['Description'].fillna('No description')
            print("Remaining NaN in Description:", df['Description'].isna().sum())

    dup_count = df.duplicated().sum()
    print('Duplicate rows:', dup_count)
    df = df.drop_duplicates()
    print('After removing duplicates:', df.shape)

    # Basic stats
    print(df[['Quantity', 'UnitPrice']].describe())

    # Remove rows with non-positive Quantity or UnitPrice
    df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)]
    print('After removing non-positive Quantity/UnitPrice:', df.shape)

    # Clean column names
    df.columns = [c.strip().lower().replace(' ', '_') for c in df.columns]
    print("Cleaned column names:", df.columns)

    # Save cleaned data
    cleaned_path = 'online_retail_cleaned.csv'
    df.to_csv(cleaned_path, index=False)
    print("Saved cleaned dataset to:", cleaned_path)

    # Final drop of missing CustomerID
    if 'customerid' in df.columns:
        df = df.dropna(subset=['customerid'])
        print('After dropping missing CustomerID:', df.shape)

else:
    print("df_raw was not loaded successfully. Cleaning steps skipped.")

Please upload your CSV file...


Saving Online Retail (1).csv to Online Retail (1) (4).csv
Dataset loaded successfully!
Shape of raw data: (541909, 8)
(530104, 9)
CustomerID    132220
dtype: int64
Remaining NaN in Description: 0
Duplicate rows: 5226
After removing duplicates: (524878, 9)
            Quantity      UnitPrice
count  524878.000000  524878.000000
mean       10.616600       3.922573
std       156.280031      36.093028
min         1.000000       0.001000
25%         1.000000       1.250000
50%         4.000000       2.080000
75%        11.000000       4.130000
max     80995.000000   13541.330000
After removing non-positive Quantity/UnitPrice: (524878, 9)
Cleaned column names: Index(['invoiceno', 'stockcode', 'description', 'quantity', 'invoicedate',
       'unitprice', 'customerid', 'country', 'revenue'],
      dtype='object')
Saved cleaned dataset to: online_retail_cleaned.csv
After dropping missing CustomerID: (392692, 9)


In [25]:
missing_counts = df.isna().sum().sort_values(ascending=False)
missing_counts[missing_counts > 0]


Unnamed: 0,0


In [26]:
# Example cleaning: Drop rows missing CustomerID (common in this dataset for incomplete transactions)
if 'CustomerID' in df.columns:
    df = df.dropna(subset=['CustomerID'])
print('After dropping missing CustomerID:', df.shape)

After dropping missing CustomerID: (392692, 9)


In [27]:
# Fill missing descriptions if StockCode is present but Description is NaN (optional strategy)
if 'description' in df.columns and 'stockcode' in df.columns:
    df['description'] = df['description'].fillna('No description')
df['description'].isna().sum()

np.int64(0)

In [28]:
dup_count = df.duplicated().sum()
print('Duplicate rows:', dup_count)
df = df.drop_duplicates()
print('After removing duplicates:', df.shape)

Duplicate rows: 0
After removing duplicates: (392692, 9)


In [29]:
# Check basic stats
df[['quantity', 'unitprice']].describe()

Unnamed: 0,quantity,unitprice
count,392692.0,392692.0
mean,13.119702,3.125914
std,180.492832,22.241836
min,1.0,0.001
25%,2.0,1.25
50%,6.0,1.95
75%,12.0,3.75
max,80995.0,8142.75


In [30]:
# Remove rows with non-positive Quantity or UnitPrice (common cleaning choice for sales analysis)
df = df[(df['quantity'] > 0) & (df['unitprice'] > 0)]
print('After removing non-positive Quantity/UnitPrice:', df.shape)

After removing non-positive Quantity/UnitPrice: (392692, 9)


In [31]:
df.columns = [c.strip().lower().replace(' ', '_') for c in df.columns]
df.columns

Index(['invoiceno', 'stockcode', 'description', 'quantity', 'invoicedate',
       'unitprice', 'customerid', 'country', 'revenue'],
      dtype='object')

In [32]:
cleaned_path = 'online_retail_cleaned.csv'
df.to_csv(cleaned_path, index=False)
cleaned_path

'online_retail_cleaned.csv'