In [None]:
# --- Online Retail (UCI) – Initial EDA by Aidan (Excel-friendly) ---

# Imports
import io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# Detect Google Colab: its `files` helper is only importable inside Colab.
# Any failure while importing it is treated as "not running in Colab".
IN_COLAB = False
try:
    from google.colab import files  # type: ignore
except Exception:
    pass
else:
    IN_COLAB = True

# Keep rendered tables compact: at most 20 rows / 50 columns.
pd.options.display.max_rows = 20
pd.options.display.max_columns = 50

# ==============================
# Data Import (matches FinalAnalysis)
# ==============================

# File that is loaded automatically when it exists in the working directory.
DEFAULT_NAME = "online_retail_sample_10k_clean.xlsx"  # changed default to Excel

def _read_any(path_or_buf, filename_hint=""):
    """Read CSV/CSV.GZ/Excel into a DataFrame."""
    name = (filename_hint or (str(path_or_buf) if isinstance(path_or_buf, str) else ""))
    lower = name.lower()
    if lower.endswith((".xlsx", ".xls")):
        return pd.read_excel(path_or_buf)
    comp = "gzip" if lower.endswith(".gz") else "infer"
    return pd.read_csv(path_or_buf, compression=comp, low_memory=False, encoding="latin1")

# ==== PROMPT & LOAD ====
# Source priority: (1) DEFAULT_NAME in the working directory, (2) a Colab
# upload, (3) a local file dialog with a typed-path fallback.
if os.path.exists(DEFAULT_NAME):
    print(f"Found {DEFAULT_NAME} in the working directory. Loading it…")
    df_raw = _read_any(DEFAULT_NAME, filename_hint=DEFAULT_NAME)

elif IN_COLAB:
    # Re-running the cell reuses an already uploaded frame instead of asking
    # for the file again. NOTE: this relies on kernel state; restart the
    # runtime to force a fresh upload.
    if "df_raw" in globals():
        print("Using previously uploaded df_raw.")
    else:
        print("Please upload your CSV / CSV.GZ / XLSX file…")
        uploaded = files.upload()
        if not uploaded:
            raise SystemExit("No file uploaded.")
        # Only the first uploaded file is used.
        name, data = next(iter(uploaded.items()))
        buf = io.BytesIO(data)
        df_raw = _read_any(buf, filename_hint=name)
        print(f"Loaded: {name}  -> shape={df_raw.shape}")

else:
    # Local Jupyter: file dialog with Excel option
    path = ""
    try:
        # Imported lazily: tkinter may be missing or unusable (headless hosts).
        import tkinter as tk
        from tkinter import filedialog
        root = tk.Tk()
        root.withdraw()  # hide the empty root window, show only the dialog
        try:
            path = filedialog.askopenfilename(
                title="Select CSV/CSV.GZ or Excel file",
                filetypes=[("CSV", "*.csv"), ("Compressed CSV", "*.csv.gz"),
                           ("Excel", "*.xlsx *.xls"), ("All files", "*.*")]
            )
        finally:
            # Release the hidden root window so it does not linger in the kernel.
            root.destroy()
    except Exception as exc:
        # Best effort only: report why the dialog failed, then ask for a path.
        print(f"File dialog unavailable ({exc}); falling back to manual path entry.")
    if not path:
        # Strip surrounding quotes, which are common when a path is pasted.
        path = input("Enter path to your CSV/CSV.GZ or Excel file: ").strip().strip("'\"")
    if not path:
        raise SystemExit("No file selected.")
    df_raw = _read_any(path, filename_hint=os.path.basename(path))
    print(f"Loaded: {os.path.basename(path)}  -> shape={df_raw.shape}")

print("Initial shape:", df_raw.shape)

# ==============================
# Cleaning Steps
# ==============================
# Work on a copy so df_raw stays untouched and this section can be re-run.
df = df_raw.copy()

def _find_col(df, options):
    for c in df.columns:
        if c.lower() in [o.lower() for o in options]:
            return c
    return None

# Resolve the columns this section needs (matching is case-insensitive).
sku_col = _find_col(df, ['StockCode', 'SKU'])
if sku_col is None:
    raise ValueError('Could not find a SKU column. Expected one of: StockCode, SKU')

invoice_col = _find_col(df, ['InvoiceNo', 'Invoice'])
qty_col = _find_col(df, ['Quantity', 'Qty'])
price_col = _find_col(df, ['UnitPrice', 'Price'])
# Optional columns, resolved the same way as the required ones.
customer_col = _find_col(df, ['CustomerID', 'Customer ID'])
desc_col = _find_col(df, ['Description'])

if qty_col is None or price_col is None:
    raise ValueError("Could not find quantity/price columns. Need 'Quantity' & 'UnitPrice' (or 'Qty'/'Price').")

# Remove credit notes (invoice numbers starting with 'C')
if invoice_col is not None:
    df[invoice_col] = df[invoice_col].astype(str)
    df = df[~df[invoice_col].str.startswith('C', na=False)]

# Keep only positive quantity & price
df = df[(df[qty_col] > 0) & (df[price_col] > 0)]

# Drop rows missing SKU; .copy() detaches the result from the filtered views
# above so the column assignments below are safe.
df = df.dropna(subset=[sku_col]).copy()

# Compute Revenue
df['Revenue'] = df[qty_col] * df[price_col]

print("Shape after initial cleaning:", df.shape)
display(df.head(3))

# ==============================
# Initial Exploratory Analysis (EDA)
# ==============================
print("Shape:", df.shape)
print("Columns:", list(df.columns))

print("\nDTypes:")
print(df.dtypes)

print("\nInfo:")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would add a stray "None" line).
df.info()

display(df.head(10))
display(df.tail(5))
# Fixed seed so the sampled preview is the same on every run.
display(df.sample(min(5, len(df)), random_state=42))

# Descriptive statistics
display(df.describe(include=[np.number]).T)
display(df.describe(include=[object]).T)

# ==============================
# Optional Cleaning
# ==============================
if customer_col is not None:
    # Rows without a customer id are removed from the saved dataset.
    df = df.dropna(subset=[customer_col]).copy()
    print('After dropping missing CustomerID:', df.shape)

if desc_col is not None:
    df[desc_col] = df[desc_col].fillna('No description')
    print("Remaining NaN in Description:", df[desc_col].isna().sum())

# Standardize column names (str() guards against non-string labels)
df.columns = [str(c).strip().lower().replace(' ', '_') for c in df.columns]
print("Standardized column names:", list(df.columns))

# ==============================
# Save cleaned data (Excel)
# ==============================
cleaned_path = "online_retail_cleaned.xlsx"
df.to_excel(cleaned_path, index=False)
print("Saved cleaned dataset to:", cleaned_path)

# In Colab, also offer the cleaned file as a browser download.
if IN_COLAB:
    files.download(cleaned_path)


Please upload your CSV / CSV.GZ / XLSX file…


Saving Online Retail (2).xlsx to Online Retail (2).xlsx
Loaded: Online Retail (2).xlsx  -> shape=(541909, 8)
Initial shape: (541909, 8)
Shape after initial cleaning: (530104, 9)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0


Shape: (530104, 9)
Columns: ['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country', 'Revenue']

DTypes:
InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
Revenue               float64
dtype: object

Info:
<class 'pandas.core.frame.DataFrame'>
Index: 530104 entries, 0 to 541908
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    530104 non-null  object        
 1   StockCode    530104 non-null  object        
 2   Description  530104 non-null  object        
 3   Quantity     530104 non-null  int64         
 4   InvoiceDate  530104 non-null  datetime64[ns]
 5   UnitPrice    530104 non-null  float64       
 6   CustomerID   397884 non-null  float64       

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,17850.0,United Kingdom,15.3
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.25,17850.0,United Kingdom,25.5
7,536366,22633,HAND WARMER UNION JACK,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom,11.1
8,536366,22632,HAND WARMER RED POLKA DOT,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom,11.1
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,2010-12-01 08:34:00,1.69,13047.0,United Kingdom,54.08


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France,10.2
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.1,12680.0,France,12.6
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France,16.6
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France,16.6
541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France,14.85


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue
519340,580143,23235,STORAGE TIN VINTAGE LEAF,6,2011-12-02 09:32:00,2.89,13297.0,United Kingdom,17.34
292425,562553,22169,FAMILY ALBUM WHITE PICTURE FRAME,1,2011-08-05 16:34:00,16.63,,United Kingdom,16.63
296815,562932,22467,GUMBALL COAT RACK,2,2011-08-10 16:39:00,2.55,16904.0,United Kingdom,5.1
36474,539451,20717,STRAWBERRY SHOPPER BAG,1,2010-12-17 16:59:00,2.51,,United Kingdom,2.51
190664,553203,C2,CARRIAGE,1,2011-05-15 16:10:00,50.0,14911.0,EIRE,50.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Quantity,530104.0,10.542037,155.524124,1.0,1.0,3.0,10.0,80995.0
UnitPrice,530104.0,3.907625,35.915681,0.001,1.25,2.08,4.13,13541.33
CustomerID,397884.0,15294.423453,1713.14156,12346.0,13969.0,15159.0,16795.0,18287.0
Revenue,530104.0,20.121871,270.356743,0.001,3.75,9.9,17.7,168469.6


Unnamed: 0,count,unique,top,freq
InvoiceNo,530104,19960,573585,1114
StockCode,530104,3922,85123A,2265
Description,530104,4026,WHITE HANGING HEART T-LIGHT HOLDER,2323
Country,530104,38,United Kingdom,485123


After dropping missing CustomerID: (397884, 9)
Remaining NaN in Description: 0
Standardized column names: Index(['invoiceno', 'stockcode', 'description', 'quantity', 'invoicedate',
       'unitprice', 'customerid', 'country', 'revenue'],
      dtype='object')
Saved cleaned dataset to: online_retail_cleaned.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>