# Milestone 2 — Final Version (Aligned with Final Project)
This notebook updates the original Milestone 2 to exactly match the code used in the Final Project, per instructor guidance.

## Imports & Display Options

In [1]:
# Imports
import io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Environment probe: `google.colab` can only be imported inside Google Colab,
# so the outcome of the import tells us whether the upload helper is available.
try:
    from google.colab import files  # type: ignore
except Exception:
    IN_COLAB = False
else:
    IN_COLAB = True

# Limit how many rows/columns pandas renders when a DataFrame is displayed.
pd.options.display.max_rows = 20
pd.options.display.max_columns = 50


## Data Import (matches FinalAnalysis)

In [2]:
import pandas as pd
import io, os

# File that is loaded automatically when it sits in the working directory
# (change if needed).
DEFAULT_NAME = "online_retail_sample_10k_clean.csv"

# Colab check, repeated here so this cell can run on its own: the import only
# succeeds inside Google Colab.
try:
    from google.colab import files  # type: ignore
except Exception:
    IN_COLAB = False
else:
    IN_COLAB = True

def _read_any(path_or_buf, filename_hint=""):
    """Read CSV/CSV.GZ/Excel into a DataFrame."""
    name = (filename_hint or (str(path_or_buf) if isinstance(path_or_buf, str) else ""))
    lower = name.lower()
    if lower.endswith((".xlsx", ".xls")):
        return pd.read_excel(path_or_buf)
    comp = "gzip" if lower.endswith(".gz") else "infer"
    return pd.read_csv(path_or_buf, compression=comp, low_memory=False, encoding="latin1")

# ==== PROMPT & LOAD ====
# The data source is resolved in this order:
#   1. DEFAULT_NAME, when it exists in the working directory (no prompt);
#   2. Google Colab: the browser upload widget;
#   3. local Jupyter: a native file dialog, then a typed path as fallback.
if os.path.exists(DEFAULT_NAME):
    print(f"Found {DEFAULT_NAME} in the working directory. Loading it…")
    df_raw = _read_any(DEFAULT_NAME, filename_hint=DEFAULT_NAME)

elif IN_COLAB:
    if isinstance(globals().get("df_raw"), pd.DataFrame):
        # The cell is being re-run in a live kernel: keep the frame from the
        # earlier upload instead of asking for the file again. On a fresh
        # kernel df_raw does not exist yet, so the upload prompt below runs.
        print("Using previously uploaded df_raw.")
    else:
        print("Please upload your CSV / CSV.GZ / XLSX file…")
        uploaded = files.upload()
        if not uploaded:
            raise SystemExit("No file uploaded.")
        # Only the first uploaded file is used.
        name, data = next(iter(uploaded.items()))
        df_raw = _read_any(io.BytesIO(data), filename_hint=name)
        print(f"Loaded: {name}  -> shape={df_raw.shape}")

else:
    # Local Jupyter: file dialog with fallback to manual path
    path = ""
    try:
        import tkinter as tk
        from tkinter import filedialog
        root = tk.Tk()
        try:
            root.withdraw()  # hide the empty root window; only the dialog should show
            path = filedialog.askopenfilename(
                title="Select CSV/CSV.GZ or Excel file",
                filetypes=[("CSV", "*.csv"), ("Compressed CSV", "*.csv.gz"),
                           ("Excel", "*.xlsx *.xls"), ("All files", "*.*")]
            )
        finally:
            # Release the hidden root window so it does not linger in the kernel.
            root.destroy()
    except Exception as exc:
        # No display / tkinter missing: fall through to the typed path,
        # but say so instead of failing silently.
        print(f"File dialog unavailable ({exc}); falling back to manual path entry.")
    if not path:
        path = input("Enter path to your CSV/CSV.GZ or Excel file: ").strip()
    if not path:
        raise SystemExit("No file selected.")
    df_raw = _read_any(path, filename_hint=os.path.basename(path))
    print(f"Loaded: {os.path.basename(path)}  -> shape={df_raw.shape}")

print("Initial shape:", df_raw.shape)

# Start of cleaning steps
# Work on a copy so df_raw keeps the data exactly as loaded.
df = df_raw.copy()

def _find_col(df, options):
    for c in df.columns:
        if c.lower() in [o.lower() for o in options]:
            return c
    return None

# Locate the columns the cleaning steps depend on; header names differ
# between exports of the dataset. _find_col already ignores case, so one
# spelling per name is enough.
sku_col = _find_col(df, ['StockCode', 'SKU'])
if sku_col is None:
    raise ValueError('Could not find a SKU column. Expected one of: StockCode, SKU')

invoice_col = _find_col(df, ['InvoiceNo', 'Invoice'])
qty_col     = _find_col(df, ['Quantity', 'Qty'])
price_col   = _find_col(df, ['UnitPrice', 'Price'])
if qty_col is None or price_col is None:
    raise ValueError("Could not find quantity/price columns. Need 'Quantity' & 'UnitPrice' (or 'Qty'/'Price').")

# Coerce to numeric first: stray text values become NaN and simply fail the
# "> 0" filters below instead of raising a TypeError on comparison.
df[qty_col] = pd.to_numeric(df[qty_col], errors='coerce')
df[price_col] = pd.to_numeric(df[price_col], errors='coerce')

# Keep rows with a positive quantity, a positive price and a SKU.
keep = (df[qty_col] > 0) & (df[price_col] > 0) & df[sku_col].notna()

if invoice_col is not None:
    # Store invoice numbers as text but leave missing values missing
    # (a plain astype(str) would turn NaN into the literal string 'nan').
    invoice_text = df[invoice_col].astype(str).str.strip()
    df[invoice_col] = invoice_text.where(df[invoice_col].notna())
    # Drop invoices whose number starts with 'C' — presumably cancellations,
    # per the UCI dataset description; confirm against the data dictionary.
    keep &= ~df[invoice_col].str.startswith('C', na=False)

# Apply all filters at once and detach from the original frame so the
# column assignment below does not write into a view.
df = df.loc[keep].copy()

df['Revenue'] = df[qty_col] * df[price_col]

print("Shape after initial cleaning:", df.shape)
display(df.head(3))

Please upload your CSV / CSV.GZ / XLSX file…


Saving Online Retail (1).csv to Online Retail (1).csv
Loaded: Online Retail (1).csv  -> shape=(541909, 8)
Initial shape: (541909, 8)
Shape after initial cleaning: (530104, 9)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom,22.0


## Initial Exploratory Analysis (EDA) — rubric-aligned

In [3]:
# Basic shape & columns
try:
    print("Shape:", df.shape)
    print("Columns:", list(df.columns))
except NameError:
    print("⚠️ Define df first in the Data Loading section.")

# Types, info, samples
try:
    print("\nDTypes:")
    print(df.dtypes)
    print("\nInfo:")
    # info() writes its report itself and returns None, so it must not be
    # wrapped in print() (that would add a stray "None" line).
    df.info()
    display(df.head(10))
    display(df.tail(5))
    # Fixed seed so the sampled rows are the same on every run.
    display(df.sample(min(5, len(df)), random_state=42))
except Exception as e:
    print("EDA preview error:", e)

# Descriptive statistics (numeric & object separately for clarity).
# Each summary has its own guard so that a frame lacking one kind of column
# still shows the other summary.
for _kinds in ([np.number], [object]):
    try:
        display(df.describe(include=_kinds).T)
    except Exception as e:
        print("Describe error:", e)


Shape: (530104, 9)
Columns: ['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country', 'Revenue']

DTypes:
InvoiceNo       object
StockCode       object
Description     object
Quantity         int64
InvoiceDate     object
UnitPrice      float64
CustomerID     float64
Country         object
Revenue        float64
dtype: object

Info:
<class 'pandas.core.frame.DataFrame'>
Index: 530104 entries, 0 to 541908
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    530104 non-null  object 
 1   StockCode    530104 non-null  object 
 2   Description  530104 non-null  object 
 3   Quantity     530104 non-null  int64  
 4   InvoiceDate  530104 non-null  object 
 5   UnitPrice    530104 non-null  float64
 6   CustomerID   397884 non-null  float64
 7   Country      530104 non-null  object 
 8   Revenue      530104 non-null  float64
dtypes: float64(3), int64(1), object(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,20.34
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,12/1/2010 8:26,7.65,17850.0,United Kingdom,15.3
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,12/1/2010 8:26,4.25,17850.0,United Kingdom,25.5
7,536366,22633,HAND WARMER UNION JACK,6,12/1/2010 8:28,1.85,17850.0,United Kingdom,11.1
8,536366,22632,HAND WARMER RED POLKA DOT,6,12/1/2010 8:28,1.85,17850.0,United Kingdom,11.1
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,12/1/2010 8:34,1.69,13047.0,United Kingdom,54.08


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,12/9/2011 12:50,0.85,12680.0,France,10.2
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/2011 12:50,2.1,12680.0,France,12.6
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/2011 12:50,4.15,12680.0,France,16.6
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/2011 12:50,4.15,12680.0,France,16.6
541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,12/9/2011 12:50,4.95,12680.0,France,14.85


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue
494032,578258,21871,SAVE THE PLANET MUG,2,11/23/2011 13:01,1.65,13263.0,United Kingdom,3.3
19730,537888,85185B,PINK HORSE SOCK PUPPET,6,12/9/2010 10:04,2.95,15358.0,United Kingdom,17.7
35393,539436,22282,12 EGG HOUSE PAINTED WOOD,1,12/17/2010 14:49,25.49,,United Kingdom,25.49
406757,571828,20712,JUMBO BAG WOODLAND ANIMALS,3,10/19/2011 11:52,2.08,16440.0,United Kingdom,6.24
296436,562893,23238,SET OF 4 KNICK KNACK TINS LONDON,6,8/10/2011 12:25,4.15,17442.0,United Kingdom,24.9


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Quantity,530104.0,10.542037,155.524124,1.0,1.0,3.0,10.0,80995.0
UnitPrice,530104.0,3.907625,35.915681,0.001,1.25,2.08,4.13,13541.33
CustomerID,397884.0,15294.423453,1713.14156,12346.0,13969.0,15159.0,16795.0,18287.0
Revenue,530104.0,20.121871,270.356743,0.001,3.75,9.9,17.7,168469.6


Unnamed: 0,count,unique,top,freq
InvoiceNo,530104,19960,573585,1114
StockCode,530104,3922,85123A,2265
Description,530104,4026,WHITE HANGING HEART T-LIGHT HOLDER,2323
InvoiceDate,530104,18499,10/31/2011 14:41,1114
Country,530104,38,United Kingdom,485123


## (Optional) Cleaning: Drop missing CustomerID

In [4]:
# Optional cleaning step: keep only rows that carry a CustomerID. When the
# column is absent the frame is left untouched. The resulting shape is
# printed either way.
df = df.dropna(subset=['CustomerID']) if 'CustomerID' in df.columns else df
print('After dropping missing CustomerID:', df.shape)

After dropping missing CustomerID: (397884, 9)


## (Optional) Cleaning: Fill missing Description

In [5]:
# Optional cleaning step: replace missing product descriptions with a
# placeholder. The fill only runs when both the Description and StockCode
# columns exist.
if 'Description' in df.columns and 'StockCode' in df.columns:
    df['Description'] = df['Description'].fillna('No description')

# Number of descriptions still missing (0 after a successful fill). Guarded
# so the cell shows nothing instead of raising KeyError when the column is
# absent, e.g. when re-run after the column names were standardized.
df['Description'].isna().sum() if 'Description' in df.columns else None

np.int64(0)

## (Optional) Cleaning: Standardize Column Names

In [6]:
# Normalise every header: trim surrounding whitespace, lower-case it and
# turn inner spaces into underscores. The last line displays the result.
df.columns = list(map(lambda col: col.strip().lower().replace(' ', '_'), df.columns))
df.columns

Index(['invoiceno', 'stockcode', 'description', 'quantity', 'invoicedate',
       'unitprice', 'customerid', 'country', 'revenue'],
      dtype='object')

# Online Retail (UCI) – Initial EDA by Aidan

This notebook begins the data analysis for the **Online Retail** dataset from the UCI Machine Learning Repository.

**Dataset:** https://archive.ics.uci.edu/ml/datasets/Online+Retail  
**Direct download (Excel):** https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx


## Project kickoff: scope & questions
**Five questions to explore (later in the project):**
1. Which products generate the most revenue (top-10 SKUs)?  
2. How do sales trend over time (monthly/seasonal)?  
3. Customer segmentation via RFM (Recency, Frequency, Monetary).  
4. Which countries (outside the UK) contribute most to international revenue?  
5. What’s the relationship between unit price and quantity sold?

This notebook focuses on **initial exploratory analysis** to understand the dataset’s structure and quality.


## Setup
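If you're in Google Colab and plan to load the Excel (`.xlsx`) file, install an Excel engine such as `openpyxl` (for example `%pip install openpyxl`) before running the Data Import cell; CSV input needs no extra setup.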


## Load data
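The Data Import cell near the top of this notebook loads `online_retail_sample_10k_clean.csv` from the working directory when it is present; otherwise it asks for an upload (Colab) or a file path (local Jupyter). To work with the full UCI dataset, download it from the link above and supply that file when prompted.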


## Basic properties
Use core inspection methods to understand the dataset:
- `describe()`  
- `columns`  
- `shape`  
- `dtypes`  
- `head()`, `tail()`, `sample()`  
- `info()`


## Quick data hygiene checks
Parse dates and look at missing values and duplicates.


## Quick peeks
Some fast frequency tables and sanity checks.


## Save a working copy (optional)
Save a CSV to include with your submission or for faster reloads next time.


---
## Submission checklist (for this stage)
- Your **Jupyter Notebook** (this file) with markdown documentation and initial EDA cells executed.  
- Your **dataset file** (Excel) or the saved CSV copy.
