In [None]:
# Customer Segmentation Dashboard: Phase 2 - Data Cleaning and Preprocessing

**Objective**: Clean and preprocess the e-commerce data for RFM analysis.

**Steps** (after loading the raw transactions from SQLite):
1. Remove duplicates.
2. Handle missing values (drop rows without a CustomerID).
3. Convert dates to datetime.
4. Calculate total spend (Quantity * UnitPrice).
5. Save cleaned data.

In [1]:
import sqlite3
from pathlib import Path

import pandas as pd

# Open the project database; the connection stays open because the save
# step at the end of the notebook writes back through it.
db_path = '../ecommerce_data.db'
conn = sqlite3.connect(db_path)

# Read every column of the raw transactions table into a DataFrame
query = 'SELECT * FROM transactions'
df = pd.read_sql_query(sql=query, con=conn)
print("Loaded data shape:", df.shape)
df.head()

Loaded data shape: (541909, 8)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [None]:
## Step 1: Remove Duplicates

In [2]:
# Discard rows that are exact copies of an earlier row and report the count
rows_before = len(df)
df = df.drop_duplicates()
rows_after = len(df)
print(f'Removed {rows_before - rows_after} duplicate rows.')
df.head()

Removed 5268 duplicate rows.


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [None]:
## Step 2: Handle Missing Values

In [3]:
# Tally the nulls in each column before deciding what to drop
null_counts = df.isna().sum()
print('Missing values per column:')
print(null_counts)

# A row without a CustomerID cannot be attributed to a customer, so it is
# unusable for RFM; other columns are left as they are.
rows_before = len(df)
df = df.dropna(subset=['CustomerID'])
print(f'Removed {rows_before - len(df)} rows with missing CustomerID.')

Missing values per column:
InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135037
Country             0
dtype: int64
Removed 135037 rows with missing CustomerID.


In [None]:
## Step 3: Convert Dates to Datetime

In [4]:
# Convert InvoiceDate to datetime using an explicit format rather than
# per-run inference. The raw strings look like '12/1/2010 8:26'; the format
# below reads them month-first, which is the same interpretation pandas
# applies by default, so results are unchanged while the day/month order is
# now pinned and any row that deviates raises instead of being guessed at.
# NOTE(review): assumes every row follows this layout (no seconds, no AM/PM
# marker) - confirm against the full table.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='%m/%d/%Y %H:%M')
df.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object

In [None]:
## Step 4: Calculate Total Spend

In [5]:
# Row-level spend is the quantity multiplied by the unit price
df['TotalSpend'] = df['Quantity'].mul(df['UnitPrice'])
df.loc[:, ['Quantity', 'UnitPrice', 'TotalSpend']].head()

Unnamed: 0,Quantity,UnitPrice,TotalSpend
0,6,2.55,15.3
1,6,3.39,20.34
2,8,2.75,22.0
3,6,3.39,20.34
4,6,3.39,20.34


In [None]:
## Step 5: Save Cleaned Data

Save the cleaned DataFrame to a new table in SQLite and as a CSV for transparency.

In [6]:
# Persist the cleaned frame twice: as a SQLite table and as a CSV file.
# The writes sit inside try/finally so the connection opened when the data
# was loaded is closed even if one of them raises.
csv_path = Path('../data/transactions_clean.csv')
try:
    # Save cleaned data to SQLite, replacing the table from any earlier run
    df.to_sql('transactions_clean', conn, if_exists='replace', index=False)

    # Save to CSV; to_csv does not create missing folders, so make sure the
    # target directory exists first
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(csv_path, index=False)
    print('Cleaned data saved to SQLite and CSV.')
finally:
    # Close connection
    conn.close()

Cleaned data saved to SQLite and CSV.


In [None]:
## Summary

- Duplicates removed.
- Missing CustomerID rows dropped.
- InvoiceDate converted to datetime.
- TotalSpend calculated.
- Cleaned data saved for next phase.