In [1]:
# Import Libraries

In [2]:
import pandas as pd
import numpy as np


Load Datasets Using Pandas

In [4]:
# Read the main transactions CSV into a DataFrame, decoding the file as ISO-8859-1
raw_csv_path = 'data/raw_data.csv'
raw_data = pd.read_csv(raw_csv_path, encoding='ISO-8859-1')


Convert InvoiceDate to Datetime Format


In [5]:
# Convert date column to datetime format.
# errors='coerce' replaces any value that cannot be parsed with NaT instead of
# raising. NaT compares unequal to every period, so such rows would silently
# drop out of any date-based filter; count them and report when it happens.
missing_before = raw_data['InvoiceDate'].isna().sum()
raw_data['InvoiceDate'] = pd.to_datetime(raw_data['InvoiceDate'], errors='coerce')
unparsed_count = raw_data['InvoiceDate'].isna().sum() - missing_before
if unparsed_count:
    print(f"Warning: {unparsed_count} InvoiceDate values could not be parsed and were set to NaT")


Create Incremental Subset (Most Recent Month)

In [6]:
# Map every invoice timestamp to its calendar month once, find the most recent
# month, and keep only the rows that fall in it
invoice_month = raw_data['InvoiceDate'].dt.to_period('M')
latest_month = invoice_month.max()
incremental_data = raw_data[invoice_month == latest_month]


In [7]:
# Write the incremental subset into the data folder (row index omitted),
# then report how many rows and columns were written
incremental_csv_path = 'data/incremental_data.csv'
incremental_data.to_csv(incremental_csv_path, index=False)
print("Incremental dataset saved with shape:", incremental_data.shape)


Incremental dataset saved with shape: (25525, 8)


In [8]:
# Print the same three views (first rows, info summary, describe statistics)
# for each dataset, each under its own banner line.
overview_sections = [
    ("RAW DATA HEAD:", "\nRAW DATA INFO:", "\nRAW DATA DESCRIPTION:", raw_data),
    ("\nINCREMENTAL DATA HEAD:", "\nINCREMENTAL DATA INFO:", "\nINCREMENTAL DATA DESCRIPTION:", incremental_data),
]

for head_banner, info_banner, describe_banner, frame in overview_sections:
    print(head_banner)
    print(frame.head())

    print(info_banner)
    frame.info()  # info() prints its report itself, so it is not wrapped in print()

    print(describe_banner)
    print(frame.describe())


RAW DATA HEAD:
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  
0 2010-12-01 08:26:00       2.55     17850.0  United Kingdom  
1 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
2 2010-12-01 08:26:00       2.75     17850.0  United Kingdom  
3 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
4 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  

RAW DATA INFO:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype        

- This helps you understand what’s inside each dataset (column names, data types, missing values, etc.).

3. Identify and discuss at least three data quality issues

In [9]:
# Null count for each column
null_counts = raw_data.isnull().sum()
print("\nMissing values per column (Raw):")
print(null_counts)

# Number of rows that exactly repeat an earlier row
duplicate_row_count = raw_data.duplicated().sum()
print("\nNumber of duplicates (Raw):", duplicate_row_count)

# dtype of each column
column_types = raw_data.dtypes
print("\nData types:")
print(column_types)



Missing values per column (Raw):
InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

Number of duplicates (Raw): 5268

Data types:
InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object


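The data quality issues we found are:
- Missing values — `Description` has 1,454 missing entries and `CustomerID` has 135,080.
- Duplicate rows — 5,268 rows are exact repeats of an earlier row.
- Inconsistent types — `InvoiceDate` is read from the CSV as object (text) and had to be converted to datetime (done above, which is why it already shows as `datetime64[ns]` here); `CustomerID` is stored as float64 rather than as an identifier.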

In [10]:
 #  4. Merge datasets (append new records)

# Stack the incremental rows beneath the raw rows with a fresh index, then
# drop every row that exactly repeats an earlier one.
# NOTE(review): incremental_data is a slice of raw_data, so every appended row
# is removed again by drop_duplicates(), along with rows that were already
# duplicated inside raw_data.
combined_data = pd.concat(
    [raw_data, incremental_data],
    ignore_index=True,
).drop_duplicates()

print("Combined dataset shape:", combined_data.shape)

Combined dataset shape: (536641, 8)


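- We merge the incremental data into the raw dataset to have the most complete version.
- Dropping duplicates ensures we don’t double-count transactions.
- Note: because the incremental subset was taken from the raw dataset itself, the merge adds no new rows here; the combined row count (536,641) equals the raw row count (541,909) minus the 5,268 duplicate rows.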

In [11]:
# 5. Save validated copies to /data/
validated_outputs = (
    (raw_data, 'data/raw_validated.csv'),
    (combined_data, 'data/combined_validated.csv'),
)
for frame, csv_path in validated_outputs:
    # index=False keeps the row index out of the written file
    frame.to_csv(csv_path, index=False)

print("✅ Validated datasets saved successfully.")


✅ Validated datasets saved successfully.


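- Both the raw dataset (with parsed dates) and the de-duplicated combined dataset are now saved and ready for the transformation phase. Note that `raw_validated.csv` still contains the duplicate rows and missing values identified above.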