In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the retail CSV into a DataFrame.
# NOTE(review): the leading backslash makes this look like a truncated or
# redacted absolute path - confirm the intended location before re-running.
raw_csv_path = r"\online_retail_II.csv"
df = pd.read_csv(raw_csv_path)

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [4]:
# Show the dtype pandas inferred for each column when reading the CSV.
df.dtypes

Invoice         object
StockCode       object
Description     object
Quantity         int64
InvoiceDate     object
Price          float64
Customer ID    float64
Country         object
dtype: object

In [5]:
# Count missing values per column in the raw frame.
df.isna().sum()

Invoice             0
StockCode           0
Description      4382
Quantity            0
InvoiceDate         0
Price               0
Customer ID    243007
Country             0
dtype: int64

In [6]:
# Standardize column names: trim whitespace, lower-case, and turn spaces and
# hyphens into underscores (e.g. "Customer ID" -> "customer_id").
normalized_columns = df.columns.str.strip().str.lower()
for separator in (' ', '-'):
    normalized_columns = normalized_columns.str.replace(separator, '_')
df.columns = normalized_columns

df.columns


Index(['invoice', 'stockcode', 'description', 'quantity', 'invoicedate',
       'price', 'customer_id', 'country'],
      dtype='object')

In [7]:
# Drop rows in which every column is missing, then show the resulting shape.
df = df.dropna(axis='index', how='all')
df.shape

(1067371, 8)

In [8]:
# Re-count missing values per column now that the column names are standardized.
df.isna().sum()


invoice             0
stockcode           0
description      4382
quantity            0
invoicedate         0
price               0
customer_id    243007
country             0
dtype: int64

In [9]:
# Parse invoice timestamps into datetime64.
# The values are year-first ("2009-12-01 07:45:00"), so the format is stated
# explicitly instead of passing dayfirst=True, which contradicts the data and
# leaves the day/month interpretation up to pandas' inference.
df['invoicedate'] = pd.to_datetime(
    df['invoicedate'],
    format="%Y-%m-%d %H:%M:%S",
    errors='coerce'  # values that do not match become NaT; check null counts afterwards
)


In [10]:
# Handle missing customer_id values.
# Record which rows are missing *before* converting, so no numeric sentinel
# (previously -1) has to stand in for "missing" - a sentinel would also
# relabel any genuine id equal to it.
customer_id = df['customer_id'].astype('float')
is_missing_customer = customer_id.isna()

# Nullable Int64 turns 13085.0 into 13085 while tolerating the gaps; unlike a
# plain astype(int) it raises on fractional ids instead of truncating them.
customer_id = customer_id.astype('Int64').astype(object)

# Label the missing rows with a readable string. The column ends up holding
# integers plus the text "Unknown" (object dtype), as before.
df['customer_id'] = customer_id.where(~is_missing_customer, "Unknown")


In [11]:
# Confirm customer_id no longer contributes to the missing-value counts.
df.isna().sum()

invoice           0
stockcode         0
description    4382
quantity          0
invoicedate       0
price             0
customer_id       0
country           0
dtype: int64

In [13]:
# Tag leakage types as boolean flag columns.
quantity = df['quantity']
invoice_text = df['invoice'].astype(str)

df['is_free_item'] = df['price'].eq(0)             # price is exactly zero
df['is_ghost_item'] = quantity.eq(0)               # quantity is exactly zero
df['is_return'] = quantity.lt(0)                   # negative quantity
df['is_cancel'] = invoice_text.str.startswith('C') # invoice number begins with 'C'

In [14]:
# Row/column count after the four flag columns were added.
df.shape

(1067371, 12)

In [15]:
# Largest value in the price column (quick range check).
df['price'].max()

38970.0

In [16]:
# Largest value in the quantity column (quick range check).
df['quantity'].max()

80995

In [17]:
def _most_common_description(descriptions):
    """Return the most frequent non-null description, or None if there is none."""
    known = descriptions.dropna()
    if known.empty:
        return None
    # mode() may return several values on a tie; keep the first one.
    return known.mode().iloc[0]


# Map each stockcode to its most common description.
desc_map = df.groupby('stockcode')['description'].agg(_most_common_description)

# Fill missing descriptions (real nulls or the literal text "nan") from the
# mapping. Series.map + Series.mask do this in vectorized form; a row-wise
# df.apply(axis=1) runs a Python call per row, which is very slow on a frame
# of this size. Stockcodes without any known description stay null here.
needs_description = df['description'].isna() | df['description'].eq("nan")
df['description'] = df['description'].mask(
    needs_description,
    df['stockcode'].map(desc_map),
)


In [18]:
# Count the descriptions that could not be recovered from the stockcode mapping.
df.isna().sum()

invoice            0
stockcode          0
description      363
quantity           0
invoicedate        0
price              0
customer_id        0
country            0
is_free_item       0
is_ghost_item      0
is_return          0
is_cancel          0
dtype: int64

In [23]:
# Give every description that is still missing an explicit placeholder, then
# verify that no nulls remain.
# NOTE(review): the recorded output of this cell lists total_price, which is
# only created in a later cell - the saved output appears to come from an
# out-of-order run. Restart the kernel and run all cells to refresh it.
placeholder = "MISSING DESCRIPTION"
df['description'] = df['description'].fillna(value=placeholder)
df.isna().sum()

invoice          0
stockcode        0
description      0
quantity         0
invoicedate      0
price            0
customer_id      0
country          0
is_free_item     0
is_ghost_item    0
is_return        0
is_cancel        0
total_price      0
dtype: int64

In [24]:
# Line total: quantity multiplied by price (negative when quantity is negative).
df['total_price'] = df['quantity'].mul(df['price'])

In [25]:
# Preview the first five rows, now including the flag and total_price columns.
df.head(n=5)

Unnamed: 0,invoice,stockcode,description,quantity,invoicedate,price,customer_id,country,is_free_item,is_ghost_item,is_return,is_cancel,total_price
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085,United Kingdom,False,False,False,False,83.4
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085,United Kingdom,False,False,False,False,81.0
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085,United Kingdom,False,False,False,False,81.0
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085,United Kingdom,False,False,False,False,100.8
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085,United Kingdom,False,False,False,False,30.0


In [26]:
# Normalize description text: trim surrounding whitespace and upper-case it.
# Only non-null entries are rewritten; calling astype(str) on a null would
# otherwise turn it into the literal text "nan" (and then "NAN"), hiding the
# gap from later null checks.
description = df['description']
df['description'] = description.where(
    description.isna(),
    description.astype(str).str.strip().str.upper(),
)
df.head()

Unnamed: 0,invoice,stockcode,description,quantity,invoicedate,price,customer_id,country,is_free_item,is_ghost_item,is_return,is_cancel,total_price
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085,United Kingdom,False,False,False,False,83.4
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085,United Kingdom,False,False,False,False,81.0
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085,United Kingdom,False,False,False,False,81.0
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085,United Kingdom,False,False,False,False,100.8
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085,United Kingdom,False,False,False,False,30.0


In [27]:
# Write the cleaned frame to CSV without the index column.
# NOTE(review): the output file name matches the raw input's file name; if both
# resolve to the same directory this overwrites the raw data and the notebook
# can no longer be re-run from scratch - confirm this is intended.
output_path = r"online_retail_II.csv"
df.to_csv(output_path, index=False)
print("Saved cleaned file!")

Saved cleaned file!
