In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv('../Dataset/online_retail_II.csv')

In [8]:
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067371 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   Invoice      1067371 non-null  object 
 1   StockCode    1067371 non-null  object 
 2   Description  1062989 non-null  object 
 3   Quantity     1067371 non-null  int64  
 4   InvoiceDate  1067371 non-null  object 
 5   Price        1067371 non-null  float64
 6   Customer ID  824364 non-null   float64
 7   Country      1067371 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 65.1+ MB


In [11]:
df.isnull().sum()

Invoice             0
StockCode           0
Description      4382
Quantity            0
InvoiceDate         0
Price               0
Customer ID    243007
Country             0
dtype: int64

In [13]:
df.shape

(1067371, 8)

In [None]:
df.dropna(subset=['Description'], inplace=True)  ## Dropping rows with null Description

In [18]:
# Step 1: Fill nulls with label "GUEST"
df['Customer ID'] = df['Customer ID'].fillna('GUEST')

# Step 2: Convert to string (so that numeric IDs don't look like floats)
df['Customer ID'] = df['Customer ID'].astype(str)

# Step 3: Convert to categorical (saves memory & makes sense semantically)
df['Customer ID'] = df['Customer ID'].astype('category')

In [None]:
# Converting Invoice to categorical
df['Invoice'] = df['Invoice'].astype('category') 

In [None]:
# Converting InvoiceDate to datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

In [26]:
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [33]:
count_invalid_quantity = len(df[df['Quantity'] <= 0])
print(f'Number of rows with invalid Quantity: {count_invalid_quantity}')

Number of rows with invalid Quantity: 20261


In [34]:
df = df[df['Quantity']
    > 0]  # Keep only rows with positive Quantity

In [36]:
count_invalid_price = len(df[df['Price'] <= 0])
print(f'Number of rows with invalid Price: {count_invalid_price}')

Number of rows with invalid Price: 1057


In [None]:
df = df[df['Price'] > 0]  # Keep only rows with positive Price

In [51]:
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Quantity,1041671.0,10.963448,1.0,1.0,3.0,10.0,80995.0,126.51493
InvoiceDate,1041671.0,2011-01-03 16:31:26.403269376,2009-12-01 07:45:00,2010-07-12 10:26:00,2010-12-07 15:33:00,2011-07-24 12:05:00,2011-12-09 12:50:00,
Price,1041671.0,4.077038,0.001,1.25,2.1,4.13,25111.09,51.448979


In [49]:
min_price = df['Price'].min()
print(f"The minimum price is: {min_price}")

The minimum price is: 0.001


In [46]:
outlier_quantity = len(df[df['Quantity'] > 10])
print(f'Number of rows with Quantity > 10: {outlier_quantity}')

Number of rows with Quantity > 10: 259951


Original shape: (1041671, 8)
New shape after outlier removal: (930330, 8)


In [62]:


# Create a copy of your DataFrame to store the cleaned data
df_cleaned = df.copy()

# --- Remove Outliers from 'Quantity' ---
Q1_quantity = df_cleaned['Quantity'].quantile(0.25)
Q3_quantity = df_cleaned['Quantity'].quantile(0.75)
IQR_quantity = Q3_quantity - Q1_quantity
upper_bound_quantity = Q3_quantity + 1.5 * IQR_quantity

df_cleaned = df_cleaned[df_cleaned['Quantity'] <= upper_bound_quantity]


# --- Remove Outliers from 'Price' ---
Q1_price = df_cleaned['Price'].quantile(0.25)
Q3_price = df_cleaned['Price'].quantile(0.75)
IQR_price = Q3_price - Q1_price
upper_bound_price = Q3_price + 1.5 * IQR_price

df_cleaned = df_cleaned[df_cleaned['Price'] <= upper_bound_price]


# Display the shape of the DataFrame to see the effect of the cleaning
print(f"Original DataFrame shape: {df.shape}")
print(f"Cleaned DataFrame shape: {df_cleaned.shape}")

Original DataFrame shape: (1041671, 8)
Cleaned DataFrame shape: (866736, 8)


In [63]:
df_cleaned.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Quantity,866736.0,4.876884,1.0,1.0,3.0,8.0,23.0,4.460413
InvoiceDate,866736.0,2011-01-04 20:44:41.913108480,2009-12-01 07:45:00,2010-07-13 12:33:00,2010-12-07 18:36:00,2011-07-25 16:57:00,2011-12-09 12:50:00,
Price,866736.0,2.82503,0.001,1.25,2.1,3.75,8.7,2.040362
