# 5. Data Imputation in the Retail Sales Dataset

Importing the libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from fancyimpute import IterativeImputer

Loading the dataset

In [2]:
# Read the raw retail sales CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='datasets/retail_sales_dataset.csv')

In [3]:
# Show the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount
0,1,2023-11-24,CUST001,Male,34,Beauty,3,50,150
1,2,2023-02-27,CUST002,Female,26,Clothing,2,500,1000
2,3,2023-01-13,CUST003,Male,50,Electronics,1,30,30
3,4,2023-05-21,CUST004,Male,37,Clothing,1,500,500
4,5,2023-05-06,CUST005,Male,30,Beauty,2,50,100


Converting 'Date' to datetime and splitting it into numeric Year, Month and Day columns

In [9]:
# Parse the 'Date' strings into datetimes; errors='coerce' turns any value
# that cannot be parsed into NaT instead of raising.
parsed_dates = pd.to_datetime(df['Date'], errors='coerce')
df['Date'] = parsed_dates

In [10]:
# Split the datetime column into numeric Year / Month / Day features, then
# drop the original 'Date' column.
date_parts = df['Date'].dt
df = (
    df
    .assign(
        Year=date_parts.year,
        Month=date_parts.month,
        Day=date_parts.day,
    )
    .drop(columns=['Date'])
)

Encoding the categorical columns ('Gender' and 'Product Category') as integer labels

In [11]:
# Encode every categorical column with its own LabelEncoder so each mapping
# stays available afterwards (a single shared encoder only remembers the
# column it was fitted on last).
label_encoders = {}
for column in ['Gender', 'Product Category']:
    encoder = LabelEncoder()
    # Positional mask of the rows that actually hold a category.
    observed = df[column].notna().to_numpy()
    # Fit on observed values only: missing entries must stay NaN so the
    # imputers can fill them instead of treating "missing" as a category.
    codes = encoder.fit_transform(df[column].to_numpy()[observed])
    if observed.all():
        # Nothing missing: plain integer codes.
        df[column] = codes
    else:
        # Float column with NaN wherever the original value was missing.
        encoded = pd.Series(float('nan'), index=df.index)
        encoded.iloc[observed] = codes
        df[column] = encoded.to_numpy()
    label_encoders[column] = encoder

# Kept for backward compatibility: `label_encoder` is still bound and, as
# before, ends up fitted on 'Product Category'.
label_encoder = label_encoders['Product Category']

Dropping 'Customer ID', since it is a string identifier that the numeric imputers cannot use

In [12]:
# Remove the 'Customer ID' column from the frame.
df = df.drop('Customer ID', axis='columns')

In [13]:
# Show the first five rows after encoding and column removal.
df.head(n=5)

Unnamed: 0,Transaction ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount,Year,Month,Day
0,1,1,34,0,3,50,150,2023,11,24
1,2,0,26,1,2,500,1000,2023,2,27
2,3,1,50,2,1,30,30,2023,1,13
3,4,1,37,1,1,500,500,2023,5,21
4,5,1,30,0,2,50,100,2023,5,6


# Applying KNN Imputation

In [14]:
# Fill missing values with KNNImputer using the 5 nearest rows.
# NOTE(review): the features are not scaled here, so neighbour distances may
# be dominated by large-magnitude columns — consider scaling first; confirm.
knn_imputer = KNNImputer(n_neighbors=5)
knn_values = knn_imputer.fit_transform(df)
# fit_transform returns a bare array; restore both the column names and the
# original row index so the imputed frame stays aligned with `df`.
df_knn_imputed = pd.DataFrame(knn_values, columns=df.columns, index=df.index)

In [15]:
# Show the first five rows of the KNN-imputed frame.
df_knn_imputed.head(n=5)

Unnamed: 0,Transaction ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount,Year,Month,Day
0,1.0,1.0,34.0,0.0,3.0,50.0,150.0,2023.0,11.0,24.0
1,2.0,0.0,26.0,1.0,2.0,500.0,1000.0,2023.0,2.0,27.0
2,3.0,1.0,50.0,2.0,1.0,30.0,30.0,2023.0,1.0,13.0
3,4.0,1.0,37.0,1.0,1.0,500.0,500.0,2023.0,5.0,21.0
4,5.0,1.0,30.0,0.0,2.0,50.0,100.0,2023.0,5.0,6.0


# Applying MICE Imputation

In [16]:
# Fill missing values with IterativeImputer (MICE-style) using its default
# settings.
mice_imputer = IterativeImputer()
mice_values = mice_imputer.fit_transform(df)
# fit_transform returns a bare array; restore both the column names and the
# original row index so the imputed frame stays aligned with `df`.
df_mice_imputed = pd.DataFrame(mice_values, columns=df.columns, index=df.index)

In [17]:
# Show the first five rows of the MICE-imputed frame.
df_mice_imputed.head(n=5)

Unnamed: 0,Transaction ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount,Year,Month,Day
0,1.0,1.0,34.0,0.0,3.0,50.0,150.0,2023.0,11.0,24.0
1,2.0,0.0,26.0,1.0,2.0,500.0,1000.0,2023.0,2.0,27.0
2,3.0,1.0,50.0,2.0,1.0,30.0,30.0,2023.0,1.0,13.0
3,4.0,1.0,37.0,1.0,1.0,500.0,500.0,2023.0,5.0,21.0
4,5.0,1.0,30.0,0.0,2.0,50.0,100.0,2023.0,5.0,6.0
