## Importing all important libraries

In [70]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
import warnings
warnings.filterwarnings("ignore")

## Load the Data

In [72]:
##loading the given data
Snapdeal_data = pd.read_csv(r"C:\Users\DELL\Documents\Snapdeal Mentormind\Online_Retail.csv", encoding="latin1")

## Understand the Data

In [74]:
##understanding the basic information about the data
print("🔹 First 5 rows of data:")
print(Snapdeal_data.head(3))

print("\n🔹 Dataset Info:")
print(Snapdeal_data.info())

print("\n🔹 Statistical Summary:")
print(Snapdeal_data.describe())

🔹 First 5 rows of data:
  InvoiceNo StockCode                         Description  Quantity  \
0    536365    85123A  WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                 WHITE METAL LANTERN         6   
2    536365    84406B      CREAM CUPID HEARTS COAT HANGER         8   

        InvoiceDate  UnitPrice  CustomerID         Country  
0  01-12-2010 08:26       2.55     17850.0  United Kingdom  
1  01-12-2010 08:26       3.39     17850.0  United Kingdom  
2  01-12-2010 08:26       2.75     17850.0  United Kingdom  

🔹 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    5

## Checking Missing Values

In [76]:
##checking missing values
# Count total missing values per column
print(Snapdeal_data.isnull().sum())

# Percentage of missing values per column
print((Snapdeal_data.isnull().mean() * 100).round(2))


InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64
InvoiceNo       0.00
StockCode       0.00
Description     0.27
Quantity        0.00
InvoiceDate     0.00
UnitPrice       0.00
CustomerID     24.93
Country         0.00
dtype: float64


## Handling Missing Values

In [78]:
## Handle missing values
# Drop rows where all values are missing
Snapdeal_data.dropna(how="all", inplace=True)

## Clean the data

In [80]:
if "CustomerID" in Snapdeal_data.columns:
    Snapdeal_data["CustomerID"].fillna("Unknown", inplace=True)

# Example: Fill missing numerical values with median
for col in Snapdeal_data.select_dtypes(include=[np.number]).columns:
    Snapdeal_data[col].fillna(Snapdeal_data[col].median(), inplace=True)

# 3.2 Remove duplicate rows
Snapdeal_data.drop_duplicates(inplace=True)

# 3.3 Correct data types (example: InvoiceDate to datetime, CustomerID to string)
if "InvoiceDate" in Snapdeal_data.columns:
    Snapdeal_data["InvoiceDate"] = pd.to_datetime(Snapdeal_data["InvoiceDate"], errors="coerce")

if "CustomerID" in Snapdeal_data.columns:
    Snapdeal_data["CustomerID"] = Snapdeal_data["CustomerID"].astype(str)

print("\n✅ Cleaning Done. Updated Info:")
print(Snapdeal_data.info())


✅ Cleaning Done. Updated Info:
<class 'pandas.core.frame.DataFrame'>
Index: 536641 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    536641 non-null  object        
 1   StockCode    536641 non-null  object        
 2   Description  535187 non-null  object        
 3   Quantity     536641 non-null  int64         
 4   InvoiceDate  230656 non-null  datetime64[ns]
 5   UnitPrice    536641 non-null  float64       
 6   CustomerID   536641 non-null  object        
 7   Country      536641 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 36.8+ MB
None


## Transform the Data

In [82]:
##Transform the Data
# ------------------------------------------

# Normalize/Standardize numerical data
num_cols = Snapdeal_data.select_dtypes(include=[np.number]).columns
scaler = StandardScaler()
Snapdeal_data[num_cols] = scaler.fit_transform(Snapdeal_data[num_cols])

# Encode categorical variables
cat_cols = Snapdeal_data.select_dtypes(include=["object"]).columns
le = LabelEncoder()
for col in cat_cols:
    Snapdeal_data[col] = le.fit_transform(Snapdeal_data[col].astype(str))

print("\n✅ Transformation Done. Sample Data:")
print(Snapdeal_data.head())



✅ Transformation Done. Sample Data:
   InvoiceNo  StockCode  Description  Quantity         InvoiceDate  UnitPrice  \
0          0       3536         3918 -0.016520 2010-01-12 08:26:00  -0.021419   
1          0       2794         3926 -0.016520 2010-01-12 08:26:00  -0.012780   
2          0       3044          913 -0.007393 2010-01-12 08:26:00  -0.019362   
3          0       2985         1910 -0.016520 2010-01-12 08:26:00  -0.012780   
4          0       2984         2911 -0.016520 2010-01-12 08:26:00  -0.012780   

   CustomerID  Country  
0        4048       36  
1        4048       36  
2        4048       36  
3        4048       36  
4        4048       36  


## Feature Engineering

In [84]:
# Feature Engineering
# ------------------------------------------

# Create new features (example: total price = Quantity * UnitPrice)
if "Quantity" in Snapdeal_data.columns and "UnitPrice" in Snapdeal_data.columns:
    Snapdeal_data["TotalPrice"] = Snapdeal_data["Quantity"] * Snapdeal_data["UnitPrice"]

# 5.2 Feature selection (remove less useful columns, example: InvoiceNo)
if "InvoiceNo" in Snapdeal_data.columns:
    Snapdeal_data.drop(columns=["InvoiceNo"], inplace=True)

print("\n✅ Feature Engineering Done. Final Dataset Preview:")
print(Snapdeal_data.head())

# Save cleaned dataset
Snapdeal_data.to_csv("Cleaned_Online_Retail.csv", index=False)
print("\n💾 Cleaned data saved as 'Cleaned_Online_Retail.csv'")


✅ Feature Engineering Done. Final Dataset Preview:
   StockCode  Description  Quantity         InvoiceDate  UnitPrice  \
0       3536         3918 -0.016520 2010-01-12 08:26:00  -0.021419   
1       2794         3926 -0.016520 2010-01-12 08:26:00  -0.012780   
2       3044          913 -0.007393 2010-01-12 08:26:00  -0.019362   
3       2985         1910 -0.016520 2010-01-12 08:26:00  -0.012780   
4       2984         2911 -0.016520 2010-01-12 08:26:00  -0.012780   

   CustomerID  Country  TotalPrice  
0        4048       36    0.000354  
1        4048       36    0.000211  
2        4048       36    0.000143  
3        4048       36    0.000211  
4        4048       36    0.000211  

💾 Cleaned data saved as 'Cleaned_Online_Retail.csv'


## Save the output

In [94]:
##Save cleaned dataset to the specified path
Snapdeal_data.to_csv(
    r"C:\Users\DELL\Documents\Snapdeal Mentormind\Cleaned_Online_Retail2.csv",
    index=False
)

# Print confirmation message
print("✅ Output is saved")

✅ Output is saved
