#  #1 Data Understanding

#### In this notebook , I will:
- Load the retail sales dataset
- Explore its structure
- Check for missing values
- Look for duplicates
- Summarize basic statistics


In [5]:
# 1.Import Libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Displaying settings for bettwe readability
pd.set_option('display.max_columns', None)

In [6]:
# 2. Import Dataset
df = pd.read_csv(r"C:\Users\ASUS\Desktop\FINAL PROJECT FOR PLACEMENT\Retail_EDA_project\data\retail_sales.csv")
df.head()


Unnamed: 0,customer_id,order_date,product_id,category_id,category_name,product_name,quantity,price,payment_method,city,review_score,gender,age
0,13542,2024-12-17,784,10,Electronics,Smartphone,2,373.36,Credit Card,New Oliviaberg,1.0,F,56
1,23188,2024-06-01,682,50,Sports & Outdoors,Soccer Ball,5,299.34,Credit Card,Port Matthew,,M,59
2,55098,2025-02-04,684,50,Sports & Outdoors,Tent,5,23.0,Credit Card,West Sarah,5.0,F,64
3,65208,2024-10-28,204,40,Books & Stationery,Story Book,2,230.11,Bank Transfer,Hernandezburgh,5.0,M,34
4,63872,2024-05-10,202,20,Fashion,Skirt,4,176.72,Credit Card,Jenkinshaven,1.0,F,33


In [7]:
# 3. Dataset Shape

print('Dataset shape:', df.shape)


Dataset shape: (1000, 13)


In [8]:
# 4.Column Information and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   customer_id     1000 non-null   int64  
 1   order_date      1000 non-null   object 
 2   product_id      1000 non-null   int64  
 3   category_id     1000 non-null   int64  
 4   category_name   1000 non-null   object 
 5   product_name    1000 non-null   object 
 6   quantity        1000 non-null   int64  
 7   price           1000 non-null   float64
 8   payment_method  1000 non-null   object 
 9   city            1000 non-null   object 
 10  review_score    799 non-null    float64
 11  gender          897 non-null    object 
 12  age             1000 non-null   int64  
dtypes: float64(2), int64(5), object(6)
memory usage: 101.7+ KB


In [9]:
# 5 Missing Values
df.isnull().sum()

customer_id         0
order_date          0
product_id          0
category_id         0
category_name       0
product_name        0
quantity            0
price               0
payment_method      0
city                0
review_score      201
gender            103
age                 0
dtype: int64

In [10]:
#  6. Duplicate Rows
print("Duplicate Rows :- ", df.duplicated().sum())

Duplicate Rows :-  0


In [11]:
# 7. Statistical Summary
df.describe()

Unnamed: 0,customer_id,product_id,category_id,quantity,price,review_score,age
count,1000.0,1000.0,1000.0,1000.0,1000.0,799.0,1000.0
mean,55490.723,540.726,30.03,2.947,251.85066,3.992491,46.382
std,25910.185857,261.737704,14.370303,1.413573,139.194688,1.239469,16.569992
min,10201.0,100.0,10.0,1.0,10.72,1.0,18.0
25%,33857.0,311.75,20.0,2.0,128.525,3.0,32.0
50%,54619.5,542.5,30.0,3.0,250.22,4.0,47.0
75%,77848.5,770.75,40.0,4.0,366.4675,5.0,61.0
max,99923.0,995.0,50.0,5.0,499.5,5.0,75.0


In [12]:
# 8. Unique Value Counts (Categorical columns)
categorical_cols=['category_name','payment_method','city','gender']

for col in categorical_cols:
    print(f'{col} -> {df[col].nunique()} unique values')
    print(df[col].unique()[:10],"\n")

#  It shows first 10 unique values of each categorical column

category_name -> 5 unique values
['Electronics' 'Sports & Outdoors' 'Books & Stationery' 'Fashion'
 'Home & Living'] 

payment_method -> 3 unique values
['Credit Card' 'Bank Transfer' 'Cash on Delivery'] 

city -> 962 unique values
['New Oliviaberg' 'Port Matthew' 'West Sarah' 'Hernandezburgh'
 'Jenkinshaven' 'East Tonyaberg' 'North Jessicabury' 'Aliciaberg'
 'West Larrymouth' 'Lake Ian'] 

gender -> 2 unique values
['F' 'M' nan] 



In [13]:
# 9. Quick Data Preview

# Fist 5 rows of the dataset
print("First 5 rows of the dataset")
display(df.head())

# Last 5 rows of the dataset
print("Last 5 rows of the dataset")
display(df.tail())

First 5 rows of the dataset


Unnamed: 0,customer_id,order_date,product_id,category_id,category_name,product_name,quantity,price,payment_method,city,review_score,gender,age
0,13542,2024-12-17,784,10,Electronics,Smartphone,2,373.36,Credit Card,New Oliviaberg,1.0,F,56
1,23188,2024-06-01,682,50,Sports & Outdoors,Soccer Ball,5,299.34,Credit Card,Port Matthew,,M,59
2,55098,2025-02-04,684,50,Sports & Outdoors,Tent,5,23.0,Credit Card,West Sarah,5.0,F,64
3,65208,2024-10-28,204,40,Books & Stationery,Story Book,2,230.11,Bank Transfer,Hernandezburgh,5.0,M,34
4,63872,2024-05-10,202,20,Fashion,Skirt,4,176.72,Credit Card,Jenkinshaven,1.0,F,33


Last 5 rows of the dataset


Unnamed: 0,customer_id,order_date,product_id,category_id,category_name,product_name,quantity,price,payment_method,city,review_score,gender,age
995,67967,2024-05-04,965,40,Books & Stationery,Notebook,3,495.24,Cash on Delivery,Hodgemouth,,,30
996,99828,2024-09-12,510,40,Books & Stationery,Story Book,5,427.73,Credit Card,Douglastown,3.0,F,72
997,92290,2024-11-06,445,10,Electronics,Smartphone,5,354.64,Bank Transfer,New Amberville,,M,49
998,61427,2024-09-17,410,10,Electronics,Laptop,4,221.54,Cash on Delivery,New Sean,3.0,M,71
999,20658,2024-11-06,177,40,Books & Stationery,Pen,3,196.97,Cash on Delivery,North Kelsey,1.0,M,34
