# Exploratory Data Analysis (EDA)
This notebook explores the **Customer Combined Dataset** to understand its structure, quality, 
and underlying patterns. We will generate descriptive statistics, visualize trends, and summarize findings.

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Apply seaborn's white-grid theme to every figure drawn below.
# `sns.set` is only a legacy alias of `sns.set_theme` and is documented as
# subject to removal, so call the preferred entry point directly.
sns.set_theme(style='whitegrid')

In [5]:
# Load the cleaned, combined customer dataset from the current working directory.
filename = "customer_combined_clean.csv"

# Fail fast when the file is missing. Merely printing a message would leave
# `df` undefined on a fresh kernel (or silently stale on a re-run), and every
# later cell would then fail or analyse the wrong data.
if not os.path.exists(filename):
    raise FileNotFoundError(f"❌ File '{filename}' not found in {os.getcwd()}")

df = pd.read_csv(filename)
print(f"✅ File '{filename}' loaded successfully!")
display(df.head())  # preview first 5 rows

✅ File 'customer_combined_clean.csv' loaded successfully!


Unnamed: 0,CustomerID,Age,Gender,Location,IncomeLevel,SignupDate,TransactionID,TransactionDate,Amount,ProductCategory,PaymentMethod,InteractionID,InteractionDate,Platform,InteractionType,Sentiment
0,0009fdd2-ae63-45ca-8d5b-d0ea98381f7b,21,Female,Lake George,Low,2020-11-09,86cd577d-4ffd-498d-94ce-e68e6cca8865,2023-10-26,389.69,Home & Garden,Bank Transfer,26af70c2-acba-461c-95b8-8200de6b154a,2024-02-16,Instagram,Share,Positive
1,0009fdd2-ae63-45ca-8d5b-d0ea98381f7b,21,Female,Lake George,Low,2020-11-09,86cd577d-4ffd-498d-94ce-e68e6cca8865,2023-10-26,389.69,Home & Garden,Bank Transfer,4d9427f2-30fe-4298-ab66-8490e29202b5,2024-04-22,Facebook,Share,Neutral
2,000c6bbd-533a-432d-922c-ab64197e71c5,25,Male,North Oliviaton,High,2019-11-06,0dc30dbb-8109-4fa8-a7e0-f7ae108075e7,2023-05-17,500.82,Electronics,Debit Card,fff3e62c-3c78-4883-9a34-1d8aad5c1582,2023-07-11,Instagram,Like,Negative
3,000c6bbd-533a-432d-922c-ab64197e71c5,25,Male,North Oliviaton,High,2019-11-06,0dc30dbb-8109-4fa8-a7e0-f7ae108075e7,2023-05-17,500.82,Electronics,Debit Card,1b2e64c0-80f5-40ed-8d19-072eeb7f2b23,2024-03-15,Instagram,Comment,Positive
4,000c6bbd-533a-432d-922c-ab64197e71c5,25,Male,North Oliviaton,High,2019-11-06,0dc30dbb-8109-4fa8-a7e0-f7ae108075e7,2023-05-17,500.82,Electronics,Debit Card,72c0bb82-9307-4ba4-8925-a5100d3d0e56,2024-03-12,Twitter,Like,Neutral


In [6]:
# Dataset dimensions: number of records and number of fields
print("Rows:", len(df))
print("Columns:", len(df.columns))

# Field names, in file order
print("\nColumns:", df.columns.tolist())

# Per-column dtype and non-null count (reveals where values are missing)
df.info()

Rows: 5729
Columns: 16

Columns: ['CustomerID', 'Age', 'Gender', 'Location', 'IncomeLevel', 'SignupDate', 'TransactionID', 'TransactionDate', 'Amount', 'ProductCategory', 'PaymentMethod', 'InteractionID', 'InteractionDate', 'Platform', 'InteractionType', 'Sentiment']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5729 entries, 0 to 5728
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerID       5729 non-null   object 
 1   Age              5729 non-null   int64  
 2   Gender           5729 non-null   object 
 3   Location         5729 non-null   object 
 4   IncomeLevel      5729 non-null   object 
 5   SignupDate       5517 non-null   object 
 6   TransactionID    5729 non-null   object 
 7   TransactionDate  5593 non-null   object 
 8   Amount           5729 non-null   float64
 9   ProductCategory  5729 non-null   object 
 10  PaymentMethod    5729 non-null   object 
 11  InteractionID    5729 

In [8]:
# Count the null entries in every column
print("Missing values per column:")
print(df.isna().sum())

Missing values per column:
CustomerID           0
Age                  0
Gender               0
Location             0
IncomeLevel          0
SignupDate         212
TransactionID        0
TransactionDate    136
Amount               0
ProductCategory      0
PaymentMethod        0
InteractionID        0
InteractionDate    143
Platform             0
InteractionType      0
Sentiment            0
dtype: int64


In [9]:
# Discard every record that lacks any one of the three key dates
df = df.dropna(
    axis=0,
    how='any',
    subset=['SignupDate', 'TransactionDate', 'InteractionDate'],
)

In [10]:
# Re-count nulls in the key date columns to confirm the removal worked
print("Missing values after removal:")
print(df.loc[:, ['SignupDate', 'TransactionDate', 'InteractionDate']].isna().sum())

Missing values after removal:
SignupDate         0
TransactionDate    0
InteractionDate    0
dtype: int64


In [13]:
# Summary statistics for every column (numeric and categorical alike),
# transposed so that each dataset column becomes one row of the summary
df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
CustomerID,5254.0,2847.0,c71e53bd-d52e-43c7-853a-bde70ff3cf38,16.0,,,,,,,
Age,5254.0,,,,44.806814,16.160713,0.0,33.0,45.0,57.0,100.0
Gender,5254.0,2.0,Female,2668.0,,,,,,,
Location,5254.0,2576.0,Port Austin,16.0,,,,,,,
IncomeLevel,5254.0,3.0,High,2149.0,,,,,,,
SignupDate,5254.0,1437.0,2024-06-08,31.0,,,,,,,
TransactionID,5254.0,2780.0,0dc30dbb-8109-4fa8-a7e0-f7ae108075e7,1483.0,,,,,,,
TransactionDate,5254.0,715.0,2023-05-17,1500.0,,,,,,,
Amount,5254.0,,,,498.17753,239.842238,0.0,385.905,500.82,613.005,999.86
ProductCategory,5254.0,5.0,Electronics,2544.0,,,,,,,


In [14]:
# Core statistics for the numeric columns: mean, min and max come from
# describe(); the median is appended as a final column
stats = (
    df.describe()
    .T[['mean', 'min', 'max']]
    .assign(median=df.median(numeric_only=True))
)
print(stats)


              mean  min     max  median
Age      44.806814  0.0  100.00   45.00
Amount  498.177530  0.0  999.86  500.82


In [18]:
# Count the entries that are exactly zero in each numeric column
zero_counts = df.select_dtypes(include=['number']).eq(0).sum()

print("Zero values in each numeric column:")
print(zero_counts)

Zero values in each numeric column:
Age       57
Amount    90
dtype: int64


In [19]:
# Quantify zeros in the numeric columns, both as an absolute count and as a
# share of all rows, to decide whether dropping them is acceptable
numeric_cols = df.select_dtypes(include=['number'])
zero_counts = numeric_cols.eq(0).sum()
zero_percentage = zero_counts.div(len(df)).mul(100)

# One row per numeric column: count first, then the rounded percentage
zero_summary = zero_counts.to_frame('Zero_Count').assign(
    Zero_Percentage=zero_percentage.round(2)
)

print("Zero values in numeric columns:")
print(zero_summary)

Zero values in numeric columns:
        Zero_Count  Zero_Percentage
Age             57             1.08
Amount          90             1.71


In [21]:
# Keep only the records whose Age and Amount are both strictly positive,
# which discards the zero-valued rows counted above
df = df.loc[df['Age'].gt(0) & df['Amount'].gt(0)]