In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
customers_file = r'C:\Users\akash\Downloads\Customers.csv'
customers_df = pd.read_csv(customers_file)

# Step 1: Initial Data Exploration
print("Dataset Overview:")
print(customers_df.head())
print("\nDataset Info:")
print(customers_df.info())
print("\nMissing Values:")
print(customers_df.isnull().sum())

# Step 2: Descriptive Statistics
print("\nDescriptive Statistics:")
print(customers_df.describe())

# Step 3: Visualizations
# Age distribution
plt.figure(figsize=(8, 5))
sns.histplot(customers_df['Age'], kde=True, bins=20, color='blue')
plt.title('Age Distribution of Customers')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()

# Gender distribution
plt.figure(figsize=(6, 4))
sns.countplot(x='Gender', data=customers_df, palette='viridis')
plt.title('Gender Distribution of Customers')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.show()

# Customer region analysis
plt.figure(figsize=(10, 6))
customers_df['Region'].value_counts().plot(kind='bar', color='green')
plt.title('Customer Distribution by Region')
plt.xlabel('Region')
plt.ylabel('Number of Customers')
plt.show()

# Step 4: Business Insights
insights = [
    "1. The average age of customers is {} years, with most customers between {} and {} years old.".format(
        int(customers_df['Age'].mean()), int(customers_df['Age'].min()), int(customers_df['Age'].max())
    ),
    "2. Gender distribution shows that {}% of the customers are {} and {}% are {}.".format(
        round((customers_df['Gender'].value_counts(normalize=True) * 100)['Female'], 2), 'Female',
        round((customers_df['Gender'].value_counts(normalize=True) * 100)['Male'], 2), 'Male'
    ),
    "3. The region with the highest number of customers is '{}', accounting for {} customers.".format(
        customers_df['Region'].value_counts().idxmax(), customers_df['Region'].value_counts().max()
    ),
    "4. {} customers have missing or invalid age data, which may need further investigation.".format(
        customers_df['Age'].isnull().sum()
    ),
    "5. Analyzing the customer regions reveals potential areas for regional-specific marketing strategies."
]

print("\nBusiness Insights:")
for insight in insights:
    print(insight)


Dataset Overview:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB
None

Missing Values:
CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64

Descriptive Statistics:
       CustomerID      CustomerName         Region  SignupDate
count         20

KeyError: 'Age'

<Figure size 800x500 with 0 Axes>

In [6]:
import pandas as pd

# Load the datasets
products_path = r'C:\Users\akash\Downloads\Products.csv'
customers_path = r'C:\Users\akash\Downloads\Customers.csv'
transactions_path = r'C:\Users\akash\Downloads\Transactions.csv'

# Read the datasets
products_df = pd.read_csv(products_path)
customers_df = pd.read_csv(customers_path)
transactions_df = pd.read_csv(transactions_path)

# Display basic information about each dataset
products_info = products_df.info(), products_df.head()
customers_info = customers_df.info(), customers_df.head()
transactions_info = transactions_df.info(), transactions_df.head()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductID    100 non-null    object 
 1   ProductName  100 non-null    object 
 2   Category     100 non-null    object 
 3   Price        100 non-null    float64
dtypes: float64(1), object(3)
memory usage: 3.3+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  


In [5]:
products_info, customers_info, transactions_info

((None,
    ProductID              ProductName     Category   Price
  0      P001     ActiveWear Biography        Books  169.30
  1      P002    ActiveWear Smartwatch  Electronics  346.30
  2      P003  ComfortLiving Biography        Books   44.12
  3      P004            BookWorld Rug   Home Decor   95.69
  4      P005          TechPro T-Shirt     Clothing  429.31),
 (None,
    CustomerID        CustomerName         Region  SignupDate
  0      C0001    Lawrence Carroll  South America  2022-07-10
  1      C0002      Elizabeth Lutz           Asia  2022-02-13
  2      C0003      Michael Rivera  South America  2024-03-07
  3      C0004  Kathleen Rodriguez  South America  2022-10-09
  4      C0005         Laura Weber           Asia  2022-08-15),
 (None,
    TransactionID CustomerID ProductID      TransactionDate  Quantity  \
  0        T00001      C0199      P067  2024-08-25 12:38:23         1   
  1        T00112      C0146      P067  2024-05-27 22:23:54         1   
  2        T00166    

# analysis

In [7]:
# Merge datasets for analysis

# Merge transactions with products
merged_df = transactions_df.merge(products_df, on="ProductID", how="left")

# Merge the resulting dataset with customers
merged_df = merged_df.merge(customers_df, on="CustomerID", how="left")

# Convert dates to datetime format for better analysis
merged_df["TransactionDate"] = pd.to_datetime(merged_df["TransactionDate"])
merged_df["SignupDate"] = pd.to_datetime(merged_df["SignupDate"])

# Check the structure and summary of the merged data
merged_summary = merged_df.info(), merged_df.head(), merged_df.describe(include="all")
merged_summary


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   TransactionID    1000 non-null   object        
 1   CustomerID       1000 non-null   object        
 2   ProductID        1000 non-null   object        
 3   TransactionDate  1000 non-null   datetime64[ns]
 4   Quantity         1000 non-null   int64         
 5   TotalValue       1000 non-null   float64       
 6   Price_x          1000 non-null   float64       
 7   ProductName      1000 non-null   object        
 8   Category         1000 non-null   object        
 9   Price_y          1000 non-null   float64       
 10  CustomerName     1000 non-null   object        
 11  Region           1000 non-null   object        
 12  SignupDate       1000 non-null   datetime64[ns]
dtypes: datetime64[ns](2), float64(3), int64(1), object(7)
memory usage: 101.7+ KB


(None,
   TransactionID CustomerID ProductID     TransactionDate  Quantity  \
 0        T00001      C0199      P067 2024-08-25 12:38:23         1   
 1        T00112      C0146      P067 2024-05-27 22:23:54         1   
 2        T00166      C0127      P067 2024-04-25 07:38:55         1   
 3        T00272      C0087      P067 2024-03-26 22:55:37         2   
 4        T00363      C0070      P067 2024-03-21 15:10:10         3   
 
    TotalValue  Price_x                      ProductName     Category  Price_y  \
 0      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
 1      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
 2      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
 3      601.36   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
 4      902.04   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
 
       CustomerName         Region SignupDate  
 0   Andrea Jenkins    

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score

# Normalize the features
scaler = StandardScaler()
normalized_features = scaler.fit_transform(customer_features_clustering)

# Perform KMeans clustering with a range of cluster numbers (2 to 10) to find the best DB Index
best_db_index = float("inf")
best_k = None
best_model = None

for k in range(2, 11):  # Testing clusters from 2 to 10
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(normalized_features)
    db_index = davies_bouldin_score(normalized_features, labels)
    
    if db_index < best_db_index:
        best_db_index = db_index
        best_k = k
        best_model = kmeans

# Store the best results
best_labels = best_model.labels_
best_k, best_db_index


NameError: name 'customer_features_clustering' is not defined

In [9]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Reduce dimensions to 2D using PCA for visualization
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(normalized_features)

# Plot the clusters
plt.figure(figsize=(10, 7))
for cluster_id in range(best_k):
    cluster_points = reduced_features[best_labels == cluster_id]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f"Cluster {cluster_id + 1}", s=50)

plt.title("Customer Segmentation: Cluster Visualization", fontsize=14)
plt.xlabel("PCA Component 1", fontsize=12)
plt.ylabel("PCA Component 2", fontsize=12)
plt.legend(title="Clusters", fontsize=10)
plt.grid(alpha=0.3)
plt.show()


NameError: name 'normalized_features' is not defined