<a href="https://colab.research.google.com/github/akshaykumar2972/POC2025/blob/main/Processing_data_for_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the prepared customer dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='df_Clustering.csv')

## **Clustering**

#### **Dropping columns that add no value for clustering and were created only to understand the dataset**

In [None]:
# Remove the identifier, date and derived helper columns that are not used as
# clustering inputs. Each label is listed exactly once (the original list
# repeated 'Age_Group'), and `columns=` already implies axis=1, so no separate
# `axis` argument is passed.
# NOTE: this mutates `df` in place; re-running the cell without reloading the
# CSV raises KeyError because the columns are already gone.
df.drop(
    columns=[
        'CustomerId',
        'EnrolledDate',
        'Year_Birth',
        'Age_Group',
        'TotalAmountSpent',
        'TotalPurchasesCount',
        'AverageSpendPerPurchase',
        'Recency',
        'Frequency',
        'Monetary',
        'NumberOfDaysAsCustomer',
        'OfferAccepted',
        'IncomeSegment',
    ],
    inplace=True,
)

In [None]:
# Preview: first five rows, first fifteen columns.
df.head(5).iloc[:, :15]

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,DaysSinceLastPurchase,AmountSpentOnWine,AmountSpentOnFruits,AmountSpentOnMeatProducts,AmountSpentOnFishProducts,AmountSpentOnSweetProducts,AmountSpentOnGoldProducts,PurchasesMadeWithDiscounts,PurchasesMadeViaWeb,PurchasesMadeViaCatalog
0,PhD,Married,55158.0,1,1,72,293,0,87,4,11,23,4,7,2
1,2n Cycle,Married,52203.0,0,0,36,488,21,238,56,108,28,1,8,7
2,2n Cycle,Married,7500.0,1,0,98,5,17,17,13,14,34,4,2,1
3,Graduation,Married,7500.0,1,0,19,7,0,12,13,7,32,5,4,1
4,Master,Married,82576.0,0,0,66,1206,55,445,168,18,18,1,2,4


In [None]:
# Preview: first five rows, every column from position 15 onwards.
df.head(5).iloc[:, 15:]

Unnamed: 0,PurchasesMadeDirectly,NumberOfVisitsToWebsitePerMonth,IsCampaign3Accepted,IsCampaign4Accepted,IsCampaign5Accepted,IsCampaign1Accepted,IsCampaign2Accepted,HasComplainedInLast2Months,IsLastCampaignAccepted,Age,RFM_Score,CustomerSegment,MonthStart,MonthLabel,RFM_Segment
0,5,7,0,0,0,0,0,0,1,55,8,Potential Loyalists,2012-07-01,Jul - 2012,Others
1,11,6,0,0,0,0,0,0,0,75,13,Champions,2012-07-01,Jul - 2012,Champions
2,3,9,0,0,0,0,0,0,0,40,5,At Risk,2012-08-01,Aug - 2012,Sleeping
3,2,9,1,0,0,0,0,0,1,49,9,Loyal Customers,2012-08-01,Aug - 2012,New Customers
4,12,1,0,0,1,0,0,0,0,66,11,Loyal Customers,2012-08-01,Aug - 2012,Loyal


### **Encoding Categorical Variables**

In [None]:
# Ordinal rank for each education level, from lowest (1) to highest (5).
education_order = {
    level: rank
    for rank, level in enumerate(
        ('Basic', '2n Cycle', 'Graduation', 'Master', 'PhD'), start=1
    )
}
# Labels missing from the mapping become NaN.
# NOTE(review): 'Education' is dropped in the very next cell, so this encoding
# is currently unused downstream - confirm whether it should be kept.
df['Education'] = df['Education'].map(education_order)

In [None]:
# Discard the Education column; it is not used as a clustering feature.
df = df.drop(columns='Education')

In [None]:
# One-hot encode Marital_Status (dropping the first category), then expose the
# 'Single' indicator as an integer column named IsSingle.
# NOTE(review): this relies on get_dummies producing 'Marital_Status_Single',
# i.e. 'Single' must be present and must not be the first (dropped) category.
df = (
    pd.get_dummies(df, columns=['Marital_Status'], drop_first=True)
    .astype({'Marital_Status_Single': int})
    .rename(columns={'Marital_Status_Single': 'IsSingle'})
)

### **Feature Transformation: Scaling the features using StandardScaler**

In [None]:
# Columns passed through unscaled (household counts and 0/1 indicators).
flag_cols = [
    'Kidhome',
    'Teenhome',
    'IsCampaign1Accepted',
    'IsCampaign2Accepted',
    'IsCampaign3Accepted',
    'IsCampaign4Accepted',
    'IsCampaign5Accepted',
    'IsLastCampaignAccepted',
    'HasComplainedInLast2Months',
    'IsSingle',
]

# Continuous / count features that are standardised before clustering.
cols_to_scale = [
    'Income',
    'DaysSinceLastPurchase',
    'AmountSpentOnWine',
    'AmountSpentOnFruits',
    'AmountSpentOnMeatProducts',
    'AmountSpentOnFishProducts',
    'AmountSpentOnSweetProducts',
    'AmountSpentOnGoldProducts',
    'PurchasesMadeWithDiscounts',
    'PurchasesMadeViaWeb',
    'PurchasesMadeViaCatalog',
    'PurchasesMadeDirectly',
    'NumberOfVisitsToWebsitePerMonth',
    'Age',
]

In [None]:
# Add a 1-based surrogate key so the scaled and unscaled parts can be rejoined.
df = df.assign(Id=range(1, len(df) + 1))

In [None]:
# Split into two independent frames that share the Id key: one with the
# pass-through columns, one with the columns to be standardised.
flags_df = df.loc[:, ['Id', *flag_cols]].copy()
to_scale_df = df.loc[:, ['Id', *cols_to_scale]].copy()

In [None]:
# Preview the first five rows of the pass-through columns.
flags_df.iloc[:5]

Unnamed: 0,Id,Kidhome,Teenhome,IsCampaign1Accepted,IsCampaign2Accepted,IsCampaign3Accepted,IsCampaign4Accepted,IsCampaign5Accepted,IsLastCampaignAccepted,HasComplainedInLast2Months,IsSingle
0,1,1,1,0,0,0,0,0,1,0,0
1,2,0,0,0,0,0,0,0,0,0,0
2,3,1,0,0,0,0,0,0,0,0,0
3,4,1,0,0,0,1,0,0,1,0,0
4,5,0,0,0,0,0,0,1,0,0,0


In [None]:
# Preview the first five rows of the columns that will be standardised.
to_scale_df.iloc[:5]

Unnamed: 0,Id,Income,DaysSinceLastPurchase,AmountSpentOnWine,AmountSpentOnFruits,AmountSpentOnMeatProducts,AmountSpentOnFishProducts,AmountSpentOnSweetProducts,AmountSpentOnGoldProducts,PurchasesMadeWithDiscounts,PurchasesMadeViaWeb,PurchasesMadeViaCatalog,PurchasesMadeDirectly,NumberOfVisitsToWebsitePerMonth,Age
0,1,55158.0,72,293,0,87,4,11,23,4,7,2,5,7,55
1,2,52203.0,36,488,21,238,56,108,28,1,8,7,11,6,75
2,3,7500.0,98,5,17,17,13,14,34,4,2,1,3,9,40
3,4,7500.0,19,7,0,12,13,7,32,5,4,1,2,9,49
4,5,82576.0,66,1206,55,445,168,18,18,1,2,4,12,1,66


In [None]:
# Fit the scaler on the continuous features (Id excluded) and transform them
# in one step; the result is a plain NumPy array without column labels.
scaler = StandardScaler()
scaled_array = scaler.fit_transform(to_scale_df.loc[:, cols_to_scale])

In [None]:
# Wrap the scaled array back into a labelled frame. The source frame's index
# is reused so that later label-aligned assignments from `to_scale_df` (such as
# attaching 'Id') match row for row even if `df` does not carry a default
# 0..n-1 index; with a default index the result is unchanged.
scaled_df = pd.DataFrame(
    scaled_array,
    columns=cols_to_scale,
    index=to_scale_df.index,
)

In [None]:
# Re-attach the surrogate key; the assignment aligns on the row index.
scaled_df = scaled_df.assign(Id=to_scale_df['Id'])

In [None]:
# Rejoin the standardised features with the pass-through columns on Id
# (inner join, the pandas default).
df_final = scaled_df.merge(flags_df, on='Id')

In [None]:
# The surrogate key was only needed for the join; remove it from the output.
df_final = df_final.drop(columns='Id')

In [None]:
# Persist the clustering-ready table without writing the row index.
df_final.to_csv(path_or_buf='df_scaled_data_for_clustering.csv', index=False)