In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


In [3]:
# Load the raw transactions and convert the timestamp column to datetime in a
# single chained expression, so later date arithmetic works on real timestamps.
df = (
    pd.read_csv('./data/data.csv')
    .assign(
        TransactionStartTime=lambda raw: pd.to_datetime(raw['TransactionStartTime'])
    )
)


In [4]:
# Reference date for recency: one day after the latest transaction in the data,
# so the most recently active customer gets a recency of 1 day rather than 0.
snapshot_date = (
    df['TransactionStartTime'].max()
    + pd.Timedelta(days=1)
)


In [5]:
# Build one row per customer holding the three RFM features:
#   Recency   - whole days between snapshot_date and the customer's last transaction
#   Frequency - count of the customer's (non-null) TransactionId values
#   Monetary  - sum of Amount over the customer's transactions (can be negative,
#               as the first rows of the preview show)
rfm = (
    df.groupby('CustomerId')
    .agg(
        Recency=('TransactionStartTime', lambda ts: (snapshot_date - ts.max()).days),
        Frequency=('TransactionId', 'count'),
        Monetary=('Amount', 'sum'),
    )
    .reset_index()
)

rfm.head()


Unnamed: 0,CustomerId,Recency,Frequency,Monetary
0,CustomerId_1,84,1,-10000.0
1,CustomerId_10,84,1,-10000.0
2,CustomerId_1001,90,5,20000.0
3,CustomerId_1002,26,11,4225.0
4,CustomerId_1003,12,6,20000.0


In [6]:
# Standardize the three RFM columns so that no single feature dominates the
# distance computation used by the clustering step.
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm.loc[:, ['Recency', 'Frequency', 'Monetary']])


In [7]:
# Partition customers into three segments on the standardized RFM features.
# random_state pins the seed used for centroid initialisation.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(rfm_scaled)
rfm['cluster'] = kmeans.labels_

rfm.head()


Unnamed: 0,CustomerId,Recency,Frequency,Monetary,cluster
0,CustomerId_1,84,1,-10000.0,0
1,CustomerId_10,84,1,-10000.0,0
2,CustomerId_1001,90,5,20000.0,0
3,CustomerId_1002,26,11,4225.0,1
4,CustomerId_1003,12,6,20000.0,1


In [28]:

# Mean RFM profile of each cluster.
cluster_summary = rfm.groupby('cluster')[['Recency', 'Frequency', 'Monetary']].mean()

# The cluster whose customers transact least often on average is taken as the
# high-risk proxy.
# NOTE(review): only Frequency drives this choice; Recency and Monetary are not
# consulted - confirm this is the intended definition of "high risk".
high_risk_cluster = cluster_summary['Frequency'].idxmin()
print("High-risk cluster:", high_risk_cluster)

# Binary proxy target: 1 for members of the high-risk cluster, 0 otherwise.
rfm['is_high_risk'] = rfm['cluster'].eq(high_risk_cluster).astype(int)
rfm.loc[
    :, ['CustomerId', 'Recency', 'Frequency', 'Monetary', 'cluster', 'is_high_risk']
].head()


High-risk cluster: 0


Unnamed: 0,CustomerId,Recency,Frequency,Monetary,cluster,is_high_risk
0,CustomerId_1,84,1,-10000.0,0,1
1,CustomerId_10,84,1,-10000.0,0,1
2,CustomerId_1001,90,5,20000.0,0,1
3,CustomerId_1002,26,11,4225.0,1,0
4,CustomerId_1003,12,6,20000.0,1,0


In [32]:
# Attach the customer-level proxy target to every transaction row.
#
# This cell overwrites `df`, so it must be safe to run more than once. Any
# 'is_high_risk' column left over from a previous run is dropped first;
# otherwise pandas would suffix the clashing columns as 'is_high_risk_x' /
# 'is_high_risk_y' and the lookup below would raise KeyError.
df = df.drop(columns=['is_high_risk'], errors='ignore').merge(
    rfm[['CustomerId', 'is_high_risk']],
    on='CustomerId',
    how='left',
    validate='many_to_one',  # rfm must hold at most one row per CustomerId
)

# Verify
df['is_high_risk'].value_counts()

is_high_risk
0    84653
1    11009
Name: count, dtype: int64

In [9]:
# Cluster with the lowest mean Frequency.
# NOTE(review): this repeats the assignment already made in the labelling cell
# above and yields the same value; it looks like a leftover from an earlier
# out-of-order run - confirm whether this cell is still needed.
high_risk_cluster = cluster_summary.loc[:, 'Frequency'].idxmin()


In [25]:
# Sanity checks on the proxy target.
# Only the last expression of a cell is rendered automatically, so the class
# balance is passed to display() explicitly (IPython provides display() in
# notebook kernels); previously its result was silently discarded.
display(df['is_high_risk'].value_counts())

# Mean RFM profile of the high-risk (1) vs. remaining (0) customers.
rfm.groupby('is_high_risk')[['Recency', 'Frequency', 'Monetary']].mean()


Unnamed: 0_level_0,Recency,Frequency,Monetary
is_high_risk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,12.733592,36.551382,227162.752958
1,61.877279,7.720196,81720.679979


In [33]:
# Persist the transaction-level frame to CSV, leaving out the row index.
df.to_csv(path_or_buf='./data/final_training_data.csv', index=False)
print("Final dataset with proxy target saved ✅")

Final dataset with proxy target saved ✅
