<a href="https://colab.research.google.com/github/cbonnin88/The_Coffee_Corner/blob/main/customer_segmentation_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import datetime as dt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import plotly.express as px

In [5]:
# Read the orders, products and customers CSV files from the working
# directory into three DataFrames (same order as the names on the left).
orders, products, customers = map(
    pd.read_csv,
    (
        'coffee_shop - clean_orders.csv',
        'coffee_shop - products.csv',
        'coffee_shop - customers.csv',
    ),
)

In [6]:
# Function to clean currency strings
def clean_currency(x):
  """Convert a euro-formatted amount string to a float.

  Non-string inputs (already-numeric values, NaN, None) are returned
  unchanged so the function can be applied to a mixed column.

  For strings, the euro sign is removed, surrounding whitespace is trimmed,
  and a decimal comma is converted to a decimal point. When a string
  contains both ',' and '.', the right-most one is treated as the decimal
  mark and the other as a thousands separator (so both '1.234,56' and
  '1,234.56' parse to 1234.56).

  Raises:
    ValueError: if the remaining text is not a valid number.
  """
  if not isinstance(x, str):
    return x

  text = x
  # Remove the euro sign both in its proper form (U+20AC) and in the
  # mis-decoded form it takes when UTF-8 bytes are read as cp1252.
  for symbol in ('\u00e2\u201a\u00ac', '\u20ac'):
    text = text.replace(symbol, '')
  # Trim only after the symbol is gone, so '8,91 EUR-sign' leaves no stray space.
  text = text.strip()

  last_comma, last_dot = text.rfind(','), text.rfind('.')
  if last_comma != -1 and last_dot != -1:
    # Both separators present: drop the one acting as thousands separator.
    thousands_sep = '.' if last_comma > last_dot else ','
    text = text.replace(thousands_sep, '')

  return float(text.replace(',', '.'))

In [7]:
# Run every value of the money columns through clean_currency, which turns
# currency strings into floats and passes other values through unchanged.
for frame, money_col in ((orders, 'total_sale'), (products, 'profit')):
    frame[money_col] = frame[money_col].apply(clean_currency)

In [8]:
# Parse the order_date column into pandas datetime values (format is
# inferred by pandas; no explicit format is given).
orders['order_date'] = orders['order_date'].pipe(pd.to_datetime)

In [9]:
# Build one wide table: orders -> products -> customers.
# Step 1 joins on product_reference_number; columns present in both tables
#        get the '_ord' / '_prod' suffixes.
# Step 2 joins on customer_reference_number; no suffixes are passed, so any
#        overlapping columns get pandas' default '_x' / '_y' suffixes.
# Both joins use the default inner join.
# NOTE(review): an inner join drops rows without a match — confirm that is intended.
df_coffee = (
    orders
    .merge(products, on='product_reference_number', suffixes=('_ord', '_prod'))
    .merge(customers, on='customer_reference_number')
)

In [10]:
# 'country_x' is the left-hand side's country column from the customers merge
# (presumably the orders table's value — verify). Give it the plain name.
df_coffee = df_coffee.rename({'country_x': 'country'}, axis='columns')

In [11]:
# Sanity check: report the merged row count and preview a few key columns.
preview_cols = ['order_date', 'total_sale', 'country', 'coffee_type_ord']
row_count = len(df_coffee)

print("Data Loaded & Merged Successfully!")
print(f"Total Rows: {row_count}")
display(df_coffee[preview_cols].head())

Data Loaded & Merged Successfully!
Total Rows: 17


Unnamed: 0,order_date,total_sale,country,coffee_type_ord
0,2019-12-29,7.77,France,Ara
1,2020-06-07,8.91,United Kingdom,Exc
2,2019-03-04,22.89,United Kingdom,Rob
3,2019-06-17,9.51,France,Lib
4,2021-04-05,4.46,France,Exc


# **Preparing the Data**

In [12]:
# Reference date for the recency calculation: one day after the most recent
# order in the data, so the latest customer gets a recency of 1 rather than 0.
latest_order_date = orders['order_date'].max()
one_day = dt.timedelta(days=1)
current_date = latest_order_date + one_day

In [13]:
# Build one row per customer with the three RFM inputs.
def _days_since_last_order(order_dates):
    """Whole days between current_date and the latest date in order_dates."""
    return (current_date - order_dates.max()).days


rfm = (
    orders
    .groupby('customer_reference_number')
    .agg({
        'order_date': _days_since_last_order,  # Recency: days since last order
        'order_id': 'count',                   # Frequency: non-null order_id rows
        'total_sale': 'sum',                   # Monetary: total spend
    })
    .reset_index()
)

In [14]:
# Rename columns for clarity.
# The mapping is keyed by the existing column names rather than assigned by
# position, so it does not depend on column order and is safe to re-run:
# names that are already renamed (or extra columns added later, such as
# 'Cluster') are simply left untouched.
rfm = rfm.rename(columns={
    'customer_reference_number': 'customer_id',
    'order_date': 'Recency',
    'order_id': 'Frequency',
    'total_sale': 'Monetary',
})

print('--- Sample of RFM Data ---')
display(rfm.head())

--- Sample of RFM Data ---


Unnamed: 0,customer_id,Recency,Frequency,Monetary
0,1018,1,1,27.0
1,1132,349,1,28.53
2,1150,748,1,68.32
3,1375,48,1,8.92
4,1397,855,1,7.78


# **Preprocessing**

In [15]:
# K-Means is distance-based, so features measured on different scales
# (e.g. Recency in days vs. Frequency in order counts) would not contribute
# equally. Standardise the three RFM columns before clustering.
rfm_feature_cols = ['Recency', 'Frequency', 'Monetary']

scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[rfm_feature_cols])

# **K-Means Clustering**

In [16]:
# Ask the algorithm to find three distinct groups (clusters).
# n_init is pinned explicitly: its default changed between scikit-learn
# releases (10 -> 'auto'), so leaving it implicit can yield different cluster
# assignments on different installs even with the same random_state.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit on the standardised features and store each customer's cluster id.
rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)

# **Visualization**

In [17]:
# Cluster ids are labels, not quantities. Cast them to strings on a plotting
# copy so Plotly Express draws one discrete colour and legend entry per
# segment instead of a continuous colour bar. `rfm` itself keeps the integer
# column for the later groupby.
rfm_plot = rfm.assign(Cluster=rfm['Cluster'].astype(str))

fig = px.scatter_3d(
    rfm_plot,
    x='Recency',
    y='Frequency',
    z='Monetary',
    color='Cluster',
    # Keep the legend in cluster-id order regardless of row order.
    category_orders={'Cluster': sorted(rfm_plot['Cluster'].unique())},
    title='Customer Segments (RFM Analysis)',
    opacity=0.7,
)

fig.show()

# **Interpreting the Clusters**

In [19]:
# Profile each segment by the mean of its RFM values.
rfm_metric_cols = ['Recency', 'Frequency', 'Monetary']
cluster_means = rfm.groupby('Cluster')[rfm_metric_cols].mean()

print('\n--- Cluster Averages ---')
display(cluster_means)


--- Cluster Averages ---


Unnamed: 0_level_0,Recency,Frequency,Monetary
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,920.75,1.0,15.03375
1,742.8,1.0,57.158
2,104.0,1.0,27.25
