# Clustering of user features


# Preprocessing

In [None]:
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw user-feature table from the CSV file in the working directory.
df = pd.read_csv('user_features.csv')
# Bare expression: rendered by the notebook as this cell's output.
df

In [None]:
# Re-read the file so the cell can be re-run safely, then discard the
# 'user_id' column before any further processing.
df = pd.read_csv('user_features.csv')
df = df.drop(columns=['user_id'])

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Print the column dtypes and non-null counts of the loaded table.
df.info()

In [None]:
# Flag columns that are converted to 64-bit integers.
binary_cols = ['married', 'has_children']
df = df.astype(dict.fromkeys(binary_cols, 'int64'))

In [None]:
# Display the table after the integer conversion.
df

# Groups found in EDA of user_features

In [None]:
# Cancelled group: independent copy of every row whose 'canceled_trips' is non-zero.
df_cancelled = df.loc[df['canceled_trips'].ne(0)].copy()

In [None]:
# Display the cancelled-trips group.
df_cancelled

In [None]:
# Share of users with at least one cancelled trip.
# At this point `df` still contains the cancelled users (they are only
# filtered out in a later cell), so `len(df)` is already the full population.
# Adding `len(df_cancelled)` to the denominator would count them twice and
# understate the percentage.
percentage_cancelled = (len(df_cancelled) / len(df)) * 100

# Print the percentage
print(f"Percentage of users who have cancelled trips: {percentage_cancelled:.2f}%")

In [None]:
# Continue with the users that never cancelled; the column is constant (0)
# for them afterwards, so it is removed.
df = df.loc[df['canceled_trips'].eq(0)].drop(columns=['canceled_trips'])

In [None]:
# Age-18 group: independent copy of the rows whose 'age' equals 18.
df_abi = df.loc[df['age'].eq(18)].copy()
df_abi

In [None]:
# Share of 18-year-old users among the users currently held in `df`.
# `df` still includes the 18-year-olds here (they are removed in a later
# cell), so it is the complete denominator. Adding `len(df_abi)` again would
# double-count that group and understate the percentage.
percentage_18 = (len(df_abi) / len(df)) * 100

# Print the percentage
print(f"Percentage of users who are 18 years old: {percentage_18:.2f}%")

In [None]:
# Remove the age-18 group from the working table.
df = df.loc[df['age'].ne(18)].copy()

In [None]:
# Hotel-discount group: rows with a non-zero 'hotel_discount_amount'.
df_discounted_hotel = df.loc[df['hotel_discount_amount'].ne(0)].copy()
df_discounted_hotel


In [None]:
# Complement of the hotel-discount group: 'hotel_discount_amount' is exactly 0.
df_not_discounted_hotel = df.loc[df['hotel_discount_amount'].eq(0)].copy()
df_not_discounted_hotel

In [None]:
# Count users per 'amount_of_trips' value in each hotel-discount group,
# leaving out users with zero trips, ordered by trip count.
discounted_trips_counts_hotel = (
    df_discounted_hotel.loc[df_discounted_hotel['amount_of_trips'] != 0, 'amount_of_trips']
    .value_counts()
    .sort_index()
)
not_discounted_trips_counts = (
    df_not_discounted_hotel.loc[df_not_discounted_hotel['amount_of_trips'] != 0, 'amount_of_trips']
    .value_counts()
    .sort_index()
)

# Grouped bar chart: the two groups are drawn half a bar width to the left and
# right of each trip value so they sit side by side.
plt.figure(figsize=(10, 6))
width = 0.35  # Width of the bars

plt.bar(discounted_trips_counts_hotel.index - width/2, discounted_trips_counts_hotel.values, width, label='Discounted Hotel')
plt.bar(not_discounted_trips_counts.index + width/2, not_discounted_trips_counts.values, width, label='Not Discounted Hotel')

# Customize chart
plt.xlabel('Amount of Trips')
plt.ylabel('Number of Users')
plt.title('Discounted vs. Not Discounted Hotel')
# Put a tick on every trip value that occurs in EITHER group. Taking the ticks
# from the discounted group alone leaves bars that exist only in the
# not-discounted group without a label.
plt.xticks(sorted(set(discounted_trips_counts_hotel.index) | set(not_discounted_trips_counts.index)))
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Flight-discount group: rows with a non-zero 'flight_discount_amount'.
df_discounted_flight = df.loc[df['flight_discount_amount'].ne(0)].copy()
df_discounted_flight

In [None]:
# Complement of the flight-discount group: 'flight_discount_amount' is exactly 0.
df_not_discounted_flight = df.loc[df['flight_discount_amount'].eq(0)].copy()
df_not_discounted_flight

In [None]:
# Count users per 'amount_of_trips' value in each flight-discount group,
# leaving out users with zero trips, ordered by trip count.
discounted_trips_counts_flight = (
    df_discounted_flight.loc[df_discounted_flight['amount_of_trips'] != 0, 'amount_of_trips']
    .value_counts()
    .sort_index()
)
not_discounted_trips_counts_flight = (
    df_not_discounted_flight.loc[df_not_discounted_flight['amount_of_trips'] != 0, 'amount_of_trips']
    .value_counts()
    .sort_index()
)

# Grouped bar chart: the two groups are drawn half a bar width to the left and
# right of each trip value so they sit side by side.
plt.figure(figsize=(10, 6))
width = 0.35  # Width of the bars

plt.bar(discounted_trips_counts_flight.index - width/2, discounted_trips_counts_flight.values, width, label='Discounted Flight')
plt.bar(not_discounted_trips_counts_flight.index + width/2, not_discounted_trips_counts_flight.values, width, label='Not Discounted Flight')

# Customize chart
plt.xlabel('Amount of Trips')
plt.ylabel('Number of Users')
plt.title('Discounted vs. Not Discounted Flight')
# Put a tick on every trip value that occurs in EITHER group. Taking the ticks
# from the discounted group alone leaves bars that exist only in the
# not-discounted group without a label.
plt.xticks(sorted(set(discounted_trips_counts_flight.index) | set(not_discounted_trips_counts_flight.index)))
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Display the working table before the discount/tenure columns are dropped.
df

In [None]:
# Remove the columns that are not passed on to scaling and clustering.
df = df.drop(
    [
        'hotel_discount_amount',
        'flight_discount_amount',
        'tenure_months',
        'session_duration_invalid_avg_sec',
    ],
    axis='columns',
)



In [None]:
# Display the final feature table that is passed to the scaler.
df

# Scaling

In [None]:
# Learn the per-column scaling parameters on df, then apply them to the same
# data; the fitted scaler stays available as `scaler`.
scaler = StandardScaler().fit(df)
df_scaled = scaler.transform(df)
df_scaled

## PCA

In [None]:
# PCA
# No n_components is passed, so the PCA is fitted without an explicit limit
# on the number of components; the reduction to a subset happens in a later cell.
pca = PCA()
# Fit on the standardized features and project them onto the components.
pca_result = pca.fit_transform(df_scaled)
pca_result

In [None]:
# Fraction of the total variance captured by each principal component.
pca.explained_variance_ratio_

In [None]:
# Running total of the explained-variance ratios, plotted against the number
# of components (1-based) to see how many components are worth keeping.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(8, 5))
plt.plot(
    range(1, len(cumulative_variance) + 1),
    cumulative_variance,
    color='green',
    linestyle='--',
    marker='o',
)
plt.gca().set(
    title='Cumulative Explained Variance by PCA Components',
    xlabel='Number of Components',
    ylabel='Cumulative Explained Variance',
)
plt.grid(True)
plt.show()


In [None]:
# only first 8 components are needed
# NOTE(review): the cut-off of 8 is hard-coded; presumably it was read off the
# cumulative-variance plot — confirm if the input features change.
# Overwrites pca_result in place with its first 8 columns.
pca_result = pca_result[:, :8]
pca_result

# KMeans

In [None]:
# Clustering input: the reduced PCA projection.
X = pca_result # df_scaled can also be used

In [None]:
# Alias for the standardized (non-PCA) feature matrix.
# NOTE(review): not referenced by any other cell visible in this notebook.
X_df_scaled = df_scaled

In [None]:
# Calculate Within-Cluster-Sum-of-Squares (WCSS) for different k values.
# random_state and n_init are fixed to the same values as the final model so
# the elbow curve is reproducible between runs and comparable with that model.
# Each model is created inside the comprehension, so re-running this cell
# never rebinds the notebook-level `kmeans` variable.
k_values = range(1, 8)
wcss = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X).inertia_
    for k in k_values
]

# Plot the elbow curve
plt.plot(k_values, wcss, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.title('Elbow Method')
plt.show()

In [None]:
# Final model: 4 clusters with a fixed seed and 10 initialisations.
# (Original inline note on this line: "7" — presumably an alternative k that
# was tried; confirm.)
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10).fit(X)
kmeans

In [None]:
# Cluster index assigned to each row of X by the fitted model.
kmeans.labels_

In [None]:
# Imported in this cell rather than in the import cell at the top of the notebook.
from sklearn.metrics import silhouette_score
# Silhouette score of the KMeans labels, computed on the same matrix X the
# model was fitted on.
silhouette = silhouette_score(X, kmeans.labels_)

print(f"The average silhouette score is: {silhouette}")

In [None]:
# Attach the cluster label to each user row. The labels are assigned by
# position: df is the table that was scaled and projected to build X, so row i
# of df corresponds to label i.
df['cluster'] = kmeans.labels_
df

In [None]:
# Print dtypes and non-null counts again, now including the 'cluster' column.
df.info()

In [None]:
# Number of users in each cluster.
df['cluster'].value_counts()

In [None]:
# Mean of every feature column within each cluster.
df.groupby('cluster').mean()

In [None]:
# Column means over all rows, as a baseline for the per-cluster means.
# Note that the result also includes the mean of the 'cluster' label column.
df.mean()

In [None]:
# How often each 'canceled_trips' value occurs within the cancelled group
# that was split off before clustering.
df_cancelled['canceled_trips'].value_counts()

In [None]:
# Get the list of columns (dimensions) to visualize, excluding the cluster
# label itself and the columns named below.
columns_to_visualize = [col for col in df.columns
                        if col not in ['cluster', 'gender', 'married', 'has_children', 'age','tenure_months','flight_discount_amount','hotel_discount_amount']]

# Grid layout: three panels per row, ceil(num_cols / 3) rows. At least one row
# is requested because plt.subplots cannot build a grid with zero rows.
num_cols = len(columns_to_visualize)
num_rows = max(1, (num_cols + 2) // 3)

# squeeze=False keeps `axes` two-dimensional even when there is only one row;
# without it a single-row grid comes back as a 1-D array and 2-D indexing fails.
fig, axes = plt.subplots(num_rows, 3, figsize=(15, num_rows * 5), squeeze=False)
panels = axes.ravel()  # row-major order, same as the row/column arithmetic it replaces

# One panel per column
for ax, col in zip(panels, columns_to_visualize):
    if df[col].dtype in ['int64', 'float64']:
        # Numeric feature: one box per cluster, outliers hidden.
        sns.boxplot(data=df, x='cluster', y=col, showfliers=False, ax=ax)
        ax.set_xlabel("Cluster")
        ax.set_ylabel(col)
    else:
        # Any other dtype: counts per category, coloured by cluster. Here the
        # x axis carries the feature's categories, not the cluster.
        sns.countplot(data=df, x=col, hue='cluster', ax=ax)
        ax.set_xlabel(col)
        ax.set_ylabel("Count")

    ax.set_title(f"Distribution of '{col}' across Clusters")

# Hide any unused subplots in the last row
for ax in panels[num_cols:]:
    fig.delaxes(ax)

# Adjust layout to prevent overlapping
plt.tight_layout()
plt.show()



## 🧠 Cluster-Based Customer Segmentation Analysis & Suggested Perks

Based on qualitative cluster profiling using behavioral, demographic, and engagement features, we recommend the following perks tailored to each segment:

---

### 🔹 Cluster 0: Family-Oriented Frequent Flyers
- **Hotel Count**: High / Very High  
- **Flight Count**: High  
- **Checked Bags / Seats Avg**: High  
- **Family**: Many  
- **Married**: Most  
- **Session Time**: High / Very High  

🎁 **Suggested Perk**: *Free Checked Bag + Kids Stay Free*  
> These customers travel frequently with family. Practical perks that reduce family travel cost will boost loyalty and satisfaction.

---

### 🔹 Cluster 1: Low-Engagement Budget Users
- **Hotel & Flight Price**: Very Low  
- **Trip Count**: Very Low / Low  
- **Family**: Few  
- **Session Time**: Very Low  
- **Nights Avg**: Low / Very Low  

🎁 **Suggested Perk**: *Exclusive Discounts on First Booking*  
> These users are price-conscious and not very active. Activate this group with limited-time or first-time user discounts.

---

### 🔹 Cluster 2: High-End Hybrid Travelers
- **Hotel & Flight Price**: Very High  
- **Seats / Trip Count**: High  
- **Family**: Some  
- **Session Time**: Medium / High  

🎁 **Suggested Perk**: *1 Night Free Hotel with Flight*  
> These are high-spending, consistent travelers. A premium bundled offer encourages retention and upsell.

---

### 🔹 Cluster 3: Business or Power Travelers
- **Checked Bags / Seats**: High  
- **Family / Married**: Few / Very Few  
- **Session Time**: Medium / High  
- **Hotel Nights**: Medium  

🎁 **Suggested Perk**: *Free Hotel Meal or Express Check-In*  
> Likely business travelers or frequent solo flyers. Offer perks that add comfort or convenience.

---

### ✅ Summary Table

| Cluster | Traveler Type                   | Suggested Perk                                 |
|---------|---------------------------------|------------------------------------------------|
| 0       | Family-Oriented Frequent Flyers | Free Checked Bag + Kids Stay Free              |
| 1       | Low-Engagement Budget Users     | Exclusive Discounts on First Booking           |
| 2       | High-End Hybrid Travelers       | 1 Night Free Hotel with Flight                 |
| 3       | Business / Power Travelers      | Free Hotel Meal or Express Check-In            |


# Analyze the clusters


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Plot every feature against the cluster label. The 'cluster' column itself is
# skipped: a box plot of the label grouped by the label carries no information.
feature_columns = [col for col in df.columns if col != 'cluster']

# Three panels per row, ceil(num_cols / 3) rows (at least one, because
# plt.subplots cannot build a grid with zero rows).
num_cols = len(feature_columns)
num_rows = max(1, (num_cols + 2) // 3)

# squeeze=False keeps `axes` two-dimensional even for a single-row grid.
fig, axes = plt.subplots(num_rows, 3, figsize=(15, num_rows * 5), squeeze=False)
panels = axes.ravel()

for ax, col in zip(panels, feature_columns):
    # The cluster goes on the categorical x axis (as in the earlier
    # distribution grid), so each cluster gets its own labelled box.
    sns.boxplot(data=df, x='cluster', y=col, ax=ax)
    ax.set_title(col)  # Set title for each

# Remove the empty panels left over in the last row.
for ax in panels[num_cols:]:
    fig.delaxes(ax)

plt.tight_layout()  # Adjust layout to prevent overlapping
plt.show()



In [None]:
# One full-size box plot per feature, grouped by cluster.
for col in df.columns:
    if col == 'cluster':
        # Plotting the label against itself is meaningless; skip it.
        continue
    # Cluster on the categorical x axis so every cluster gets a labelled box.
    sns.boxplot(data=df, x='cluster', y=col)
    plt.show()
    print('\n')

In [None]:
# Features that are turned into qualitative labels below.
qualitative_cols = [
    "hotel_price_avg",
    "flight_price_avg",
    "hotel_count",
    "flight_count",
    "has_children",
    "session_duration_avg_sec",
    "married",
    "age",
    "amount_of_trips",
    "seats_avg",
    "checked_bags_avg",
    "nights_avg",
]

# One row per cluster holding the mean of each selected feature.
cluster_summary = df.groupby("cluster")[qualitative_cols].agg("mean")

def label_quantiles(series, labels):
    """
    Translate each value of ``series`` into one of five ``labels``.

    The 20th, 40th, 60th and 80th percentiles of ``series`` serve as
    inclusive upper bounds for ``labels[0]`` .. ``labels[3]``; a value that
    exceeds all four bounds (or fails every comparison) gets ``labels[4]``.
    Returns a Series of labels carrying the index of ``series``.
    """
    cutoffs = series.quantile([0.2, 0.4, 0.6, 0.8]).values

    def pick(value):
        # Walk the bounds from lowest to highest; the first one the value
        # does not exceed decides the label.
        for bound, name in zip(cutoffs, labels):
            if value <= bound:
                return name
        return labels[4]

    return series.apply(pick)

# Two label scales: a magnitude scale for ordinary features and a
# "how many" scale for the 0/1-style features.
label_map = {
    "default": ["Very Low", "Low", "Medium", "High", "Very High"],
    "binary_like": ["Very Few", "Few", "Some", "Many", "Most"]
}

# Label every column of the cluster summary with the scale that fits it.
qualitative_summary = pd.DataFrame(index=cluster_summary.index)
binary_like_columns = ["has_children", "married"]
for col in cluster_summary.columns:
    qualitative_summary[col] = label_quantiles(
        cluster_summary[col],
        label_map["binary_like" if col in binary_like_columns else "default"],
    )

# Presentation names; keys that are not columns of the summary are ignored.
qualitative_summary = qualitative_summary.rename(columns={
    "has_children": "Family",
    "canceled_trips": "Cancellations",
    "session_duration_avg_sec": "Session Time",
    "hotel_price_avg": "Hotel Price",
    "flight_price_avg": "Flight Price",
    "hotel_count": "Hotel Count",
    "flight_count": "Flight Count"
})

# Show the final qualitative cluster summary as the cell output.
display(qualitative_summary)

# PCA - visualization


In [None]:
# Interactive 3-D scatter of the first three principal components,
# coloured by the KMeans cluster label.
fig = px.scatter_3d(
    pca_result[:, :3],
    x=0,
    y=1,
    z=2,
    color=kmeans.labels_,
)
# Small markers (size 0.5) for every trace.
fig.update_traces(marker={"size": 0.5})
fig.show()

In [None]:
import os

# Make sure the output folder exists before exporting; writing into a missing
# directory fails. exist_ok=True makes this a no-op when the folder is
# already there.
os.makedirs("plots", exist_ok=True)
# Export the interactive 3-D figure as an HTML file.
fig.write_html("plots/plot.html")