In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Load the two input tables from CSV files in the current working directory.
# item_to_id is later joined on its 'Item_id' / 'Item_name' columns;
# purchase_history is later read through its 'user_id' and 'id' columns.
item_to_id = pd.read_csv('item_to_id.csv')
purchase_history = pd.read_csv('purchase_history.csv')


In [None]:
# Preview three random purchase rows; the fixed seed keeps the displayed
# rows identical across "Restart Kernel -> Run All" executions.
purchase_history.sample(3, random_state=42)

In [None]:
# Use the DataFrame's row index as a per-row purchase identifier so that
# individual purchases can be grouped again after the item lists are exploded.
purchase_history['purchase_id'] = purchase_history.index

In [None]:
# Summarise columns, dtypes and non-null counts of the purchase table.
purchase_history.info()

In [None]:
# Preview three random rows of the item lookup table; the fixed seed keeps
# the displayed rows identical across "Restart Kernel -> Run All" executions.
item_to_id.sample(3, random_state=42)

In [None]:
# Summarise columns, dtypes and non-null counts of the item lookup table.
item_to_id.info()

# 1. Identify the customer who bought the most items overall


In [None]:
# Parse each comma-separated 'id' string into a Python list of integer item IDs
purchase_history['item_list'] = purchase_history['id'].map(
    lambda raw_ids: [int(token) for token in raw_ids.split(',')]
)

In [None]:
# Basket size: the number of item IDs listed in each purchase row
purchase_history['num_items'] = purchase_history['item_list'].map(len)

In [None]:
# Long format: one row per (purchase row, item). Explode the item lists, keep
# only the user/item pair, and drop rows with a missing value in either column.
user_item_matrix = (
    purchase_history
    .explode('item_list')[['user_id', 'item_list']]
    .dropna()
)

In [None]:
# One-hot encode the purchased item of every exploded row, indexed by user_id.
# The empty prefix/separator leaves the bare item ID as the column label.
items_by_user = user_item_matrix.set_index('user_id')['item_list']
user_item_matrix = pd.get_dummies(items_by_user, prefix='', prefix_sep='')

In [None]:
# Cast the indicator columns to integers so they can be summed per user below.
user_item_matrix = user_item_matrix.astype(int)  # Convert booleans to 0s and 1s


In [None]:
# Collapse the one-row-per-item indicators into one row per user_id; each cell
# becomes the number of exploded rows in which that user has that item.
user_item_matrix = user_item_matrix.groupby('user_id').sum()

In [None]:
# Total items per user (row sums), largest first; show the top three users.
(
    user_item_matrix
    .sum(axis=1)
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
    .head(3)
)

In [None]:
# Display the user x item count matrix (rows: user_id, columns: item IDs).
user_item_matrix

# 2. For each item, find the customer who bought it the most


In [None]:
# For every item column, find the user_id whose row holds the largest count.
# The column labels are cast to int so they can be joined against Item_id.
top_buyers = (
    user_item_matrix
    .idxmax()
    .reset_index(name='user_id')
    .astype({"index": "int"})
)
# Attach the item names and show item id, item name and top buyer.
top_buyers.merge(item_to_id, left_on='index', right_on='Item_id')[['Item_id', 'Item_name', 'user_id']]

# Item Clustering Based on Co-purchase History

1. Build a co-occurrence matrix where each cell indicates how often two items were bought together.
2. Use a clustering algorithm (e.g., K-means or hierarchical clustering) to group items with similar co-purchase patterns.

In [None]:
# Rebuild the indicator matrix at purchase level: explode the item lists,
# index the rows by purchase_id, and one-hot encode the item of each row.
# NOTE: this reuses the name user_item_matrix, which from here on holds
# purchase-level (not user-level) rows.
purchase_items = purchase_history.explode('item_list').set_index('purchase_id')['item_list']
user_item_matrix = pd.get_dummies(purchase_items, prefix='', prefix_sep='').astype(int)

In [None]:
# Collapse the exploded rows back into one row per purchase_id; each cell is
# the number of times the item appears in that purchase.
user_item_matrix = user_item_matrix.groupby('purchase_id').sum()

In [None]:
# Display the purchase x item count matrix (rows: purchase_id, columns: item IDs).
user_item_matrix

In [None]:
# Item x item co-occurrence matrix: the product of the transposed
# purchase x item matrix with itself. Off-diagonal cells accumulate, over all
# purchases, the product of the two items' per-purchase counts; the diagonal
# holds each item's sum of squared per-purchase counts.
co_purchase_matrix = user_item_matrix.T @ user_item_matrix

# Determine the best number of clusters


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA


In [None]:
# Determine the best number of clusters: fit KMeans for every candidate k and
# record the inertia (elbow method) and the silhouette coefficient.
clusters = range(2, 30)
inertias = []
silhouettes = []

for n_clusters in clusters:
    # n_init=10 reruns KMeans with ten different centroid seeds and keeps the
    # best run; random_state makes the sweep reproducible.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, init='k-means++', n_init=10)
    kmeans = kmeans.fit(co_purchase_matrix)
    label = kmeans.predict(co_purchase_matrix)

    inertias.append(kmeans.inertia_)
    silhouettes.append(silhouette_score(co_purchase_matrix, label))


# visualization
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
ax[0].plot(clusters, inertias, 'o-', label='Sum of Squared Distances')
ax[0].set_title('Elbow Method')
ax[0].set_ylabel('Inertia')
ax[1].plot(clusters, silhouettes, 'o-', label='Silhouette Coefficient', color='orange')
ax[1].set_title('Silhouette Scores')
ax[1].set_ylabel('Silhouette coefficient')
# plt.legend() would only decorate the current (last) axes, so the legend,
# grid and x-label are applied to each panel explicitly.
for panel in ax:
    panel.set_xlabel('Number of clusters')
    panel.grid(True)
    panel.legend(fontsize=12)
plt.tight_layout()
plt.show()

- Elbow Method
    - The inertia decreases steadily as the number of clusters increases, without a clear "elbow point." This suggests that the data might not have strong natural clusters or that a larger number of clusters could be appropriate.
- Silhouette Scores
    - The silhouette scores indicate how well clusters are defined.
    - A higher score suggests better-defined clusters. The scores for most cluster numbers are relatively low, which may indicate weak separation between clusters.

Based on these plots, a smaller number of clusters (e.g., 4 or 5) may provide better segmentation, as the silhouette scores start to decline with higher cluster counts.

In [None]:
# Apply KMeans clustering to the co-occurrence matrix (one row per item)
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)  # Let's start with 5 clusters
kmeans.fit(co_purchase_matrix)

# Assign cluster labels to each item.
# kmeans.labels_ follows the row order of co_purchase_matrix, which is not
# guaranteed to match the row order of item_to_id, so the labels are attached
# by item id rather than by position. The matrix labels are converted with
# int() because the one-hot column names may be strings.
item_cluster_labels = pd.Series(
    kmeans.labels_,
    index=co_purchase_matrix.index.map(int),
)
item_to_id['cluster'] = item_to_id['Item_id'].map(item_cluster_labels)


In [None]:
# Number of items assigned to each cluster, largest cluster first.
item_to_id.cluster.value_counts()

In [None]:
# Show the first rows of the clustering result: each item ID next to the
# cluster label stored in item_to_id['cluster'].
item_to_id[['Item_id', 'cluster']].head()


## Analyze purchase trends across all clusters.

In [None]:
# Combine item cluster labels with the purchase-item matrix
user_item_with_clusters = user_item_matrix.copy()
user_item_with_clusters.columns = [int(col) for col in user_item_with_clusters.columns]  # Ensure columns are integers
item_cluster_mapping = item_to_id.set_index('Item_id')['cluster'].to_dict()

# Cluster label of each item column, in column order
item_clusters = pd.DataFrame({
    'item': user_item_with_clusters.columns,
    'cluster': [item_cluster_mapping[item] for item in user_item_with_clusters.columns]
})

# Group items by their clusters and sum purchases per cluster.
# groupby aligns a Series key on its index labels, so the key must be indexed
# by item id. Passing item_clusters['cluster'] directly (RangeIndex 0..n-1)
# would match item ids against row positions and mis-assign or drop items.
cluster_by_item = item_clusters.set_index('item')['cluster']
cluster_purchase_trends = user_item_with_clusters.T.groupby(cluster_by_item).sum().T

# Total purchases per cluster across all purchase rows
cluster_summary = cluster_purchase_trends.sum(axis=0).reset_index()
cluster_summary.columns = ['cluster', 'total_purchases']

In [None]:
# Display total purchases per cluster.
cluster_summary

In [None]:
# Reuse KMeans to cluster items (same parameters and seed as the fit above)
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
clusters = kmeans.fit_predict(co_purchase_matrix)

# Visualize the clusters using PCA for dimensionality reduction
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(co_purchase_matrix)

# Row i of `clusters` and `reduced_data` corresponds to row i of
# co_purchase_matrix, so item names are looked up through that index instead
# of relying on the row order of item_to_id. Unknown ids fall back to the id.
item_name_by_id = item_to_id.set_index('Item_id')['Item_name'].to_dict()
point_names = np.array(
    [item_name_by_id.get(int(item_id), str(item_id)) for item_id in co_purchase_matrix.index],
    dtype=object,
)

# Plot the clusters
plt.figure(figsize=(15, 15), dpi=200)
for cluster_label in range(kmeans.n_clusters):
    in_cluster = clusters == cluster_label
    cluster_points = reduced_data[in_cluster]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f"Cluster {cluster_label}")
    # Add text labels for items in the current cluster
    for (x, y), item_name in zip(cluster_points, point_names[in_cluster]):
        plt.text(x, y, item_name, fontsize=8, ha='right', va='bottom')

plt.title("Item Clusters Based on Co-purchase History")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend()
plt.show()


In [None]:
# Calculate the total purchases per item (column sums of the purchase-item matrix)
item_purchases = user_item_with_clusters.sum(axis=0)

# Add item purchase counts and cluster information
item_purchases_df = pd.DataFrame(
    {
        "item": item_purchases.index,
        "total_purchases": item_purchases.values,
        "cluster": [item_cluster_mapping[item] for item in item_purchases.index],
    }
)

# Item id -> item name lookup, built once instead of inside the assign lambda
item_name_lookup = item_to_id.set_index("Item_id")["Item_name"].to_dict()

# Find top items in each cluster by sorting within clusters
top_items_per_cluster = (
    item_purchases_df.assign(Item_name=lambda x: x["item"].map(item_name_lookup))
    .sort_values(["cluster", "total_purchases"], ascending=[True, False])
    .groupby("cluster")
    .head(5)  # Top 5 items in each cluster
    .reset_index(drop=True)
)


# Visualizing the top items in each cluster: one group of horizontal bars per cluster
clusters = top_items_per_cluster['cluster'].unique()
plt.figure(figsize=(12, 8))

for cluster in clusters:
    cluster_data = top_items_per_cluster[top_items_per_cluster['cluster'] == cluster]
    plt.barh(cluster_data['Item_name'], cluster_data['total_purchases'], label=f"Cluster {cluster}")

plt.title("Top Items in Each Cluster Based on Purchases")
# Label both axes so the figure can be read on its own.
plt.xlabel("Total Purchases")
plt.ylabel("Item")
plt.legend()
plt.tight_layout()
plt.show()