# User Badge Assignment Based on Activity

## Prepare Data
### Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.decomposition import PCA

# Load data
# Read the badge activity CSV into a DataFrame; later cells refer to it as `data`.
# NOTE(review): absolute path — presumably specific to the original environment;
# confirm or adjust before running elsewhere.
file_path = '/mnt/data/Badge_Data.csv'
data = pd.read_csv(file_path)

### Explore

In [None]:
# Quick structural overview of the DataFrame followed by summary statistics.
# describe() is the last expression of the cell, so its result is what the
# notebook renders as the cell output.
data.info()
data.describe()

### Visualizations

In [None]:
# Draw one histogram (with a KDE overlay) per activity feature on a 2x2 grid.
# Each entry pairs a column of `data` with the title shown above its panel;
# the order of the list is the order of the panels.
histogram_panels = [
    ('modules_completed', 'Distribution of Modules Completed'),
    # Column name is spelled exactly as it is used elsewhere in this notebook.
    ('modeule_frequency', 'Distribution of Module Frequency'),
    ('quizzes_completed', 'Distribution of Quizzes Completed'),
    ('quizzes_frequency', 'Distribution of Quizzes Frequency'),
]

plt.figure(figsize=(12, 6))
for position, (column, title) in enumerate(histogram_panels, start=1):
    plt.subplot(2, 2, position)
    sns.histplot(data[column], kde=True)
    plt.title(title)

plt.tight_layout()
plt.show()

## Build Model
### Scale Data

In [None]:
# Columns of `data` that are used as clustering features.
feature_columns = [
    'modules_completed',
    'modeule_frequency',
    'quizzes_completed',
    'quizzes_frequency',
]

# Fit the scaler on the feature columns and transform them in one step.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data[feature_columns])

### Features

In [None]:
# Feature matrix for clustering: the scaled activity columns produced above.
X = scaled_data

### Instantiate Model

In [None]:
# Four clusters (one per badge assigned later) with a fixed seed for repeatable runs.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto'
# (releases 1.2/1.3 emit a FutureWarning about it), so leaving it unset makes the
# resulting clusters depend on the installed version.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)

### Train Model

In [None]:
# Fit the k-means model on the scaled feature matrix; the per-row cluster
# assignments are read from `kmeans.labels_` in the evaluation cell.
kmeans.fit(X)

### Evaluate Model

In [None]:
# Cluster assignment for each row of X, taken from the fitted model.
labels = kmeans.labels_
# Internal clustering-quality metrics computed from the scaled features and the
# labels only (no ground truth involved). Per the scikit-learn docs, a higher
# silhouette score and a lower Davies-Bouldin score indicate better separation.
sil_score = silhouette_score(X, labels)
dbs_score = davies_bouldin_score(X, labels)

print(f'Silhouette Score: {sil_score}')
print(f'Davies-Bouldin Score: {dbs_score}')

### PCA Dimensionality Reduction

In [None]:
# Project the scaled features onto two principal components; the result is
# used as the x/y coordinates of the cluster scatter plot below.
pca = PCA(n_components=2)
pca_components = pca.fit_transform(X)

### Visualise Clusters

In [None]:
# Scatter the two PCA components, colouring every point by its cluster label,
# and attach a colorbar keyed to those labels.
fig, ax = plt.subplots(figsize=(10, 6))
cluster_points = ax.scatter(
    pca_components[:, 0],
    pca_components[:, 1],
    c=labels,
    cmap='viridis',
)
ax.set_title('Clusters Visualization with PCA')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
fig.colorbar(cluster_points, ax=ax)
plt.show()

## Final Table

In [None]:
# Map each cluster id to a badge name and add it to the original DataFrame
# as a new `badge` column.
# NOTE(review): k-means cluster ids carry no inherent order, so confirm (e.g. by
# inspecting per-cluster feature means) that cluster 0 really is the most active
# group before relying on this id -> badge assignment.
badges = {0: 'Platinum', 1: 'Gold', 2: 'Silver', 3: 'Bronze'}
# `labels` comes from `kmeans.labels_`, a NumPy array, which has no `.map`
# method; wrap it in a Series that shares `data`'s index so the lookup works
# and the values line up row by row.
data['badge'] = pd.Series(labels, index=data.index).map(badges)

# Preview the first rows, now including the badge column.
data.head()

## Conclusion


Observation:
- The clustering algorithm grouped users based on their activity in terms of module completion, module frequency, quizzes completed, and quizzes frequency.
- The Silhouette Score (range −1 to 1, higher is better) and the Davies-Bouldin Score (lower is better) quantify how compact and well separated the resulting clusters are.

Challenges:
- Choosing the optimal number of clusters was challenging and may require further tuning, for example by comparing the Silhouette Score or an elbow plot across several values of k.
- K-means is distance-based and therefore sensitive to the scale of the features, so standardizing them before clustering was crucial.

Further Work:
- Experiment with different clustering algorithms such as DBSCAN or hierarchical clustering.
- Analyze the clusters further to understand user behaviors and improve the badge assignment system.
