In [None]:
# Additional imports needed
from sklearn.preprocessing import StandardScaler
from kneed import KneeLocator

# Build one engagement record per customer (MSISDN). Named aggregation sets
# the output column names directly, so no positional rename is needed.
user_engagement = (
    xdr_data.groupby('MSISDN')
    .agg(
        Session_Frequency=('Session_ID', 'count'),   # non-null session IDs
        Total_Duration=('Duration', 'sum'),          # summed session duration
        Download_Bytes=('Total_DL_Bytes', 'sum'),    # summed download volume
        Upload_Bytes=('Total_UL_Bytes', 'sum'),      # summed upload volume
    )
    .reset_index()
)

# Total traffic is the customer's download volume plus upload volume.
user_engagement['Total_Traffic'] = (
    user_engagement['Download_Bytes'] + user_engagement['Upload_Bytes']
)

# Task 2.1.1: show the ten highest-ranked customers for each engagement metric.
# Each entry pairs the heading to print with the column to rank by.
top10_sections = [
    ("Top 10 Customers by Session Frequency:", 'Session_Frequency'),
    ("\nTop 10 Customers by Duration:", 'Total_Duration'),
    ("\nTop 10 Customers by Total Traffic:", 'Total_Traffic'),
]

for heading, metric in top10_sections:
    print(heading)
    ranked = user_engagement.nlargest(10, metric)
    display(ranked[['MSISDN', metric]])

# Prepare data for clustering: the three engagement metrics used as features.
features_for_clustering = ['Session_Frequency', 'Total_Duration', 'Total_Traffic']
X = user_engagement[features_for_clustering]

# Standardize the features with StandardScaler before clustering. The scaled
# values are wrapped back into a DataFrame that keeps X's index, so rows stay
# aligned with `user_engagement` even if its index is not a default RangeIndex.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=features_for_clustering,
    index=X.index,
)

# Perform k-means clustering with k=3.
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps results reproducible and avoids the FutureWarning.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
user_engagement['Cluster'] = kmeans.fit_predict(X_scaled)

# Summarize every engagement metric within each cluster using the same set of
# statistics (min, max, mean, sum).
summary_functions = ['min', 'max', 'mean', 'sum']
summarized_metrics = ['Session_Frequency', 'Total_Duration', 'Total_Traffic']
aggregation_spec = {metric: summary_functions for metric in summarized_metrics}

cluster_stats = user_engagement.groupby('Cluster').agg(aggregation_spec)

# Show the per-cluster summary table.
print("\nCluster Statistics:")
display(cluster_stats)

# Visualize cluster characteristics: one scatter panel per pair of engagement
# metrics, with points colored by cluster label.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# (x column, y column, panel title) for each of the three panels, left to right.
scatter_panels = [
    ('Session_Frequency', 'Total_Duration',
     'Clusters: Session Frequency vs Duration'),
    ('Session_Frequency', 'Total_Traffic',
     'Clusters: Session Frequency vs Total Traffic'),
    ('Total_Duration', 'Total_Traffic',
     'Clusters: Duration vs Total Traffic'),
]

for ax, (x_column, y_column, panel_title) in zip(axes, scatter_panels):
    sns.scatterplot(
        data=user_engagement,
        x=x_column,
        y=y_column,
        hue='Cluster',
        palette='viridis',
        ax=ax,
    )
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

# Top 10 users per application.
# First sum download and upload volume for every (customer, application) pair.
per_user_app_sums = dict.fromkeys(['Total_DL_Bytes', 'Total_UL_Bytes'], 'sum')
app_user_traffic = (
    xdr_data.groupby(['MSISDN', 'Application'])
    .agg(per_user_app_sums)
    .reset_index()
)

app_user_traffic['Total_Traffic'] = (
    app_user_traffic['Total_DL_Bytes'] + app_user_traffic['Total_UL_Bytes']
)

# Sort all rows by traffic (largest first), then keep the first ten rows of
# each application. The result stays in global traffic order, so rows of
# different applications are interleaved.
ranked_by_traffic = app_user_traffic.sort_values('Total_Traffic', ascending=False)
top_users_per_app = ranked_by_traffic.groupby('Application').head(10)

print("\nTop 10 users per application:")
display(top_users_per_app)

# Top 3 most used applications, ranked by combined download + upload volume.
per_app_sums = dict.fromkeys(['Total_DL_Bytes', 'Total_UL_Bytes'], 'sum')
top_apps = xdr_data.groupby('Application').agg(per_app_sums).reset_index()

top_apps['Total_Traffic'] = (
    top_apps['Total_DL_Bytes'] + top_apps['Total_UL_Bytes']
)
top_3_apps = top_apps.nlargest(3, 'Total_Traffic')

# Bar chart of the three applications with the highest total traffic.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_3_apps, x='Application', y='Total_Traffic', ax=ax)
ax.set_title('Top 3 Most Used Applications')
plt.xticks(rotation=45)
plt.show()

# Find optimal k using the elbow method: fit k-means for k = 1..10 on the
# scaled features and record each model's inertia.
inertias = []
K = range(1, 11)

for k in K:
    # Use a dedicated name for the sweep models. Rebinding `kmeans` here would
    # overwrite the fitted 3-cluster model that produced the 'Cluster' labels.
    # n_init is pinned because its default differs between scikit-learn releases.
    elbow_model = KMeans(n_clusters=k, n_init=10, random_state=42)
    elbow_model.fit(X_scaled)
    inertias.append(elbow_model.inertia_)

# Plot elbow curve (inertia against number of clusters).
plt.figure(figsize=(10, 6))
plt.plot(K, inertias, 'bx-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.title('Elbow Method For Optimal k')
plt.show()

# Locate the elbow point on the decreasing, convex inertia curve.
kl = KneeLocator(K, inertias, curve='convex', direction='decreasing')
optimal_k = kl.elbow

# KneeLocator reports None when it cannot find an elbow; say so explicitly
# instead of printing "None" as if it were a cluster count.
if optimal_k is None:
    print("\nNo clear elbow detected; inspect the elbow curve to choose k.")
else:
    print(f"\nOptimal number of clusters: {optimal_k}")