In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean

In [2]:
# Read the CSV export into the working DataFrame (path is relative to the kernel's working directory).
csv_path = 'telcom_data.csv'
df = pd.read_csv(csv_path)

In [3]:
def _tidy_label(label):
    """Return `label` trimmed, with spaces turned into underscores and periods removed."""
    return label.strip().replace(' ', '_').replace('.', '')


# Apply the same clean-up to every column header so later cells can use the tidy names.
df.columns = list(map(_tidy_label, df.columns))

In [4]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

(150001, 55)

In [5]:
# Display the column labels of the frame (underscored form produced by the renaming cell).
df.columns

Index(['Bearer_Id', 'Start', 'Start_ms', 'End', 'End_ms', 'Dur_(ms)', 'IMSI',
       'MSISDN/Number', 'IMEI', 'Last_Location_Name', 'Avg_RTT_DL_(ms)',
       'Avg_RTT_UL_(ms)', 'Avg_Bearer_TP_DL_(kbps)', 'Avg_Bearer_TP_UL_(kbps)',
       'TCP_DL_Retrans_Vol_(Bytes)', 'TCP_UL_Retrans_Vol_(Bytes)',
       'DL_TP_<_50_Kbps_(%)', '50_Kbps_<_DL_TP_<_250_Kbps_(%)',
       '250_Kbps_<_DL_TP_<_1_Mbps_(%)', 'DL_TP_>_1_Mbps_(%)',
       'UL_TP_<_10_Kbps_(%)', '10_Kbps_<_UL_TP_<_50_Kbps_(%)',
       '50_Kbps_<_UL_TP_<_300_Kbps_(%)', 'UL_TP_>_300_Kbps_(%)',
       'HTTP_DL_(Bytes)', 'HTTP_UL_(Bytes)', 'Activity_Duration_DL_(ms)',
       'Activity_Duration_UL_(ms)', 'Dur_(ms)1', 'Handset_Manufacturer',
       'Handset_Type', 'Nb_of_sec_with_125000B_<_Vol_DL',
       'Nb_of_sec_with_1250B_<_Vol_UL_<_6250B',
       'Nb_of_sec_with_31250B_<_Vol_DL_<_125000B',
       'Nb_of_sec_with_37500B_<_Vol_UL',
       'Nb_of_sec_with_6250B_<_Vol_DL_<_31250B',
       'Nb_of_sec_with_6250B_<_Vol_UL_<_37500B',
     

#  --------------- Task 4.1 Configuration -----------------

In [6]:
# Column groups used by the engagement and experience analysis below.
# Engagement: average bearer throughput and activity duration, downlink and uplink.
engagement_features = [
    'Avg_Bearer_TP_DL_(kbps)',
    'Avg_Bearer_TP_UL_(kbps)',
    'Activity_Duration_DL_(ms)',
    'Activity_Duration_UL_(ms)',
]
# Experience: round-trip time, TCP retransmission volume and share of low-throughput time.
experience_features = [
    'Avg_RTT_DL_(ms)',
    'Avg_RTT_UL_(ms)',
    'TCP_DL_Retrans_Vol_(Bytes)',
    'TCP_UL_Retrans_Vol_(Bytes)',
    'DL_TP_<_50_Kbps_(%)',
    'UL_TP_<_10_Kbps_(%)',
]

In [7]:
from sklearn.impute import SimpleImputer


def _impute_then_scale(values):
    """Mean-impute missing entries, then standardise each column.

    Returns the fitted imputer, the imputed array, the fitted scaler and the
    scaled array, in that order.
    """
    filler = SimpleImputer(strategy='mean')
    filled = filler.fit_transform(values)
    standardiser = StandardScaler()
    return filler, filled, standardiser, standardiser.fit_transform(filled)


# Engagement columns: fill gaps with the column mean, then standardise.
imp_eng, engagement_data_imputed, scaler_eng, scaled_engagement = _impute_then_scale(
    df[engagement_features])

# Experience columns: same treatment, with its own imputer and scaler.
imp_exp, experience_data_imputed, scaler_exp, scaled_experience = _impute_then_scale(
    df[experience_features])


In [8]:
# Clustering: split the rows into two groups for each standardised feature set.
# n_init is pinned explicitly because scikit-learn changed its default (10 -> 'auto'
# in 1.4); leaving it implicit makes the fit depend on the installed version and
# raises a FutureWarning on 1.2/1.3. KMeans itself is imported in the first cell.
# NOTE(review): the recorded output of this cell shows joblib/loky failing to count
# physical cores ([WinError 2]); setting the LOKY_MAX_CPU_COUNT environment variable
# before fitting is the usual way to avoid that - confirm on the target machine.
kmeans_eng = KMeans(n_clusters=2, n_init=10, random_state=42)
df['engagement_cluster'] = kmeans_eng.fit_predict(scaled_engagement)

kmeans_exp = KMeans(n_clusters=2, n_init=10, random_state=42)
df['experience_cluster'] = kmeans_exp.fit_predict(scaled_experience)

[WinError 2] The system cannot find the file specified
  File "C:\ProgramData\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\ProgramData\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


In [9]:
# Pull the fitted cluster centres out of both KMeans models.
eng_centroids, exp_centroids = (
    model.cluster_centers_ for model in (kmeans_eng, kmeans_exp)
)

In [10]:
# Pick the reference cluster for each score by comparing per-centroid feature averages.
# Engagement: the centroid with the smallest average is taken as the least engaged.
less_engaged_cluster = eng_centroids.mean(axis=1).argmin()
# Experience: the centroid with the largest average is taken as the worst one
# (presumably because RTT, retransmission volume and low-throughput share are
# "higher is worse" metrics - confirm).
worst_experience_cluster = exp_centroids.mean(axis=1).argmax()

In [11]:
# Engagement score: Euclidean distance of every row from the least-engaged centroid.
# Computed with one vectorised norm over the whole matrix rather than one scipy call
# per row; the distances are the same up to floating-point rounding.
df['engagement_score'] = np.linalg.norm(
    scaled_engagement - eng_centroids[less_engaged_cluster], axis=1)

In [12]:
# Experience score: Euclidean distance of every row from the worst-experience centroid.
# Computed with one vectorised norm over the whole matrix rather than one scipy call
# per row; the distances are the same up to floating-point rounding.
df['experience_score'] = np.linalg.norm(
    scaled_experience - exp_centroids[worst_experience_cluster], axis=1)

# -------- Task 4.2 - Satisfaction Score + Top 10 Customers -------

In [13]:
# Satisfaction score = row-wise mean of the engagement and experience scores.
score_columns = ['engagement_score', 'experience_score']
df['satisfaction_score'] = df[score_columns].mean(axis=1)

# Ten highest-scoring rows, best first.
# NOTE(review): rows are ranked individually and labelled by Bearer_Id; if the goal is
# the top *customers*, aggregating per subscriber first (e.g. by 'MSISDN/Number') may
# be what is intended - confirm.
top_10 = df.sort_values('satisfaction_score', ascending=False).iloc[:10]
print(top_10[['Bearer_Id', 'satisfaction_score']])


           Bearer_Id  satisfaction_score
36181   1.304240e+19           97.031322
76625   1.304240e+19           88.361635
30155   7.349880e+18           86.731511
67304   1.304240e+19           81.652014
1491    1.304240e+19           77.449380
35127   7.277830e+18           62.868172
133262  1.304240e+19           60.656232
29916   7.349880e+18           57.866765
17894            NaN           49.969324
5963    1.304240e+19           49.119320


# -------- Task 4.3 - Regression Model to Predict Satisfaction --------

In [14]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# NOTE(review): satisfaction_score is itself computed from these same feature columns
# (via the scaled distances above), so the metrics below measure how well a linear
# model approximates that formula rather than predictive power on new information.

# Predictors are every engagement and experience column; the target is the satisfaction score.
features = [*engagement_features, *experience_features]
X, y = df[features], df['satisfaction_score']

# Replace missing predictor values with the column mean.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed, y, random_state=42, test_size=0.2)

# Fit ordinary least squares on the training rows and predict the held-out rows.
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)

# Report goodness of fit on the held-out rows.
print("R² Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))


R² Score: 0.9625932887212284
MSE: 0.09026168689379033


# -------- Task 4.4 - KMeans on Engagement & Experience Scores --------

In [19]:
# Cluster the rows into two groups using the two derived scores.
# n_init is pinned explicitly because scikit-learn changed its default (10 -> 'auto'
# in 1.4); leaving it implicit makes the fit depend on the installed version.
kmeans_satisfaction = KMeans(n_clusters=2, n_init=10, random_state=42)
df['satisfaction_cluster'] = kmeans_satisfaction.fit_predict(df[['engagement_score', 'experience_score']])
# Echo the estimator configuration that was used.
print(kmeans_satisfaction)

KMeans(n_clusters=2, random_state=42)


# -------- Task 4.5 - Aggregate by Cluster --------

In [20]:
# Mean satisfaction and experience score within each satisfaction cluster.
summary_columns = ['satisfaction_score', 'experience_score']
cluster_summary = (
    df[['satisfaction_cluster', *summary_columns]]
    .groupby('satisfaction_cluster')
    .mean()
)
print(cluster_summary)


                      satisfaction_score  experience_score
satisfaction_cluster                                      
0                               0.721844          0.783201
1                               4.641814          4.055313


# -------- Task 4.6 - Export to MySQL --------

In [21]:
pip install pyodbc sqlalchemy

Note: you may need to restart the kernel to use updated packages.


In [22]:
pip install pymysql

Note: you may need to restart the kernel to use updated packages.


# -------- Task 4.7 - Model Deployment Tracking (Basic MLflow Example) --------