In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import euclidean_distances
from sqlalchemy import create_engine

In [2]:
# Connect to PostgreSQL database
engine = create_engine('postgresql://postgres:root@localhost/telecom')
query = "SELECT * FROM xdr_data"
data = pd.read_sql(query, engine)

In [4]:
# Select relevant columns for analysis
relevant_columns = ["Total DL (Bytes)", "Gaming UL (Bytes)", "Other DL (Bytes)", "Other UL (Bytes)",
                    "Avg RTT DL (ms)", "Avg RTT UL (ms)", "Avg Bearer TP DL (kbps)", "Avg Bearer TP UL (kbps)"]
data_subset = data[relevant_columns]


In [7]:
# Check for missing values
print("Missing values per column:")
print(data_subset.isnull().sum())

# Impute missing values with the mean of each column
data_subset.fillna(data_subset.mean(), inplace=True)

Missing values per column:
Total DL (Bytes)               1
Gaming UL (Bytes)              0
Other DL (Bytes)               0
Other UL (Bytes)               0
Avg RTT DL (ms)            27829
Avg RTT UL (ms)            27812
Avg Bearer TP DL (kbps)        1
Avg Bearer TP UL (kbps)        1
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_subset.fillna(data_subset.mean(), inplace=True)


In [8]:

# Task 4.1: Assign engagement and experience scores
kmeans_engagement = KMeans(n_clusters=2, random_state=42)
kmeans_experience = KMeans(n_clusters=2, random_state=42)


In [9]:
# Fit clustering for engagement and experience
kmeans_engagement.fit(data_subset)
kmeans_experience.fit(data_subset)


In [10]:
# Get cluster centers
engagement_clusters = kmeans_engagement.cluster_centers_
experience_clusters = kmeans_experience.cluster_centers_


In [11]:
# Define the "less engaged" and "worst experience" clusters
less_engaged_cluster = engagement_clusters[0]  # Assume cluster 0 is less engaged
worst_experience_cluster = experience_clusters[0]  # Assume cluster 0 is worst experience

In [12]:
# Calculate engagement and experience scores
data["engagement_score"] = euclidean_distances(data_subset, [less_engaged_cluster]).flatten()
data["experience_score"] = euclidean_distances(data_subset, [worst_experience_cluster]).flatten()


In [13]:

# Task 4.2: Calculate satisfaction score
data["satisfaction_score"] = data[["engagement_score", "experience_score"]].mean(axis=1)
top_10_satisfied = data.nlargest(10, "satisfaction_score")
print("Top 10 satisfied customers:")
print(top_10_satisfied)

Top 10 satisfied customers:
           Bearer Id            Start  Start ms              End  End ms  \
64036   7.349883e+18   4/26/2019 6:55     948.0   4/27/2019 4:47   872.0   
60082   1.304243e+19   4/26/2019 3:00     174.0   4/27/2019 3:00   148.0   
16855   7.277826e+18  4/24/2019 19:19     769.0   4/25/2019 0:48   471.0   
140950  1.304243e+19   4/29/2019 3:48     498.0   4/30/2019 3:48   426.0   
128778  1.311448e+19   4/28/2019 8:22     554.0  4/30/2019 15:59   612.0   
964     7.277826e+18   4/24/2019 0:41     269.0   4/25/2019 0:41   296.0   
35417   1.304243e+19   4/25/2019 1:00     894.0   4/26/2019 1:00   808.0   
138809  1.304243e+19   4/29/2019 2:30     355.0   4/30/2019 2:30   354.0   
116473  7.277826e+18   4/28/2019 1:30     877.0   4/29/2019 1:30   773.0   
95150   1.304243e+19  4/27/2019 16:16      49.0   4/28/2019 0:59   275.0   

        Dur. (ms)          IMSI  MSISDN/Number          IMEI  \
64036     78729.0  2.082017e+14   3.365092e+10  3.530281e+13   
60082  

In [14]:
# Task 4.3: Build a regression model to predict satisfaction score
X = data[["engagement_score", "experience_score"]]
y = data["satisfaction_score"]
reg_model = LinearRegression()
reg_model.fit(X, y)
print("Regression model coefficients:", reg_model.coef_)

Regression model coefficients: [0.5 0.5]


In [15]:
# Task 4.4: Run k-means clustering (k=2) on engagement and experience scores
kmeans_satisfaction = KMeans(n_clusters=2, random_state=42)
kmeans_satisfaction.fit(data[["engagement_score", "experience_score"]])
data["satisfaction_cluster"] = kmeans_satisfaction.labels_


In [16]:
# Task 4.5: Aggregate average satisfaction and experience scores per cluster
cluster_aggregates = data.groupby("satisfaction_cluster")[
    ["satisfaction_score", "experience_score"]
].mean()
print("Cluster aggregates:")
print(cluster_aggregates)


Cluster aggregates:
                      satisfaction_score  experience_score
satisfaction_cluster                                      
0                           5.103832e+08      5.103832e+08
1                           2.372602e+08      2.372602e+08


In [18]:
# Task 4.6: Export data to MySQL
engine = create_engine('postgresql://postgres:root@localhost/telecom')
data_to_export = data[["Bearer Id", "engagement_score", "experience_score", "satisfaction_score"]]
data_to_export.to_sql("satisfaction_analysis", con=engine, if_exists="replace", index=False)

1

In [19]:
# Example SQL Query Output
query_result = pd.read_sql_query("SELECT * FROM satisfaction_analysis LIMIT 10;", con=engine)
print("Example data from MySQL:")
print(query_result)


Example data from MySQL:
      Bearer Id  engagement_score  experience_score  satisfaction_score
0  1.311448e+19      4.347967e+08      4.347967e+08        4.347967e+08
1  1.311448e+19      1.067570e+08      1.067570e+08        1.067570e+08
2  1.311448e+19      3.853633e+08      3.853633e+08        3.853633e+08
3  1.311448e+19      3.745458e+08      3.745458e+08        3.745458e+08
4  1.311448e+19      1.612275e+08      1.612275e+08        1.612275e+08
5  1.311448e+19      2.503823e+08      2.503823e+08        2.503823e+08
6  1.311448e+19      7.067903e+08      7.067903e+08        7.067903e+08
7  1.304243e+19      1.606175e+08      1.606175e+08        1.606175e+08
8  1.311448e+19      3.168431e+08      3.168431e+08        3.168431e+08
9  1.304243e+19      4.019122e+08      4.019122e+08        4.019122e+08
