In [1]:
# Reload imported modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import os
import operator

import jsonlines
import pandas as pd
import pyspark
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.clustering import KMeans, GaussianMixture

In [3]:
# Get the notebook's SparkSession (getOrCreate reuses a running session if one
# exists), with Hive support enabled on the builder.
spark_builder = pyspark.sql.SparkSession.builder.appName("Python Spark K-means")
spark = spark_builder.enableHiveSupport().getOrCreate()

In [4]:
# Directory holding the parquet input data for clustering.
path_aggregated_df = "../data/output/joined/"

In [5]:
# Load the input rows as a Spark DataFrame.
clustering_df = spark.read.parquet(path_aggregated_df)

In [8]:
# Input columns fed to the clustering feature vector. The order matters: it
# defines the order of the components in the assembled vector and of the
# columns in the exported cluster-centre table.
columns_clustering_features = [
    # user_*
    "user_lifetime",
    "user_no_outgoing_activity_in_days",
    "user_account_balance_last",
    "user_spendings",
    # reloads_*
    "reloads_inactive_days",
    "reloads_count",
    "reloads_sum",
    # calls_outgoing_*
    "calls_outgoing_count",
    "calls_outgoing_spendings",
    "calls_outgoing_duration",
    "calls_outgoing_spendings_max",
    "calls_outgoing_duration_max",
    "calls_outgoing_inactive_days",
    # calls_outgoing_to_onnet_*
    "calls_outgoing_to_onnet_count",
    "calls_outgoing_to_onnet_spendings",
    "calls_outgoing_to_onnet_duration",
    "calls_outgoing_to_onnet_inactive_days",
    # calls_outgoing_to_offnet_*
    "calls_outgoing_to_offnet_count",
    "calls_outgoing_to_offnet_spendings",
    "calls_outgoing_to_offnet_duration",
    "calls_outgoing_to_offnet_inactive_days",
    # calls_outgoing_to_abroad_*
    "calls_outgoing_to_abroad_count",
    "calls_outgoing_to_abroad_spendings",
    "calls_outgoing_to_abroad_duration",
    "calls_outgoing_to_abroad_inactive_days",
    # sms_outgoing_*
    "sms_outgoing_count",
    "sms_outgoing_spendings",
    "sms_outgoing_spendings_max",
    "sms_outgoing_inactive_days",
    # sms_outgoing_to_onnet_*
    "sms_outgoing_to_onnet_count",
    "sms_outgoing_to_onnet_spendings",
    "sms_outgoing_to_onnet_inactive_days",
    # sms_outgoing_to_offnet_*
    "sms_outgoing_to_offnet_count",
    "sms_outgoing_to_offnet_spendings",
    "sms_outgoing_to_offnet_inactive_days",
    # sms_outgoing_to_abroad_*
    "sms_outgoing_to_abroad_count",
    "sms_outgoing_to_abroad_spendings",
    "sms_outgoing_to_abroad_inactive_days",
    # sms_incoming_*
    "sms_incoming_count",
    "sms_incoming_spendings",
    "sms_incoming_from_abroad_count",
    "sms_incoming_from_abroad_spendings",
    # gprs_*
    "gprs_session_count",
    "gprs_usage",
    "gprs_spendings",
    "gprs_inactive_days",
    # last_100_*
    "last_100_reloads_count",
    "last_100_reloads_sum",
    "last_100_calls_outgoing_duration",
    "last_100_calls_outgoing_to_onnet_duration",
    "last_100_calls_outgoing_to_offnet_duration",
    "last_100_calls_outgoing_to_abroad_duration",
    "last_100_sms_outgoing_count",
    "last_100_sms_outgoing_to_onnet_count",
    "last_100_sms_outgoing_to_offnet_count",
    "last_100_sms_outgoing_to_abroad_count",
    "last_100_gprs_usage",
    # n_months
    "n_months",
]

In [12]:
# Pack all clustering feature columns into one vector column named
# "initial_features".
vector_assembler = VectorAssembler(
    outputCol="initial_features",
    inputCols=columns_clustering_features,
)

In [13]:
# Standardize the assembled vector (centering and unit-std scaling both
# enabled) into the "features" column.
standard_scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="initial_features",
    outputCol="features",
)

In [14]:
# Manual featurization: assemble the feature vector, fit the scaler on it,
# then apply the fitted scaler.
# NOTE(review): `model_scaler` and `featurized_clustering_df` are reassigned by
# the pipeline-based cells that follow, so this cell looks redundant (it fits
# the scaler a second time) — confirm before removing.
vectorized_df = vector_assembler.transform(clustering_df)
model_scaler = standard_scaler.fit(vectorized_df)
featurized_clustering_df = model_scaler.transform(vectorized_df)

In [15]:
# Chain the assembler and the scaler into one two-stage pipeline.
featurization_pipeline = Pipeline(stages=[vector_assembler, standard_scaler])

In [16]:
# Fit the pipeline stages on the loaded data.
featurization_pipeline_model = featurization_pipeline.fit(clustering_df)

In [17]:
# The scaler is the last pipeline stage; keep its fitted model so its
# statistics can be read later.
model_scaler = featurization_pipeline_model.stages[-1]

In [18]:
# Run the fitted pipeline to add the scaled "features" column used for clustering.
featurized_clustering_df = featurization_pipeline_model.transform(clustering_df)

In [19]:
# Number of clusters to fit; also embedded in the output file names.
k = 5

In [20]:
# K-means estimator over the scaled "features" column.
# The seed is pinned so that centroid initialisation, and therefore the
# resulting clusters and SSE, are reproducible across kernel restarts.
# Without it PySpark falls back to a default derived from a Python string
# hash, which is not stable between interpreter sessions.
kmeans = KMeans(featuresCol="features", k=k, seed=42)

In [21]:
# Fit k-means on the scaled feature vectors.
model_kmeans = kmeans.fit(featurized_clustering_df)

In [22]:
# JSON-lines file that the SSE metric rows are appended to.
# NOTE(review): the file name ends in "see" while the variable says "sse" —
# possibly a typo; confirm no existing file or reader depends on the current
# name before renaming.
path_metrics_kmeans_sse = "../data/metrics_kmeans_see.jsonl"

In [23]:
# Sum of squared distances from every training row to its nearest centroid.
# `KMeansModel.computeCost` was deprecated in Spark 2.4 and removed in 3.0.
# `summary.trainingCost` reports the same quantity for the training data,
# which is exactly the frame the model was fit on here, so it is used when
# the old method is not available.
if hasattr(model_kmeans, "computeCost"):
    sse = model_kmeans.computeCost(featurized_clustering_df)
else:
    sse = model_kmeans.summary.trainingCost

In [24]:
# Append this run's k and SSE as one JSON line; lines already in the file are
# kept because the file is opened in append mode.
metrics_row = dict(k=k, sse=sse)

with jsonlines.open(path_metrics_kmeans_sse, mode="a") as metrics_writer:
    metrics_writer.write(metrics_row)

In [28]:
# Cluster centres in the scaled feature space (the model was fit on the
# scaled "features" column).
normalized_cluster_centers = model_kmeans.clusterCenters()

In [29]:
# Per-feature statistics learned by the scaler; used to map centroids back to
# the original feature scale.
scaler_mean = model_scaler.mean
scaler_std = model_scaler.std
# Number of training rows assigned to each cluster.
cluster_sizes = model_kmeans.summary.clusterSizes
# Every training row belongs to exactly one cluster, so the sizes add up to
# the row count. Summing them avoids a separate Spark count job over the input.
n_obs = sum(cluster_sizes)

In [30]:
# One record per cluster: (id, size, size as a percentage of all rows),
# followed by the centroid mapped back to the original feature scale
# (scaled value * std + mean).
denormalized_cluster_centers = [
    (idx, n_members, 100 * n_members / n_obs)
    + tuple(centroid * scaler_std + scaler_mean)
    for idx, (n_members, centroid) in enumerate(
        zip(cluster_sizes, normalized_cluster_centers)
    )
]

In [None]:
# Tabulate the per-cluster records: three summary columns followed by one
# column per clustering feature, in the same order as the feature list.
cluster_centers_pddf = pd.DataFrame.from_records(
    denormalized_cluster_centers,
    columns=["cluster_id", "cluster_size", "cluster_size_pct"]
    + columns_clustering_features,
)

In [None]:
# Show (up to 999) columns when displaying wide frames such as the
# cluster-centre table. The fully qualified option name is required: the
# abbreviated "max_columns" also matches `styler.render.max_columns` on
# pandas >= 1.4 and raises OptionError there.
pd.set_option("display.max_columns", 999)

In [None]:
# Output CSV for the cluster-centre table; the file name encodes k.
path_cluster_centers = "../data/cluster_centers_kmeans__k_{}.csv".format(k)

In [None]:
# Write the table without the pandas index column.
cluster_centers_pddf.to_csv(path_cluster_centers, index=False)

In [None]:
# Run the fitted k-means model over the featurized rows to obtain per-row
# cluster predictions.
clustered_kmeans_df = model_kmeans.transform(featurized_clustering_df)

In [None]:
# Output location for the clustered rows; the name encodes k.
path_clustered_df = "../data/clustered_kmeans__k_{}_parquet".format(k)

In [None]:
# Persist the clustered rows as parquet. The default save mode raises if the
# target path already exists, which makes a re-run of the notebook for the
# same k fail at this cell. Overwrite instead, matching the CSV export above,
# which also replaces its previous output.
clustered_kmeans_df.write.mode("overwrite").parquet(path_clustered_df)