# Notebook to explore the data for the clustering step

In [0]:
spark.conf.set("spark.sql.parquet.columnarReaderBatchSize", 1024)

In [0]:
from pyspark.sql import functions as sf
input_container_path = f"wasbs://energypatents@challengebasf.blob.core.windows.net"
input_blob_folder = f"{input_container_path}/output_data_v1/"
df = spark.read.parquet(input_blob_folder)
df = df.select("_file", "prediction", "english_text_features")
df = df.filter(sf.col("prediction") == 1)
df = df.drop("prediction")

In [0]:
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.pipeline import Pipeline

min_df = 0.10
max_df = 0.80
cv = CountVectorizer(inputCol="english_text_features", outputCol="vectors", minDF=min_df, maxDF=max_df)
idf = IDF(inputCol="vectors", outputCol="features")
pipe = Pipeline(stages=[cv, idf])
model = pipe.fit(df)
result_idf = model.transform(df)

In [0]:
result_idf = result_idf.cache()

In [0]:
from pyspark.ml.clustering import KMeans
n = 10
list_n = [4,6,8,10,15,20]
for n in list_n:
  k_means = KMeans(k=n, maxIter=100, seed=18,  distanceMeasure="cosine")
  k_means_model = k_means.fit(result_idf)
  print(n)
  print(k_means_model.summary.clusterSizes)
  print(k_means_model.summary.trainingCost)

We choose n=10 (elbow rule)

In [0]:
n = 10
k_means = KMeans(k=n, maxIter=100, seed=18,  distanceMeasure="cosine")
k_means_model = k_means.fit(result_idf)
print(n)
print(k_means_model.summary.clusterSizes)
print(k_means_model.summary.trainingCost)

In [0]:
result_final = k_means_model.transform(result_idf)

In [0]:
result_final.printSchema()

In [0]:
result_final = result_final.cache()

In [0]:
result_final.select("prediction").drop_duplicates().show()

In [0]:
centers = k_means_model.clusterCenters()
print(f"Centers: {centers}")
summary = k_means_model.summary
summary.k

In [0]:
summary.clusterSizes

In [0]:
import pandas as pd
vocab = model.stages[0].vocabulary
list_dfs = []
for num_cluster in range(n):
  df = pd.DataFrame({"word":vocab, "value": centers[num_cluster]})
  df.loc[:, "cluster"] = num_cluster
  list_dfs.append(df)
total_df = pd.concat(list_dfs, axis=1)

In [0]:
total_df.head()

Unnamed: 0,word,value,cluster,word.1,value.1,cluster.1,word.2,value.2,cluster.2,word.3,value.3,cluster.3,word.4,value.4,cluster.4,word.5,value.5,cluster.5,word.6,value.6,cluster.6,word.7,value.7,cluster.7,word.8,value.8,cluster.8,word.9,value.9,cluster.9
0,signal,0.011194,0,signal,0.011433,1,signal,0.034876,2,signal,0.034817,3,signal,0.024101,4,signal,0.078191,5,signal,0.027973,6,signal,0.009017,7,signal,0.03269,8,signal,0.197097,9
1,b,0.019192,0,b,0.012826,1,b,0.022773,2,b,0.013405,3,b,0.028266,4,b,0.023276,5,b,0.031169,6,b,0.015437,7,b,0.022842,8,b,0.04757,9
2,unit,0.029559,0,unit,0.017978,1,unit,0.048812,2,unit,0.01206,3,unit,0.049118,4,unit,0.047756,5,unit,0.028854,6,unit,0.012019,7,unit,0.021465,8,unit,0.040754,9
3,module,0.028371,0,module,0.020164,1,module,0.052605,2,module,0.018422,3,module,0.127349,4,module,0.042245,5,module,0.060328,6,module,0.035532,7,module,0.065659,8,module,0.066607,9
4,service,0.006376,0,service,0.020728,1,service,0.00997,2,service,0.027288,3,service,0.123728,4,service,0.05524,5,service,0.061019,6,service,0.085975,7,service,0.090517,8,service,0.014016,9


In [0]:
total_df.to_excel("cluster_centers.xlsx", header=True, index=False)

In [0]:
%fs cp file:/databricks/driver/cluster_centers.xlsx dbfs:/FileStore/cluster_centers.xlsx

Analysing the words with most value in each topic we can identify:
1 - Physics  
2 - Gaming  
3 - Photography  
4 - Mobile networks  
5 - Computers/cloud IT  
6 - Mobile networks  
7 - Networking  
8 - Video streaming  
9 - Mobile