In [2]:
import ast

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [3]:
# Load the preprocessed dataset from the project's resource folder.
# NOTE(review): unpickling executes arbitrary code - only load files from a trusted source.
df_full_preprocessed = pd.read_pickle(filepath_or_buffer="../resource/df_full_preprocessed.pkl")

In [4]:
# Two feature extractors, one per clustering model:
#   * text:      unigram TF-IDF whose min_df is the fourth root of the row count (truncated to int)
#   * numerical: one-hot encode gender/topic/sign, standard-scale every remaining column
text_transformer = TfidfVectorizer(
    ngram_range=(1, 1),
    min_df=int(len(df_full_preprocessed) ** (1 / 4)),
    use_idf=True,
)
numerical_transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), ["gender", "topic", "sign"]),
    remainder=StandardScaler(),
)


In [5]:
# Build the two feature matrices the clustering models are fitted on.
# The text matrix comes from the preprocessed text column alone ...
X_text = text_transformer.fit_transform(df_full_preprocessed["text_preprocessed"])
# ... while every other column goes through the column transformer.
X_numerical = numerical_transformer.fit_transform(
    df_full_preprocessed.drop(columns="text_preprocessed")
)

## Find the best k for each dataset 

In [None]:
# Empty results table: one row per feature set, holding its {k: inertia} mapping.
validation_results = pd.DataFrame(
    data={column: [] for column in ('Features', 'Sum of squared distances')}
)

In [None]:
# Elbow validation for the text model: fit KMeans for every k in K (2 to 39 inclusive)
# and record the inertia (sum of squared distances to the closest centroid) per k.
Sum_of_squared_distances = {}
K = range(2,40)
for k in tqdm(K):
    # Seeded with the same random_state as the final models so the curve is reproducible.
    km = KMeans(n_clusters=k, max_iter=200, random_state=42).fit(X_text)
    Sum_of_squared_distances[k] = km.inertia_

# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame is the
# supported replacement and keeps the {k: inertia} dict as a single cell.
new_row = {'Features':'Text', 'Sum of squared distances':Sum_of_squared_distances}
validation_results = pd.concat([validation_results, pd.DataFrame([new_row])], ignore_index=True)

In [None]:
# Elbow validation for the numerical model: fit KMeans for every k in K (2 to 39 inclusive)
# and record the inertia (sum of squared distances to the closest centroid) per k.
Sum_of_squared_distances = {}
K = range(2,40)
for k in tqdm(K):
    # Seeded with the same random_state as the final models so the curve is reproducible.
    km = KMeans(n_clusters=k, max_iter=200, random_state=42).fit(X_numerical)
    Sum_of_squared_distances[k] = km.inertia_

# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame is the
# supported replacement and keeps the {k: inertia} dict as a single cell.
new_row = {'Features':'Numerical', 'Sum of squared distances':Sum_of_squared_distances}
validation_results = pd.concat([validation_results, pd.DataFrame([new_row])], ignore_index=True)

In [None]:
# Draw one elbow curve (inertia vs. k) per feature set and save it as "<Features>.png".
for _, result in validation_results.iterrows():
    # Label-based access; positional integer indexing on a label-indexed Series is deprecated.
    feature_name = result['Features']
    print(feature_name)

    # The stored value is normally the {k: inertia} dict itself. Only a string
    # representation (presumably results reloaded from text - TODO confirm) needs
    # parsing. The explicit type check replaces a bare `except` that silently hid
    # every error, including the NameError from the previously missing `ast` import.
    dictionary = result['Sum of squared distances']
    if isinstance(dictionary, str):
        dictionary = ast.literal_eval(dictionary)

    plt.plot(list(dictionary.keys()), list(dictionary.values()), 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow Method for optimal k')
    plt.savefig(f'{feature_name}.png')
    plt.show()
    print('---------------------------------------------------')

## Create final clusters 

In [6]:
# fit with optimal k
km_numerical = KMeans(n_clusters=3, max_iter=200, random_state=42)
km_numerical.fit(X_numerical)

# add column to dataset
df_full_preprocessed["numerical_cluster"] = km_numerical.predict(X_numerical)

In [7]:
# fit with optimal k
km_text = KMeans(n_clusters=5, max_iter=200, random_state=42)
km_text.fit(X_text)

# add column to dataset
df_full_preprocessed["text_cluster"] = km_text.predict(X_text)

In [65]:
# Persist the dataset, now carrying both cluster columns, into the working directory.
df_full_preprocessed.to_pickle(path="clustering_dataset.pkl")

## Create Dataset Profile 

In [43]:
from pandas_profiling import ProfileReport

In [34]:
# Make profile for each text cluster
for k in df_full_preprocessed["text_cluster"].unique():
    cluster_rows = df_full_preprocessed[df_full_preprocessed["text_cluster"] == k]
    # draw a subsample of 1k elements for better performance; clusters with at most
    # 1k rows are profiled in full, because sample(1_000) raises ValueError when
    # fewer rows are available. The sample is seeded so reports are reproducible.
    if len(cluster_rows) > 1_000:
        cluster_rows = cluster_rows.sample(1_000, random_state=42)
    # the other model's labels are not part of this cluster's profile
    df = cluster_rows.drop("numerical_cluster", axis=1)
    profile = ProfileReport(df, title=f'Cluster {k} Text Report', explorative=True)
    profile.to_file(f"Cluster {k} Text Report.html")


HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=37.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=37.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=37.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=37.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=37.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




In [57]:
# Make profile for each numerical cluster
for k in df_full_preprocessed["numerical_cluster"].unique():
    cluster_rows = df_full_preprocessed[df_full_preprocessed["numerical_cluster"] == k]
    # draw a subsample of 1k elements for better performance; clusters with at most
    # 1k rows are profiled in full. The explicit size check replaces a bare `except`
    # that was used as control flow and would also have hidden unrelated errors.
    # The sample is seeded so reports are reproducible.
    if len(cluster_rows) > 1_000:
        cluster_rows = cluster_rows.sample(1_000, random_state=42)
    # the other model's labels are not part of this cluster's profile
    df = cluster_rows.drop("text_cluster", axis=1)

    profile = ProfileReport(df, title=f'Cluster {k} Numerical Report', explorative=True)
    profile.to_file(f"Cluster {k} Numerical Report.html")


HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=37.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=37.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=37.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




## Export Pipeline 

In [10]:
# Bundle the fitted objects in a fixed positional order:
# 0 = text transformer, 1 = numerical transformer, 2 = text KMeans, 3 = numerical KMeans.
clustering_pipeline = pd.Series(
    [
        text_transformer,
        numerical_transformer,
        km_text,
        km_numerical,
    ]
)

In [11]:
# Write the bundled transformers and models to the shared pipelines folder.
clustering_pipeline.to_pickle(path="../resource/Pipelines/ClusteringPipeline.pkl")