In [1]:
import ast

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
# Load the preprocessed dataframe from a pickle file next to the notebook.
# NOTE(review): unpickling executes arbitrary code — only load files you trust.
preprocessed_pickle_path = "./df_full_preprocessed.pkl"
df_full_preprocessed = pd.read_pickle(preprocessed_pickle_path)

In [3]:
# Build the two feature extractors used for clustering:
#   * a unigram TF-IDF vectorizer for the text column, whose min_df is the
#     fourth root of the number of rows (truncated to an int);
#   * a column transformer that one-hot encodes the categorical columns and
#     passes every remaining column through a StandardScaler.
min_document_frequency = int(len(df_full_preprocessed) ** (1 / 4))
text_transformer = TfidfVectorizer(
    ngram_range=(1, 1),
    min_df=min_document_frequency,
    use_idf=True,
)

categorical_columns = ["gender", "topic", "sign"]
numerical_transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    remainder=StandardScaler(),
)


In [None]:
# Fit each transformer on the full dataframe and keep the resulting feature
# matrices: the text column feeds the TF-IDF model, every other column feeds
# the numerical/categorical transformer.
text_column = df_full_preprocessed["text_preprocessed"]
non_text_columns = df_full_preprocessed.drop(columns="text_preprocessed")

X_text = text_transformer.fit_transform(text_column)
X_numerical = numerical_transformer.fit_transform(non_text_columns)

In [None]:
# Empty results table: one row per feature set, holding the feature-set name
# and its {k: inertia} mapping.
result_columns = ['Features', 'Sum of squared distances']
validation_results = pd.DataFrame(data={column: [] for column in result_columns})

In [None]:
# Elbow validation for the text features: fit KMeans for every k in K
# (2 through 39 inclusive) and record each model's inertia, i.e. the sum of
# squared distances of samples to their closest cluster centre.
Sum_of_squared_distances = {}
K = range(2, 40)
for k in tqdm(K):
    # Fixed seed so the elbow curve is reproducible across kernel restarts.
    km = KMeans(n_clusters=k, max_iter=200, random_state=42)
    km = km.fit(X_text)
    Sum_of_squared_distances[k] = km.inertia_

# Store the whole {k: inertia} mapping in a single cell of the results table.
new_row = {'Features': 'Text', 'Sum of squared distances': Sum_of_squared_distances}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
validation_results = pd.concat(
    [validation_results, pd.DataFrame([new_row])],
    ignore_index=True,
)

In [None]:
# Elbow validation for the numerical features: fit KMeans for every k in K
# (2 through 39 inclusive) and record each model's inertia, i.e. the sum of
# squared distances of samples to their closest cluster centre.
Sum_of_squared_distances = {}
K = range(2, 40)
for k in tqdm(K):
    # Fixed seed so the elbow curve is reproducible across kernel restarts.
    km = KMeans(n_clusters=k, max_iter=200, random_state=42)
    km = km.fit(X_numerical)
    Sum_of_squared_distances[k] = km.inertia_

# Store the whole {k: inertia} mapping in a single cell of the results table.
new_row = {'Features': 'Numerical', 'Sum of squared distances': Sum_of_squared_distances}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
validation_results = pd.concat(
    [validation_results, pd.DataFrame([new_row])],
    ignore_index=True,
)

In [None]:
# Plot and save the elbow curve (inertia against k) for every feature set in
# the results table. Each figure is written to '<feature name>.png' in the
# current working directory and then displayed.
for _, row in validation_results.iterrows():
    # Access by column label; positional integer lookup on a label-indexed
    # Series is deprecated in recent pandas.
    feature_name = row['Features']
    dictionary = row['Sum of squared distances']
    print(feature_name)

    # In this session the cell holds a real dict. NOTE(review): the original
    # attempted ast.literal_eval first, presumably for results reloaded from
    # text (e.g. CSV) where the dict arrives as its repr string — confirm.
    # Parsing errors now surface instead of being swallowed by a bare except.
    if isinstance(dictionary, str):
        dictionary = ast.literal_eval(dictionary)

    fig, ax = plt.subplots()
    ax.plot(list(dictionary.keys()), list(dictionary.values()), 'bx-')
    ax.set_xlabel('k')
    ax.set_ylabel('Sum_of_squared_distances')
    ax.set_title('Elbow Method for optimal k')
    fig.savefig(f'{feature_name}.png')
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)
    print('---------------------------------------------------')