### Import Libraries

In [198]:
import re
import nltk
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from pathlib import Path
from sentence_transformers import SentenceTransformer, util
import os
import matplotlib.pyplot as plt
import seaborn as sns
import math
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from unidecode import unidecode
from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
from scipy.spatial import distance_matrix, distance
from sklearn.feature_selection import RFE
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples
from sklearn_extra.cluster import KMedoids

sns.set_theme()

# NLTK resources used by this notebook: the WordNet lemmatizer needs 'wordnet'
# and 'omw-1.4', while pre_process() reads nltk.corpus.words and
# nltk.corpus.stopwords. Downloading all of them here lets the notebook run on
# a machine where they were never fetched before.
for nltk_resource in ('wordnet', 'omw-1.4', 'words', 'stopwords'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Functions

Here you can find all the necessary functions to run the second part of this notebook.

### Functions to Read and Pre-process the information

In [2]:
def load_data(dir, index = 0, label='', num_words=7):
    """
    Recursively reads every file under `dir` and runs `pre_process` on its content.

    Files found directly inside `dir` are tagged with `label`. Every other entry
    (sub-directory) is walked recursively, using the entry name as the label of
    the files it contains.

        Parameters:
            dir (str): Path of the directory to scan
            index (int): Id given to the first document that is kept; it grows by one per kept document
            label (str): Label attached to the files that live directly in `dir`
            num_words (int): Number of words per sentence chunk, forwarded to `pre_process`

        Return:
            Tuple (embeddings, cvs, index):
                embeddings (list of dict): the records returned by `pre_process`, one per sentence chunk
                cvs (list of dict): one record per document with keys 'label', 'index' and 'text'
                index (int): the next free document id
    """
    df_embeddings = []
    df_cvs = []
    # os.listdir() returns the entries in arbitrary order; sorting them makes the
    # document ids reproducible from one run (and one machine) to the next.
    for file in sorted(os.listdir(dir)):
        path = os.path.join(dir, file)
        if os.path.isfile(path):
            # NOTE(review): the file is decoded with the platform default encoding.
            with open(path, 'r') as tmp:
                embeddings, text = pre_process(tmp.read(), label, index, num_words)
            # Documents that produce no embeddings are skipped and do not consume an id.
            if embeddings:
                df_embeddings.extend(embeddings)
                df_cvs.append({
                    'label': label,
                    'index': index,
                    'text': text
                })
                index += 1
                print(f'Processing cv: {index}')
        else:
            # The recursive call continues the numbering where this level left off.
            result, result2, index = load_data(path, index, file, num_words)
            df_embeddings.extend(result)
            df_cvs.extend(result2)

    return df_embeddings, df_cvs, index


_PRE_PROCESS_CACHE = {}


def _pre_process_resources():
    """Builds once, and then reuses, the heavy objects needed by `pre_process`."""
    if not _PRE_PROCESS_CACHE:
        # All values are built before update() runs, so a failure while loading
        # one of them leaves the cache empty and the next call retries.
        _PRE_PROCESS_CACHE.update(
            regex=re.compile('[^a-zA-Z]'),
            words=set(nltk.corpus.words.words()),
            stop_words=set(nltk.corpus.stopwords.words('english')),
            wnl=WordNetLemmatizer(),
            model=SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2'),
        )
    return _PRE_PROCESS_CACHE


def pre_process(text, label, index, num_words=7):
    """
    Cleans a text and encodes it in chunks of `num_words` words.

    The cleaning applies the following processings:
    -Replace characters that are not letters with spaces and lowercase the text
    -Keep only words of more than one letter that are in the NLTK English word list
    -Remove English stopwords
    -Applies lemmatization to the remaining words

        Parameters:
            text (str): Raw text of the document
            label: Label copied into every returned record
            index: Document id copied into every returned record
            num_words (int): Number of words per encoded chunk

        Return:
            Tuple (embeddings, processed_text):
                embeddings (list of dict): one record per chunk with keys 'label', 'index',
                    'sentence', 'embedding' and 'len' (the number of chunks of the document).
                    The list is empty when no word survives the cleaning.
                processed_text (str): the cleaned words joined by single spaces
    """
    resources = _pre_process_resources()
    words = resources['words']
    stop_words = resources['stop_words']
    wnl = resources['wnl']
    model = resources['model']

    text = resources['regex'].sub(' ', text)
    text = text.lower()

    tokens = [wnl.lemmatize(word) for word in nltk.wordpunct_tokenize(text)
              if word in words and word not in stop_words and len(word) > 1]
    processed_text = " ".join(tokens)

    # Nothing left to encode: return no records so callers can skip the document
    # instead of receiving the embedding of an empty sentence.
    if not tokens:
        return [], processed_text

    embeddings = []
    length = math.ceil(len(tokens)/num_words)
    for i in range(0, len(tokens), num_words):
        sentence = ' '.join(tokens[i:i+num_words])
        embedding = model.encode(sentence)
        embeddings.append({
            'label': label,
            'index': index,
            'sentence': sentence,
            'embedding': embedding,
            'len': length
        })
    return embeddings, processed_text


def extract_features(df, label_column, index_column, sentence_column):
    """
    Extract features that are easy to obtain from the df

        Parameters:
            df (DataFrame): One row per sentence chunk
            label_column (str): Column holding the label of the chunk
            index_column (str): Column holding the id of the document the chunk belongs to
            sentence_column (str): Column holding the text of the chunk

        Return:
            Tuple (tfidf, wf, vectorizer):
                tfidf (DataFrame): tf-idf matrix with one row per label (indexed by label)
                wf (ndarray): word counts with one row per document id
                vectorizer (CountVectorizer): the fitted vectorizer that produced `wf`
    """
    # We get the tf-idf: one document per label, made of all its sentences.
    documents = df.groupby(label_column)[sentence_column].agg(" ".join)

    tfidf = TfidfVectorizer().fit_transform(documents).toarray()
    # The rows follow the order of `documents`, so its index is the label of each
    # row (the grouping column is `label_column`, not a hard-coded 'label').
    tfidf = pd.DataFrame(tfidf, index=documents.index.to_list())
    print(tfidf.shape)

    # We get the word freq: one document per id.
    corpus = df.groupby(index_column)[sentence_column].agg(" ".join)

    vectorizer = CountVectorizer()
    wf = vectorizer.fit_transform(corpus).toarray()
    print(wf.shape)

    return tfidf, wf, vectorizer
    

### Functions for Cluster Analysis

In [3]:
def get_top_k_cluster_frequencies(df, groupby, cluster_column, k=5):
    """
    Returns the frequency of clusters grouped by groupby

        Parameters:
            df (DataFrame): Rows to analyze
            groupby: Column (or columns) used to split `df` into groups
            cluster_column (str): Column holding the cluster of each row
            k (int): Number of most frequent clusters kept per group

        Return:
            List with one Series per group (in group order). Each Series is the
            `value_counts()` of `cluster_column` inside the group, cut to its
            first `k` entries.
    """
    # Iterating the groups directly avoids using groupby.apply() only for its
    # side effects and does not need the `include_groups` keyword.
    return [
        group[cluster_column].value_counts().head(k)
        for _, group in df.groupby(groupby)
    ]


def get_appereances(freq_list):
    """
    Counts, for each cluster, in how many of the given frequency tables it shows up.

    Every element of `freq_list` is expected to expose the clusters as its
    `.index`; only the index is read, the counts stored in the table are ignored.
    The result is a Series indexed by cluster.
    """
    counts = {}
    for table in freq_list:
        for cluster_id in table.index.values:
            counts[cluster_id] = counts.get(cluster_id, 0) + 1
    return pd.Series(counts)


def get_sentences_in_cluster(df, sentence_column, cluster_column, cluster):
    """
    Selects the values of `sentence_column` for the rows whose `cluster_column`
    equals `cluster`. The original row labels are kept in the returned Series.
    """
    in_cluster = df[cluster_column] == cluster
    return df.loc[in_cluster, sentence_column]

### Other Useful Functions

In [216]:
def convert_for_training(df, df_norm, num_clusters, kmeans):
    """
    Extract the features and generates a matrix with cv as the rows and frequency of cluster as the columns.

    Every row of `df` is a sentence chunk carrying the id of its cv ('index') and
    the cluster it was assigned to ('cluster'). Each cell of the result is the
    number of chunks of that cv that fell in that cluster, divided by the
    normalization value of the cv.

        Parameters:
            df (DataFrame): Needs the columns 'index' and 'cluster'
            df_norm (Series): Normalization values. When it shares its index with `df`
                (e.g. `df['len']`), the value of each cv is read from the rows of that cv.
                Otherwise it is read by position using the cv id, as this function did before.
            num_clusters (int): Number of columns of the result
            kmeans: Not used; kept so existing calls keep working

        Return:
            DataFrame indexed by cv id (in order of first appearance in `df`) with
            columns 0..num_clusters-1
    """
    keys = df['index'].unique()
    row_of_key = {key: position for position, key in enumerate(keys)}

    counts = np.zeros((len(keys), num_clusters))
    rows = df['index'].map(row_of_key).to_numpy(dtype=int)
    cols = df['cluster'].to_numpy(dtype=int)
    # np.add.at accumulates repeated (row, col) pairs; `counts[rows, cols] += 1`
    # would count each pair only once.
    np.add.at(counts, (rows, cols), 1)

    if isinstance(df_norm, pd.Series) and df_norm.index.equals(df.index):
        # df_norm has one value per sentence row. Indexing it by position with a
        # cv id would pick the value of an unrelated row, so the value is looked
        # up among the rows that belong to each cv instead.
        norms = df_norm.groupby(df['index']).first().reindex(keys).to_numpy()
    else:
        norms = np.array([df_norm.iloc[key] for key in keys])

    return pd.DataFrame(counts / norms.reshape(-1, 1), index=keys)


def get_silhouette_score(df_embeddings, train_index, k_range=None):
    """
    Fits one KMeans per candidate number of clusters on the training embeddings
    and plots the silhouette score obtained by each of them.

        Parameters:
            df_embeddings (DataFrame): Needs the columns 'index' and 'embedding'
            train_index: Ids ('index' values) of the documents used for the fit
            k_range (iterable of int): Candidate numbers of clusters.
                Defaults to range(900, 910), the range that used to be hard-coded.

        Return:
            List with the silhouette score of every value in `k_range`
    """
    if k_range is None:
        k_range = range(900, 910)
    k_values = list(k_range)

    # The list of embeddings does not depend on k, so it is built only once.
    in_train = df_embeddings['index'].isin(train_index)
    embeddings = df_embeddings.loc[in_train, 'embedding'].to_list()

    scores = []
    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=0, n_init=5).fit(embeddings)
        scores.append(silhouette_score(embeddings, kmeans.labels_))

    plt.figure(figsize=(10, 6))
    plt.plot(k_values, scores, marker='o')
    plt.title('Silhouette Method for Optimal k')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Silhouette Score')
    plt.grid(True)
    plt.show()

    return scores



def create_model(df_embeddings, df_cvs, train_index, classifier, n_clusters=30, n_features_to_select=25):
    """
    This function trains a model with the df received as input and returns a model as output

    Steps: cluster the training embeddings with KMedoids (cosine metric), print the
    communities found by `util.community_detection`, turn every cv into a vector of
    normalized cluster frequencies, keep the features selected by RFE over a
    random forest and fit `classifier` on them.

        Parameters:
            df_embeddings (DataFrame): One row per sentence chunk ('index', 'sentence', 'embedding', 'len')
            df_cvs (DataFrame): One row per cv ('index', 'label')
            train_index: Ids ('index' values) of the cvs used for training
            classifier: Unfitted estimator exposing `fit`
            n_clusters (int): Number of clusters
            n_features_to_select (int): Number of cluster features kept by RFE

        Return:
            Tuple (kmeans, lda_model, clf_model, mask): the fitted clustering model,
            a placeholder for the (currently disabled) LDA step, the fitted
            classifier and the boolean list of the selected features

        Side effects:
            Writes the cv/cluster matrix to 'Clusters.csv' and prints the communities.
    """
    # We get the clusters for this dataframe
    X_train = df_embeddings[df_embeddings['index'].isin(train_index)].copy()
    embeddings = X_train['embedding'].to_list()

    kmeans = KMedoids(n_clusters=n_clusters, random_state=0, metric='cosine').fit(embeddings)
    X_train['cluster'] = kmeans.labels_

    clusters = util.community_detection(embeddings, min_community_size=5, threshold=0.5)
    for i, cluster in enumerate(clusters):
        print(f'Cluster {i}')
        for sentence_id in cluster:
            # Double quotes outside: reusing the same quote inside an f-string
            # is a syntax error before Python 3.12.
            print(f"Sentence: {X_train['sentence'].iloc[sentence_id]}")

    # We convert the dataframe for training and remove the values that are not in train_index
    X = convert_for_training(X_train, X_train['len'], n_clusters, kmeans)
    X.to_csv('Clusters.csv')

    # The labels are looked up by cv id, in the order of the rows of X, so that
    # features and targets stay paired whatever the row order of df_cvs is.
    y_train = df_cvs.set_index('index')['label'].loc[X.index]

    estimator = RandomForestClassifier()
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(X, y_train)
    mask = list(selector.support_)

    X_points = np.array(X.loc[:, mask])

    # Train the classifier
    # The LDA projection is currently disabled; this value only keeps the shape
    # of the returned tuple that callers unpack.
    lda_model = 2
    clf_model = classifier.fit(X_points, y_train)

    return kmeans, lda_model, clf_model, mask


def test_model(df_embeddings, df_cvs, test_index, kmeans, lda_model, clf_model, mask, n_clusters):
    """
    This function evaluates a model

    The test embeddings are assigned to the clusters of the fitted `kmeans`, every
    cv is turned into a vector of normalized cluster frequencies, the features in
    `mask` are kept and the classifier is scored on them.

        Parameters:
            df_embeddings (DataFrame): One row per sentence chunk ('index', 'embedding', 'len')
            df_cvs (DataFrame): One row per cv ('index', 'label')
            test_index: Ids ('index' values) of the cvs used for testing
            kmeans: Fitted clustering model exposing `predict`
            lda_model: Not used; kept so existing calls keep working
            clf_model: Fitted classifier exposing `predict`, `predict_proba` and `score`
            mask: Boolean selection of the cluster features the classifier was trained on
            n_clusters (int): Number of clusters of `kmeans`

        Return:
            The value returned by `clf_model.score` on the test cvs. The true labels,
            the predicted probabilities and the predictions are printed.
    """
    X_test = df_embeddings[df_embeddings['index'].isin(test_index)].copy()
    X_test['cluster'] = kmeans.predict(X_test['embedding'].to_list())

    # We convert the dataframe for testing and keep only the selected features
    X = convert_for_training(X_test, X_test['len'], n_clusters, kmeans)
    X_points = np.array(X.loc[:, mask])

    # The labels are looked up by cv id, in the order of the rows of X, so that
    # features and targets stay paired whatever the row order of df_cvs is.
    y_test = df_cvs.set_index('index')['label'].loc[X.index]

    # Test the classifier
    print(y_test)
    print(clf_model.predict_proba(X_points))
    print(clf_model.predict(X_points))
    score = clf_model.score(X_points, y_test)

    return score
    


### AutoEncoder functions

In [148]:
import tensorflow as tf
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model
from sklearn.metrics import accuracy_score, precision_score, recall_score
from keras import regularizers

In [162]:
class AutoEncoder(Model):
    """
    Dense auto-encoder with a 256-128-64-latent_dim encoder and a mirrored decoder.

    Every Dense layer uses L2(0.001) kernel regularization and every hidden layer
    is followed by Dropout(0.2).

        Parameters:
            latent_dim (int): Size of the encoded representation
            output_dim (int): Size of the reconstructed vector (384 by default, the
                size of the embeddings used in this notebook)
            output_activation (str): Activation of the last decoder layer. It is
                'linear' by default: the embeddings contain negative components,
                which a sigmoid output (range 0..1) can never reproduce.
    """
    def __init__(self, latent_dim, output_dim=384, output_activation='linear'):
        super(AutoEncoder, self).__init__()
        self.latent_dim = latent_dim
        self.encoder = tf.keras.Sequential([
            self._dense(256),
            layers.Dropout(0.2),
            self._dense(128),
            layers.Dropout(0.2),
            self._dense(64),
            layers.Dropout(0.2),
            self._dense(latent_dim)
        ])
        self.decoder = tf.keras.Sequential([
            self._dense(64),
            layers.Dropout(0.2),
            self._dense(128),
            layers.Dropout(0.2),
            self._dense(256),
            layers.Dropout(0.2),
            self._dense(output_dim, activation=output_activation)
        ])

    @staticmethod
    def _dense(units, activation='relu'):
        """Dense layer with the L2 regularization shared by all the layers of the model."""
        return layers.Dense(units, activation=activation, kernel_regularizer=regularizers.L2(0.001))

    def call(self, x):
        """Encodes `x` and returns its reconstruction."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded
    
def build_autoencoder(latent_dim):
    """
    Creates an AutoEncoder with the given latent size and compiles it with the
    'adam' optimizer and a mean squared error loss.
    """
    model = AutoEncoder(latent_dim)
    reconstruction_loss = losses.MeanSquaredError()
    model.compile(optimizer='adam', loss=reconstruction_loss)
    return model

## Analysis

### Read and Explore Data

We read the data with the `load_data` function. This function already applies some pre-processing to the text: it removes non-English characters and numbers, removes stop words, and applies lemmatization.

In [5]:
# Folder with the translated CVs. It can be overridden with the CV_DATA_DIR
# environment variable so the notebook does not depend on one machine's layout.
DATA_DIR = os.environ.get('CV_DATA_DIR', 'D:/Github/CVClassifier/translated_files')

# load_data returns plain lists of records; df_cvs is turned into a DataFrame in
# a later cell, the sentence-level records are converted here.
embedding_records, df_cvs, _ = load_data(DATA_DIR, num_words=7)
df_embeddings = pd.DataFrame(embedding_records)
df_embeddings.head()

Processing cv: 1
Processing cv: 2
Processing cv: 3
Processing cv: 4
Processing cv: 5
Processing cv: 6
Processing cv: 7
Processing cv: 8
Processing cv: 9
Processing cv: 10
Processing cv: 11
Processing cv: 12
Processing cv: 13
Processing cv: 14
Processing cv: 15
Processing cv: 16
Processing cv: 17
Processing cv: 18
Processing cv: 19
Processing cv: 20
Processing cv: 21
Processing cv: 22
Processing cv: 23
Processing cv: 24
Processing cv: 25
Processing cv: 26
Processing cv: 27
Processing cv: 28
Processing cv: 29
Processing cv: 30
Processing cv: 31
Processing cv: 32
Processing cv: 33
Processing cv: 34
Processing cv: 35


Unnamed: 0,label,index,sentence,embedding,len
0,director,0,present phone mail lin ay marital status,"[-0.14330845, 0.024291335, 0.06727925, -0.0175...",22
1,director,0,married date birth place birth experience new,"[-0.0119974855, 0.029411308, 0.038155872, 0.04...",22
2,director,0,business manager coca cola identify help increase,"[-0.038613867, -0.01384318, 0.0017367965, -0.0...",22
3,director,0,relevance dairy category within company develo...,"[0.05912432, -0.054247126, -0.024985064, 0.019...",22
4,director,0,guarantee compliance main volume continuous im...,"[-0.030106345, -0.028742226, 0.066614576, -0.1...",22


In [6]:
df_embeddings.tail()

Unnamed: 0,label,index,sentence,embedding,len
1697,gerente,34,education national polytechnic institute profe...,"[-0.11809396, -0.0056829993, 0.010561999, -0.0...",22
1698,gerente,34,social administrative university graduated ind...,"[-0.048630904, -0.020485165, -0.033911493, 0.0...",22
1699,gerente,34,eight without delay center scientific technolo...,"[-0.052343454, -0.03339269, -0.0046027955, 0.0...",22
1700,gerente,34,bachelor degree technological bachelor industr...,"[-0.009750923, 0.018651687, 0.02723899, -0.008...",22
1701,gerente,34,intermediate resume link source,"[-0.11952475, -0.03218225, -0.045175284, 0.068...",22


In [7]:
# df_cvs is the list of per-document records returned by load_data
# ('label', 'index', 'text'); it is rebound here to a DataFrame.
df_cvs = pd.DataFrame(df_cvs)
df_cvs.head()

Unnamed: 0,label,index,text
0,director,0,present phone mail lin ay marital status marri...
1,director,1,director supply chain san col san de education...
2,director,2,mobile phone yahoo carrier summary senior exec...
3,director,3,link competence profile leadership experience ...
4,director,4,general director commercial vice president mar...


We encode the string labels as integers so that they can be used as targets by the classifiers.

In [8]:
from sklearn import preprocessing

# The encoder is fitted once, on the per-cv table, and the same mapping is then
# applied to both frames. Fitting it separately on each frame would only give
# the same codes as long as both happen to contain exactly the same labels.
label_encoder = preprocessing.LabelEncoder().fit(df_cvs['label'])

df_embeddings['label'] = label_encoder.transform(df_embeddings['label'])
df_cvs['label'] = label_encoder.transform(df_cvs['label'])

In [9]:
df_embeddings.head()

Unnamed: 0,label,index,sentence,embedding,len
0,0,0,present phone mail lin ay marital status,"[-0.14330845, 0.024291335, 0.06727925, -0.0175...",22
1,0,0,married date birth place birth experience new,"[-0.0119974855, 0.029411308, 0.038155872, 0.04...",22
2,0,0,business manager coca cola identify help increase,"[-0.038613867, -0.01384318, 0.0017367965, -0.0...",22
3,0,0,relevance dairy category within company develo...,"[0.05912432, -0.054247126, -0.024985064, 0.019...",22
4,0,0,guarantee compliance main volume continuous im...,"[-0.030106345, -0.028742226, 0.066614576, -0.1...",22


In [10]:
df_embeddings.tail()

Unnamed: 0,label,index,sentence,embedding,len
1697,2,34,education national polytechnic institute profe...,"[-0.11809396, -0.0056829993, 0.010561999, -0.0...",22
1698,2,34,social administrative university graduated ind...,"[-0.048630904, -0.020485165, -0.033911493, 0.0...",22
1699,2,34,eight without delay center scientific technolo...,"[-0.052343454, -0.03339269, -0.0046027955, 0.0...",22
1700,2,34,bachelor degree technological bachelor industr...,"[-0.009750923, 0.018651687, 0.02723899, -0.008...",22
1701,2,34,intermediate resume link source,"[-0.11952475, -0.03218225, -0.045175284, 0.068...",22


In [11]:
df_cvs.head()

Unnamed: 0,label,index,text
0,0,0,present phone mail lin ay marital status marri...
1,0,1,director supply chain san col san de education...
2,0,2,mobile phone yahoo carrier summary senior exec...
3,0,3,link competence profile leadership experience ...
4,0,4,general director commercial vice president mar...


In [12]:
df_cvs.tail()

Unnamed: 0,label,index,text
30,2,30,maria summary experience education training in...
31,2,31,address la col inn home phone cell phone outlo...
32,2,32,chief technology architecture innovation compu...
33,2,33,pablo sur city zip code academic background ma...
34,2,34,sa male hidalgo city ing yahoo professional ex...


In [16]:
df_embeddings['embedding'].apply(lambda x: x.shape)

0       (384,)
1       (384,)
2       (384,)
3       (384,)
4       (384,)
         ...  
1697    (384,)
1698    (384,)
1699    (384,)
1700    (384,)
1701    (384,)
Name: embedding, Length: 1702, dtype: object

### Model

In [None]:
# Stack the per-sentence embeddings of every document into a 2-D array and train
# an auto-encoder with a latent size of 32 to reconstruct them.
# NOTE(review): this uses all the documents, not only a training split.
x_train = np.array(df_embeddings['embedding'].to_list())
autoencoder = build_autoencoder(32)
autoencoder.fit(x_train, x_train, epochs=10, shuffle=True, )

In [168]:
from sklearn import preprocessing

# KMeans with its default parameters on the normalized embeddings of all the
# sentences.
# NOTE(review): no random_state is set, so the labels can change between runs;
# `kmeans` is rebound by later cells.
kmeans = KMeans().fit(preprocessing.normalize(np.array(df_embeddings['embedding'].to_list())))
kmeans.labels_

array([6, 6, 2, ..., 6, 6, 6])

In [217]:
from sklearn.model_selection import StratifiedKFold

N_SPLITS = 5
N_CLUSTERS = 30

# A fixed random_state makes the folds, and therefore the scores, repeatable.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=0)

indexes = df_cvs['index']
labels = df_cvs['label']

scores = []
for i, (train_pos, test_pos) in enumerate(skf.split(indexes, labels)):
    # split() yields row positions; they are translated to cv ids because
    # create_model/test_model select the rows by their 'index' value.
    train_index = indexes.iloc[train_pos].to_numpy()
    test_index = indexes.iloc[test_pos].to_numpy()

    print(f'Test {i}')
    kmeans, lda_model, clf_model, mask = create_model(df_embeddings, df_cvs, train_index, RandomForestClassifier(), n_clusters=N_CLUSTERS)
    score = test_model(df_embeddings, df_cvs, test_index, kmeans=kmeans, lda_model=lda_model, clf_model=clf_model, mask=mask, n_clusters=N_CLUSTERS)
    print(score)
    scores.append(score)

# Every fold is evaluated and the mean is taken over the folds that were run
# (the loop used to stop after the first fold and always reported 0).
mean_score = np.mean(scores)
print('Mean Score')
print(mean_score)

# Fixed split reused by the cluster analysis cells further down.
train_index = [ 1,  2,  3,  4,  5,  7,  8, 10, 11, 12, 13, 14, 15, 16, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33]
test_index = [ 0,  6,  9, 17, 20, 32, 34]


Test 0
Cluster 0
Sentence: operation corresponding quality area compliance quality engineer
Sentence: august quality engineer ensure regulatory compliance applicable
Sentence: continuous improvement comply according management system quality
Sentence: quality assurance inspector period ensure compliance inspection
Sentence: guarantee compliance main volume continuous improvement business
Sentence: legal compliance quality professional process improvement quality
Sentence: implementation management change safety quality certification international
Sentence: within area support quality product analysis finished
Sentence: operation optimization well quality control maintenance carried
Sentence: compliance safety hygiene safety quality throughout process
Sentence: implementation improvement ensure compliance within provided main
Sentence: risk analysis participate improvement quality control corrective
Sentence: quality period may patient safety officer quality
Sentence: quality control im

Let's analyze the clusters.

In [129]:
# Working copy of the sentence-level frame, used to cluster all the documents at once.
df_all = df_embeddings.copy()
# NOTE(review): this expression is not the last one of the cell, so its value is not displayed.
df_all.shape

# Fixed train/test split (cv ids) used by the cells below.
train_index = [ 1,  2,  3,  4,  5,  7,  8, 10, 11, 12, 13, 14, 15, 16, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33]
test_index = [ 0,  6,  9, 17, 20, 32, 34]

In [130]:
# Cluster the sentences of every document (train and test together) into 30 groups
# and store the assigned cluster next to each sentence.
kmeans = KMeans(n_clusters=30, random_state=0, n_init='auto').fit(df_all['embedding'].to_list())
df_all['cluster'] = kmeans.labels_

In [131]:
df_all.head()

Unnamed: 0,label,index,sentence,embedding,len,cluster
0,0,0,present phone mail lin ay marital status,"[-0.14330845, 0.024291335, 0.06727925, -0.0175...",22,10
1,0,0,married date birth place birth experience new,"[-0.0119974855, 0.029411308, 0.038155872, 0.04...",22,10
2,0,0,business manager coca cola identify help increase,"[-0.038613867, -0.01384318, 0.0017367965, -0.0...",22,6
3,0,0,relevance dairy category within company develo...,"[0.05912432, -0.054247126, -0.024985064, 0.019...",22,18
4,0,0,guarantee compliance main volume continuous im...,"[-0.030106345, -0.028742226, 0.066614576, -0.1...",22,20


In [132]:
# Split the sentence-level frame by cv id; the copies keep the later 'cluster'
# assignments from touching df_embeddings.
df_train = df_embeddings[df_embeddings['index'].isin(train_index)].copy()
df_test = df_embeddings[df_embeddings['index'].isin(test_index)].copy()

In [133]:
# Refit the clustering on the training sentences only. This rebinds `kmeans`,
# which previously held the model fitted on df_all.
kmeans = KMeans(n_clusters=30, random_state=0, n_init='auto').fit(df_train['embedding'].to_list())
df_train['cluster'] = kmeans.labels_

In [134]:
# Assign every test sentence to a cluster of the current `kmeans` model.
df_test['cluster'] = kmeans.predict(df_test['embedding'].to_list())
df_test

Unnamed: 0,label,index,sentence,embedding,len,cluster
0,0,0,present phone mail lin ay marital status,"[-0.14330845, 0.024291335, 0.06727925, -0.0175...",22,27
1,0,0,married date birth place birth experience new,"[-0.0119974855, 0.029411308, 0.038155872, 0.04...",22,27
2,0,0,business manager coca cola identify help increase,"[-0.038613867, -0.01384318, 0.0017367965, -0.0...",22,3
3,0,0,relevance dairy category within company develo...,"[0.05912432, -0.054247126, -0.024985064, 0.019...",22,3
4,0,0,guarantee compliance main volume continuous im...,"[-0.030106345, -0.028742226, 0.066614576, -0.1...",22,24
...,...,...,...,...,...,...
1697,2,34,education national polytechnic institute profe...,"[-0.11809396, -0.0056829993, 0.010561999, -0.0...",22,27
1698,2,34,social administrative university graduated ind...,"[-0.048630904, -0.020485165, -0.033911493, 0.0...",22,27
1699,2,34,eight without delay center scientific technolo...,"[-0.052343454, -0.03339269, -0.0046027955, 0.0...",22,27
1700,2,34,bachelor degree technological bachelor industr...,"[-0.009750923, 0.018651687, 0.02723899, -0.008...",22,27


In [135]:
# Clusters assigned to the sentences of cv 6 in df_all (first line) and in
# df_test (second line). The ids come from different fits, so equal numbers do
# not necessarily denote the same cluster.
print(df_all[df_all['index'] == 6]['cluster'].to_list())
print(df_test[df_test['index'] == 6]['cluster'].to_list())

[24, 14, 17, 2, 17, 17, 14, 24, 15, 23, 17, 17, 26, 17, 23, 17, 26, 6, 17, 3, 7, 17, 17, 17, 2, 17, 17, 7, 17, 17, 14, 12, 17, 17, 17, 14, 17, 12, 1, 13, 9, 0, 14, 5, 5, 0, 8]
[20, 2, 2, 28, 21, 2, 20, 20, 1, 15, 2, 21, 14, 14, 28, 2, 14, 3, 2, 14, 11, 2, 2, 2, 2, 2, 2, 11, 15, 14, 20, 21, 2, 2, 3, 3, 21, 21, 2, 9, 6, 27, 16, 16, 5, 1, 4]


In [142]:
kmeans.cluster_centers_.shape

(30, 384)

In [158]:
# Embeddings of the sentences of cv 6 (taken from the test split).
embd = df_test[df_test['index'] == 6]['embedding'].to_list()

In [160]:
# Distance from every vector in `embd` (rows) to every cluster center (columns), saved as CSV.
pd.DataFrame(distance_matrix(embd, kmeans.cluster_centers_)).to_csv('Cluster-Distance.csv')

In [26]:
# Embeddings of the sentences of cv 0 (taken from the test split).
embeddings = df_test[df_test['index'] == 0]['embedding'].to_list()

In [27]:
# Distance from every cluster center (rows) to every vector in `embeddings` (columns).
# The frame is not called `distance` because that name is the module imported
# from scipy.spatial at the top of the notebook; rebinding it would break any
# later use of that module.
center_distances = pd.DataFrame(distance_matrix(kmeans.cluster_centers_, embeddings))
center_distances.to_csv('Distance.csv')

In [28]:
# Save the training sentences assigned to cluster 3 for manual inspection.
get_sentences_in_cluster(df_train, 'sentence', 'cluster', 3).to_csv('cluster_3.csv')

In [24]:
df_test[df_test['index'] == 0]

Unnamed: 0,label,index,sentence,embedding,len,cluster
0,0,0,present phone mail lin ay marital status,"[-0.14330845, 0.024291335, 0.06727925, -0.0175...",22,27
1,0,0,married date birth place birth experience new,"[-0.0119974855, 0.029411308, 0.038155872, 0.04...",22,27
2,0,0,business manager coca cola identify help increase,"[-0.038613867, -0.01384318, 0.0017367965, -0.0...",22,3
3,0,0,relevance dairy category within company develo...,"[0.05912432, -0.054247126, -0.024985064, 0.019...",22,3
4,0,0,guarantee compliance main volume continuous im...,"[-0.030106345, -0.028742226, 0.066614576, -0.1...",22,24
5,0,0,supply chain conjunction seeking increase prof...,"[-0.017119452, 0.010646724, 0.042245988, -0.06...",22,2
6,0,0,innovation dairy together participation proces...,"[0.08193664, -0.100071564, -0.021429706, -0.03...",22,11
7,0,0,stage potential dairy marketing manager group ...,"[0.060638957, -0.11005124, 0.008017912, 0.0664...",22,20
8,0,0,short term guarantee compliance evaluation dev...,"[-0.08696697, 0.037862867, 0.027878024, -0.034...",22,24
9,0,0,term guarantee different area responsible mark...,"[0.02286979, -0.088071205, -0.05835017, -0.029...",22,3


In [25]:
df_test[df_test['index'] == 0]['cluster'].value_counts()

cluster
3     4
24    4
20    4
16    3
27    2
11    2
2     1
28    1
15    1
Name: count, dtype: int64

In [35]:
def top_cluster_appereances(df, label, top=10):
    """
    For the cvs of `df` with the given encoded label, counts in how many cvs each
    cluster is among the most frequent ones (see get_top_k_cluster_frequencies)
    and returns the `top` clusters with the highest count.
    """
    cluster_freq = get_top_k_cluster_frequencies(df[df['label'] == label], groupby='index', cluster_column='cluster')
    return get_appereances(cluster_freq).sort_values(ascending=False).head(top)


# Encoded labels used below: 0 for director, 2 for gerente, 1 for especialista.
director_appereances = top_cluster_appereances(df_train, 0)
gerente_appereances = top_cluster_appereances(df_train, 2)
especialista_appereances = top_cluster_appereances(df_train, 1)

for title, appereances in (
    ('Director Appereances', director_appereances),
    ('Gerente Appereances', gerente_appereances),
    ('Especialista Appereances', especialista_appereances),
):
    print(title)
    print(appereances)

Director Appereances
3     4
11    4
21    3
28    3
16    3
20    3
1     2
19    2
14    2
2     2
dtype: int64
Gerente Appereances
28    5
8     4
24    4
2     3
13    3
1     3
4     3
27    3
20    3
22    2
dtype: int64
Especialista Appereances
1     8
27    7
0     5
18    5
8     4
6     3
12    3
9     2
21    2
25    2
dtype: int64


In [27]:
# Clusters that most often rank among the most frequent ones of a cv with
# label 0, computed on df_all; only the first 5 are kept.
cluster_freq = get_top_k_cluster_frequencies(
    df_all[df_all['label'] == 0],
    groupby='index',
    cluster_column='cluster',
)
director_appereances = (
    get_appereances(cluster_freq)
    .sort_values(ascending=False)
    .head(5)
)
print('Director Appereances')
print(director_appereances)

Director Appereances
14    7
18    5
24    5
3     4
6     4
dtype: int64


In [28]:
df_all[df_all['index'] == 0]['cluster'].value_counts()

cluster
18    6
20    4
10    2
3     2
14    2
5     2
6     1
16    1
26    1
12    1
Name: count, dtype: int64

In [34]:
# Pairwise distances between the cluster centers of the current `kmeans` model, saved as CSV.
pd.DataFrame(distance_matrix(kmeans.cluster_centers_, kmeans.cluster_centers_)).to_csv('CentersDistanceMatrix.csv')

In [42]:
# Per-sentence silhouette values of the training embeddings, saved as CSV.
# NOTE(review): assumes `kmeans` is the model fitted on df_train, so that its
# labels line up with the rows of df_train — confirm the execution order.
pd.DataFrame(silhouette_samples(df_train['embedding'].to_list(), kmeans.labels_)).to_csv('Scores.csv')