## Set up

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.cluster import KMeans
from sklearn.metrics import homogeneity_score, completeness_score, adjusted_rand_score, calinski_harabasz_score
import pickle

In [2]:
# Global random seed, defined once so stochastic steps are reproducible.
# In this notebook it is passed to fit_clusters, which forwards it to
# KMeans as random_state.
random_seed = 466

## Load data

In [3]:
def load_unsupervised_tweet_df():
    """
    Read the unsupervised tweet CSV into a dataframe.

    The file location is fixed (relative to the working directory), so the
    notebook must be run from a directory where that relative path resolves.
    """
    return pd.read_csv('../data/derived/tweets_unsupervised.csv')

def load_unsupervised_tweet_vectors(vector_name):
    """
    Read a saved sparse matrix of tweet vectors from its .npz file.

    `vector_name` is interpolated into the fixed filename pattern
    (e.g. 'count' or 'tfidf', as used in this notebook).
    """
    npz_path = f'../data/derived/vectors/vector{vector_name}_unsupervised.npz'
    return scipy.sparse.load_npz(npz_path)

In [4]:
# load the tweet dataframe, then the count and tf-idf vectors (in that order)
tweet_df = load_unsupervised_tweet_df()
count_vectors, tfidf_vectors = (
    load_unsupervised_tweet_vectors(name) for name in ('count', 'tfidf')
)

In [5]:
# find labelled records: rows whose label is non-zero are treated as labelled.
# The positions are computed with a vectorized mask rather than a Python loop
# over tweet_df['label'][i]; that lookup is label-based, whereas the positions
# are consumed positionally below (.iloc and sparse row slicing).
labelled_idx = np.flatnonzero(tweet_df['label'].to_numpy() != 0).tolist()

# subset the dataframe, labels and both vector matrices to the labelled rows
labelled_tweet_df = tweet_df.iloc[labelled_idx]
labels = labelled_tweet_df['label']
labelled_count_vectors = count_vectors[labelled_idx, :]
labelled_tfidf_vectors = tfidf_vectors[labelled_idx, :]

## Fit clusters

In [11]:
def fit_clusters(num_clusters, vectors, random_seed, vector_name):
    """
    Fit KMeans with provided number of clusters and vectors, and pickle the fit model

    Parameters
    ----------
    num_clusters : number of clusters, passed to KMeans as n_clusters
    vectors      : data passed to KMeans.fit
    random_seed  : passed to KMeans as random_state
    vector_name  : name of the vector type; used only to build the output filename

    Returns
    -------
    The fit KMeans object (the same object that is written to disk)
    """
    # initialize k-means
    kmeans = KMeans(n_clusters = num_clusters, random_state = random_seed)

    # train k-means
    kmeans.fit(vectors)

    # write to file; the context manager guarantees the handle is flushed and
    # closed even if pickling raises
    filepath_out = f'../data/derived/models/kmeans_vector{vector_name}_clusters{num_clusters}.pkl'
    with open(filepath_out, 'wb') as file_out:
        pickle.dump(kmeans, file_out)

    return kmeans

In [12]:
kmeans5count = fit_clusters(
    num_clusters=5,
    vectors=count_vectors,
    random_seed=random_seed,
    vector_name='count',
)

## Predict clusters

In [14]:
def predict_clusters(kmeans, vectors, df, dataset_name, vector_name):
    """
    Assign each row of `vectors` to a cluster using an already-fit K-Means model.

    A CSV with columns tweet_id, label and prediction is written to the
    predictions directory; the raw output of `kmeans.predict` is returned.
    """
    # generate predictions
    cluster_ids = kmeans.predict(X=vectors)

    # pair record IDs and labels from `df` with the predicted cluster
    columns = {
        'tweet_id': df['tweet_id'],
        'label': df['label'],
        'prediction': cluster_ids,
    }
    results_df = pd.DataFrame(data=columns)

    # write dataframe to file
    out_path = f'../data/derived/predictions/kmeans_vector{vector_name}_clusters{kmeans.n_clusters}_{dataset_name}.csv'
    results_df.to_csv(out_path, index=False)

    return cluster_ids

In [16]:
all_count_predictions = predict_clusters(
    kmeans=kmeans5count,
    vectors=count_vectors,
    df=tweet_df,
    dataset_name='all',
    vector_name='count',
)
labelled_count_predictions = predict_clusters(
    kmeans=kmeans5count,
    vectors=labelled_count_vectors,
    df=labelled_tweet_df,
    dataset_name='labelled',
    vector_name='count',
)

## Evaluate clusters

In [19]:
def evaluate_clusters(labels, labelled_predictions, all_vectors, all_predictions, num_clusters, vector_name):
    """
    Calculate homogeneity, completeness, adjusted Rand, and Calinski Harabasz scores

    The first three scores compare `labels` with `labelled_predictions`; the
    Calinski Harabasz score is computed from `all_vectors` (converted to a
    dense array) and `all_predictions`. The scores are written to a CSV and
    returned as a two-column dataframe (metric, value).
    """
    # calculate metrics, keyed by name in the order they are reported
    scores = {
        'homogeneity': homogeneity_score(labels, labelled_predictions),
        'completeness': completeness_score(labels, labelled_predictions),
        'adjusted_rand': adjusted_rand_score(labels, labelled_predictions),
        'calinski_harabasz': calinski_harabasz_score(all_vectors.toarray(), all_predictions),
    }

    # one row per metric
    rows = list(scores.items())
    metrics_df = pd.DataFrame(data=rows, columns=['metric','value'])

    # write dataframe to CSV
    out_path = f'../data/derived/performance/kmeans_vector{vector_name}_clusters{num_clusters}.csv'
    metrics_df.to_csv(path_or_buf=out_path, index=False)

    return metrics_df

In [20]:
evaluate_clusters(
    labels=labels,
    labelled_predictions=labelled_count_predictions,
    all_vectors=count_vectors,
    all_predictions=all_count_predictions,
    num_clusters=5,
    vector_name='count',
)

Unnamed: 0,metric,value
0,homogeneity,0.288421
1,completeness,0.096357
2,adjusted_rand,0.089159
3,calinski_harabasz,101.269099
