## Set up

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.cluster import KMeans
from sklearn.metrics import homogeneity_score, completeness_score, adjusted_rand_score, calinski_harabasz_score
import pickle

In [2]:
# global random seed; the driver loop passes it to kmeans(), which forwards it
# to KMeans(random_state=...) so every fit in this notebook uses the same seed
random_seed = 466

## Load data

In [3]:
def load_unsupervised_tweet_df():
    """
    Read the unsupervised tweet CSV from its fixed location
    (../data/derived/tweets_unsupervised.csv, relative to the working
    directory) and return it as a pandas DataFrame
    """
    return pd.read_csv('../data/derived/tweets_unsupervised.csv')

def load_unsupervised_tweet_vectors(vector_name):
    """
    Read the sparse tweet-vector matrix saved for the given vector name.

    The file is expected at
    ../data/derived/vectors/vector<vector_name>_unsupervised.npz
    (relative to the working directory); the matrix is returned as loaded
    by scipy.sparse.load_npz
    """
    return scipy.sparse.load_npz(
        f'../data/derived/vectors/vector{vector_name}_unsupervised.npz'
    )

## Fit clusters

In [4]:
def fit_clusters(num_clusters, vectors, random_seed, vector_name):
    """
    Fit KMeans with provided number of clusters and vectors, and pickle the
    fitted model to disk.

    Parameters:
        num_clusters: number of clusters, passed to KMeans as n_clusters
        vectors:      feature matrix the model is fit on
        random_seed:  passed to KMeans as random_state
        vector_name:  name of the vectorization; only used to build the
                      output filename

    Returns:
        the fitted KMeans estimator

    Side effects:
        writes the fitted estimator to
        ../data/derived/models/kmeans_vector<vector_name>_clusters<num_clusters>.pkl
    """
    # initialize k-means
    kmeans = KMeans(n_clusters=num_clusters, random_state=random_seed)

    # train k-means
    kmeans.fit(vectors)

    # write to file; the context manager guarantees the handle is flushed and
    # closed even if pickling raises (the bare open() previously leaked it)
    filepath_out = f'../data/derived/models/kmeans_vector{vector_name}_clusters{num_clusters}.pkl'
    with open(filepath_out, 'wb') as file_out:
        pickle.dump(kmeans, file_out)

    return kmeans

## Predict clusters

In [5]:
def predict_clusters(kmeans, vectors, df, vector_name):
    """
    Assign every row of `vectors` to a cluster using an already-fit K-Means
    model, save the assignments next to the tweet IDs and labels, and return
    the assignments.

    The CSV is written to
    ../data/derived/predictions/kmeans_vector<vector_name>_clusters<n_clusters>.csv
    with columns tweet_id, label, prediction (no index column)
    """
    # generate predictions
    predictions = kmeans.predict(X=vectors)

    # pair record IDs and labels from the input frame with the predictions
    output_columns = {
        'tweet_id': df['tweet_id'],
        'label': df['label'],
        'prediction': predictions,
    }
    output_df = pd.DataFrame(data=output_columns)

    # persist the table
    output_path = (
        f'../data/derived/predictions/kmeans_vector{vector_name}_clusters{kmeans.n_clusters}.csv'
    )
    output_df.to_csv(output_path, index=False)

    return predictions

## Evaluate clusters

In [6]:
def evaluate_clusters(vectors, predictions, labels, subset_predictions, num_clusters, vector_name):
    """
    Score a clustering and save the scores.

    Calinski-Harabasz is computed from the densified `vectors` and the full
    `predictions`; homogeneity, completeness, and adjusted Rand compare
    `labels` against `subset_predictions`.

    The scores are written to
    ../data/derived/performance/kmeans_vector<vector_name>_clusters<num_clusters>.csv
    as a two-column (metric, value) table, which is also returned
    """
    # Calinski-Harabasz is evaluated first, on the dense copy of the vectors
    ch_score = calinski_harabasz_score(vectors.toarray(), predictions)

    # label-based scores, collected in the order they appear in the output
    scores = {
        'homogeneity': homogeneity_score(labels, subset_predictions),
        'completeness': completeness_score(labels, subset_predictions),
        'adjusted_rand': adjusted_rand_score(labels, subset_predictions),
        'calinski_harabasz': ch_score,
    }

    # one row per metric
    metrics_df = pd.DataFrame(list(scores.items()), columns=['metric', 'value'])

    # save the table without the index column
    output_path = f'../data/derived/performance/kmeans_vector{vector_name}_clusters{num_clusters}.csv'
    metrics_df.to_csv(output_path, index=False)

    return metrics_df

## Fit, predict, and evaluate in one function

In [7]:
def kmeans(vector_name, num_clusters, random_seed):
    """
    Fit, predict, and evaluate K-Means for one vectorization and one number
    of clusters.

    Parameters:
        vector_name:  name of the vectorization to load (e.g. 'count')
        num_clusters: number of clusters to fit
        random_seed:  seed forwarded to fit_clusters

    Returns:
        None; the model, predictions, and metrics are written to disk by
        fit_clusters, predict_clusters, and evaluate_clusters respectively
    """
    # load dataframe and vectors
    tweet_df = load_unsupervised_tweet_df()
    vectors = load_unsupervised_tweet_vectors(vector_name)

    # fit kmeans (local is named `model` so it does not shadow this function)
    model = fit_clusters(num_clusters, vectors, random_seed, vector_name)

    # generate predictions for all tweets
    predictions = predict_clusters(model, vectors, tweet_df, vector_name)

    # rows whose label is not 0 are treated as labelled; select them with a
    # positional boolean mask rather than a Python loop over index labels, so
    # the selection is vectorized and does not depend on the frame's index
    is_labelled = (tweet_df['label'] != 0).to_numpy()
    subset_predictions = predictions[is_labelled]
    labels = tweet_df.loc[is_labelled, 'label']

    # evaluate clusters (ground-truth metrics use only the labelled subset)
    evaluate_clusters(vectors, predictions, labels, subset_predictions, num_clusters, vector_name)

## Fit models with multiple vectors and number of clusters

In [8]:
# cluster counts to try: 2 through 10 inclusive
num_clusters_list = list(range(2, 11))

# names of the tweet vectorizations to cluster
vector_name_list = ['count', 'tfidf']

In [9]:
# run the full fit / predict / evaluate pipeline for every combination of
# cluster count and vectorization; each kmeans() call writes its own model,
# predictions, and metrics files
# iterate over number of clusters
for num_clusters in num_clusters_list:
    
    # iterate over list of vector names
    for vector_name in vector_name_list:
        
        # fit K-Means (same random_seed for every combination)
        kmeans(vector_name, num_clusters, random_seed)
        # progress message, printed once the combination has finished
        print(f'K-Means with num_clusters {num_clusters} and vector {vector_name} complete.')

K-Means with num_clusters 2 and vector count complete.
K-Means with num_clusters 2 and vector tfidf complete.
K-Means with num_clusters 3 and vector count complete.
K-Means with num_clusters 3 and vector tfidf complete.
K-Means with num_clusters 4 and vector count complete.
K-Means with num_clusters 4 and vector tfidf complete.
K-Means with num_clusters 5 and vector count complete.
K-Means with num_clusters 5 and vector tfidf complete.
K-Means with num_clusters 6 and vector count complete.
K-Means with num_clusters 6 and vector tfidf complete.
K-Means with num_clusters 7 and vector count complete.
K-Means with num_clusters 7 and vector tfidf complete.
K-Means with num_clusters 8 and vector count complete.
K-Means with num_clusters 8 and vector tfidf complete.
K-Means with num_clusters 9 and vector count complete.
K-Means with num_clusters 9 and vector tfidf complete.
K-Means with num_clusters 10 and vector count complete.
K-Means with num_clusters 10 and vector tfidf complete.
