In [1]:
import os
import pandas as pd
import sklearn.cluster as skclust
from sklearn import preprocessing
import numpy as np
import pickle
from sklearn import metrics

# Cluster the user profiles with k-means several times (one random seed per
# run), score every run with the Silhouette coefficient, and write the labels
# of each run to its own CSV file next to the input data.

# --- Configuration ---
data_path = os.path.abspath('../') + '/data/davos/' # Change this to path to your data folder.
file_name = 'user_cs_profile' # Name of the CSV file (input).
csv_ext = '.csv'

k = 2          # Number of k-means clusters.
num_seeds = 5  # Number of independent k-means runs.

file_path = os.path.join(data_path, file_name + csv_ext)
print("File path: ", file_path)

# --- Load and standardise the feature matrix ---
data = pd.read_csv(file_path, index_col=0)
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# to_numpy() returns the same ndarray of the frame's values.
X = data.to_numpy()
X = preprocessing.scale(X, axis=0)  # Standardise each column (feature).
print("Data matrix size: ", X.shape)

# --- Run k-means once per seed ---
sil_scores = np.zeros(num_seeds)
for seed in range(num_seeds):
    # Get k-means clusters.
    # NOTE: the seed is drawn from NumPy's unseeded global RNG, so the runs are
    # not reproducible across executions, and two iterations may draw the same
    # value (in which case the later run overwrites the earlier output file).
    random_state = np.random.randint(1000)
    est = skclust.KMeans(n_clusters=k, random_state=random_state)
    est.fit(X)
    labels = est.labels_

    # Get Silhouette score for evaluation (between -1 and 1; the higher the
    # better, values near 0 indicate overlapping clusters).
    sil_scores[seed] = metrics.silhouette_score(X, labels, metric='euclidean')

    # Write labels to a csv file. X was extracted before the loop, so adding
    # (and later overwriting) the 'Cluster' column never feeds back into the
    # clustering input.
    data['Cluster'] = labels
    output_file_name = file_name + '_clusters' + '_' + str(random_state) # Output file name.
    data.to_csv(os.path.join(data_path, output_file_name + csv_ext))

# Average and standard deviation of Silhouette score (over num_seeds runs)
print("Averages: ", np.average(sil_scores))
print("Standard Deviation: ", np.std(sil_scores))

File path:  /Users/alankar/Documents/cmu/code/prelim-analysis/data/davos/user_cs_profile.csv
Data matrix size:  (64, 6)




Averages:  0.24619138509921284
Standard Deviation:  0.0072093096757323925


In [2]:
# Convert the CSV of final cluster assignments into a pickled
# {transcript id -> cluster} dictionary stored in the same data folder.

# File names, both relative to data_path.
output_file_name = 'final_clusters.csv'   # CSV that is read below
cluster_file_name = 'clusters_full.pkl'   # pickle that is written below

# Columns of the CSV that provide the dictionary keys and values.
tid_col_name = 'Transcript ID'
cluster_col_name = 'Final Cluster'

df = pd.read_csv(data_path + output_file_name, index_col=None)

# Pair the two columns row by row; if a transcript id occurs more than once,
# the last row wins.
cluster_map = {
    tid: cluster
    for tid, cluster in zip(df[tid_col_name], df[cluster_col_name])
}

with open(data_path + cluster_file_name, 'wb') as f:
    pickle.dump(cluster_map, f, protocol=pickle.HIGHEST_PROTOCOL)