## Clustering Documents Demo
Reference: [Dipanjan Sarkar, Text Analytics with Python](https://www.apress.com/gb/book/9781484223871) 

### Define the imports

In [28]:
from sklearn.cluster import KMeans
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity
import random
from matplotlib.font_manager import FontProperties
import pandas as pd
import numpy as np
import sys
from os import path
sys.path.append(r'..\ClusteringDocumentsDemo')
from normalization import normalize_corpus
from utils import build_feature_matrix

### Specify a random seed

In [29]:
# Seed NumPy's global random number generator with a fixed value so that
# repeated runs of the notebook draw the same random sequence.
np.random.seed(100)

### Define constant settings

In [30]:
# Build the data directory from its components so the path separator is right
# on every OS (a hard-coded backslash is only a separator on Windows).
DATA_PATH = path.join('..', 'ClusteringDocumentsDemo')
# Alternative input file, kept for quick switching:
#HR_DATA_FILENAME = "Start_doing_comments.csv"
HR_DATA_FILENAME = "movie_data.csv"
# Full path of the CSV file read by the corpus loader.
HR_DOCS = path.join(DATA_PATH, HR_DATA_FILENAME)

### Load the Corpus data from the source

In [48]:
def loadMoviesCorpusData(data_file=None):
    """Read the movie corpus CSV and return its synopses and titles.

    Args:
        data_file: Optional path (or file-like object) of the CSV to read.
            When omitted, the module-level ``HR_DOCS`` path is used.

    Returns:
        A ``(movie_synopses, movie_titles)`` tuple of two lists taken from
        the ``'Synopsis'`` and ``'Title'`` columns, in file order.

    Raises:
        KeyError: If the CSV has no ``'Title'`` or ``'Synopsis'`` column.
    """
    source = HR_DOCS if data_file is None else data_file
    movie_data = pd.read_csv(source)
    print(movie_data.head())
    movie_titles = movie_data['Title'].tolist()
    movie_synopses = movie_data['Synopsis'].tolist()
    # Only preview the first record when there is one; an empty file would
    # otherwise fail with IndexError before anything is returned.
    if movie_titles:
        print('Movie:{}'.format(movie_titles[0]))
        print('Movie Synopsis:{}'.format(movie_synopses[0]))
    return movie_synopses, movie_titles

#movie_synopses, movie_titles = loadMoviesCorpusData()

### Normalize the Corpus

In [41]:
# Load the corpus in this cell so it also runs on a fresh kernel: the loader
# call in the previous cell is commented out, which would otherwise leave
# movie_synopses undefined here.
movie_synopses, movie_titles = loadMoviesCorpusData()

# normalize corpus (the lemmatize / only_text_chars flags are passed straight
# through to the project's normalize_corpus helper)
norm_movie_synopses = normalize_corpus(movie_synopses,
                                       lemmatize=True,
                                       only_text_chars=True)

In [42]:
def computeTFIDFDocVectorization(norm_doc):
    """Build a TF-IDF feature matrix for the given normalized documents.

    Args:
        norm_doc: The normalized corpus to vectorize; it is handed to the
            project's ``build_feature_matrix`` helper unchanged.

    Returns:
        The ``(vectorizer, feature_matrix)`` pair returned by
        ``build_feature_matrix``.
    """
    # extract tf-idf features from the documents that were passed in, not
    # from a notebook-level global
    vectorizer, feature_matrix = build_feature_matrix(norm_doc,
                                                      feature_type='tfidf',
                                                      min_df=0.24, max_df=0.85,
                                                      ngram_range=(1, 2))
    return vectorizer, feature_matrix

vectorizer, feature_matrix = computeTFIDFDocVectorization(norm_movie_synopses)
# view number of features
print("feature_matrix.shape: {}".format(feature_matrix.shape))

# get feature names
# Newer scikit-learn releases removed get_feature_names() in favour of
# get_feature_names_out(); prefer the new method and fall back to the old one.
# NOTE(review): assumes build_feature_matrix returns a scikit-learn vectorizer.
if hasattr(vectorizer, 'get_feature_names_out'):
    # .tolist() keeps feature_names a plain list of str, as before
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# print sample features
print("feature_names[:20]: {}".format(feature_names[:20]))

feature_matrix.shape: (100, 307)
feature_names[:20]: ['able', 'accept', 'across', 'act', 'agree', 'alive', 'allow', 'alone', 'along', 'already', 'although', 'always', 'another', 'anything', 'apartment', 'appear', 'approach', 'arm', 'army', 'around']


### Apply K-Means Clustering algorithm
- Cluster the data using the K-means algorithm (based on a guessed number of clusters) to group the data into categories

In [47]:
class KMeansClustering(object):
    """
    Cluster the data using the K-means algorithm (based on a guessed number
    of clusters) to group the data into categories.

    All methods are static; the class only serves as a namespace.
    """

    @staticmethod
    def computeClusters(X_train, num_clusters=5, kmeans_type='classic',
                        is_verbose=False):
        """Fit a K-means model on ``X_train``.

        Args:
            X_train: Feature matrix to cluster.
            num_clusters: Number of clusters to fit.
            kmeans_type: ``'classic'`` fits ``KMeans``; any other value fits
                ``MiniBatchKMeans``.
            is_verbose: Forwarded to the estimator's ``verbose`` option.

        Returns:
            A ``(model, clusters)`` tuple: the fitted estimator and its
            ``labels_`` for the training rows.
        """
        if kmeans_type == 'classic':
            model = KMeans(n_clusters=num_clusters, init='k-means++',
                           max_iter=10000, n_init=1, verbose=is_verbose)
        else:
            # Imported here because the notebook's import cell only brings in
            # KMeans from sklearn.cluster.
            from sklearn.cluster import MiniBatchKMeans
            model = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++',
                                    n_init=1, init_size=1000, batch_size=1000,
                                    verbose=is_verbose)
        model.fit(X_train)
        clusters = model.labels_
        return model, clusters

    @staticmethod
    def predictClusters(model, X_valid):
        """Return ``model.predict(X_valid)``: the cluster assigned to each row."""
        predicted_clusters = model.predict(X_valid)
        return predicted_clusters

    @staticmethod
    def computeNumDataPointsPerCluster(km_clusters):
        """Return ``(cluster_label, count)`` pairs for the given label sequence."""
        return Counter(km_clusters).items()

    @staticmethod
    def getClusteredData(model, data,
                         feature_names, num_clusters,
                         topn_features=10):
        """Collect the key features and member IDs of every cluster.

        Args:
            model: Fitted estimator exposing ``cluster_centers_``.
            data: Table indexable by column name that has a ``'Cluster'``
                column (label per row) and an ``'ID'`` column.
            feature_names: Sequence mapping a feature index to its name.
            num_clusters: Number of clusters to report (labels
                ``0 .. num_clusters - 1``).
            topn_features: How many top-weighted features to keep per cluster.

        Returns:
            Dict keyed by cluster number; each value is a dict with the keys
            ``'cluster_num'``, ``'key_features'`` and ``'data_point'``.
        """
        cluster_details = {}
        # For every centroid, feature indices ordered from the largest
        # centroid weight to the smallest.
        ordered_centroids = model.cluster_centers_.argsort()[:, ::-1]
        for cluster_num in range(num_clusters):
            key_features = [feature_names[index]
                            for index
                            in ordered_centroids[cluster_num, :topn_features]]
            # Filter on the table that was passed in rather than on a
            # notebook-level global.
            data_point = data[data['Cluster'] == cluster_num]['ID'].values.tolist()
            cluster_details[cluster_num] = {
                'cluster_num': cluster_num,
                'key_features': key_features,
                'data_point': data_point,
            }

        return cluster_details

    @staticmethod
    def displayClusteredData(cluster_data):
        """
        Display cluster details

        Args:
            cluster_data: Dict shaped like the result of ``getClusteredData``.
                Members are read from the ``'movies'`` key when present and
                from ``'data_point'`` otherwise.
        """
        for cluster_num, cluster_details in cluster_data.items():
            # getClusteredData stores members under 'data_point'; 'movies' is
            # still honoured for dictionaries built with that key.
            members = cluster_details.get('movies',
                                          cluster_details.get('data_point', []))
            print('Cluster {} details:'.format(cluster_num))
            print('-'*20)
            print('Key features: {}'.format(cluster_details['key_features']))
            print('Movies in this cluster:')
            # str() so non-string identifiers (e.g. numeric IDs) can be joined
            print("{}".format(', '.join(str(member) for member in members)))
            print('='*40)

    
    

### Main Loop of the Data Clustering Process

In [None]:
def mainLoop():
    """Run the clustering pipeline end to end and print progress for each step.

    The steps are: load the corpus, vectorize it, optionally reduce
    dimensionality, split into train/validation rows, fit K-means, report
    clustering performance, predict clusters for the validation rows, report
    the top terms per cluster, and finally search for the optimal number of
    clusters.

    NOTE(review): the step helpers called below (processDoTextAndVectorizeData,
    reduceDataDimensionality, applyKmeansClustering, reportClusteringPerformance,
    predictClusters, reportTopTermsOfComputedClusters,
    findOptimalNumberOfClusters) are not defined in the visible part of this
    notebook; they must be defined before this function is called.
    """
    # Imported locally because the notebook's import cell does not import time.
    from time import time

    np.random.seed(100)
    total_num_clusters = 12
    guess_num_clusters = 5
    is_reduce_dim = False
    kmeans_algo_type = 'classic'  # NOTE(review): currently not passed to any step
    # NOTE(review): the feature matrix printed earlier in this notebook has
    # 100 rows; with 600 training samples the validation slice would be
    # empty for that corpus. Confirm the intended split size.
    num_train_samples = 600
    t0 = time()
    print("Starting Clustering Steps..\n\n")
    print("1. Load the Documents from the disc")
    movie_synopses, movie_titles = loadMoviesCorpusData()
    # The loaded synopses are the documents processed by the steps below.
    docs = movie_synopses

    print("2. Process the documents text data and vectorize it to a numeric matrix")
    X, vectorizer = processDoTextAndVectorizeData(docs)
    print("3. Optionally, improve the performance of the unsupervised learning by reducing data dimensionality")
    if is_reduce_dim:
        # NOTE(review): the return value is discarded; confirm the helper
        # modifies X in place, otherwise the reduction has no effect.
        reduceDataDimensionality(X)
    print("4. Split data into train and validation sets")
    X_train = X[:num_train_samples]
    X_valid = X[num_train_samples:]
    print("5. Cluster the data using K-means algorithm (based on a guessed number of clusters) to group the data into categorises")
    km = applyKmeansClustering(X_train, guess_num_clusters)
    print("6. Report the Performance of the Clustering Algorithm on the Training Data using Silhouette Coefficient")
    reportClusteringPerformance(X_train, km)
    print("7. Predict the Clusters running the tained model on unseen validation data")
    predicted_data = predictClusters(X_valid, km, docs[num_train_samples:])
    print("predicted_data.head(10) = \n{}\n".format(predicted_data.head(20)))
    #print(tabulate(predicted_data.head(10), headers='keys', tablefmt='psql'))
    print("8. Report the Top terms computed Cluster")
    reportTopTermsOfComputedClusters(km, vectorizer, guess_num_clusters)
    t1 = time()
    elapsed_time = t1 - t0
    print("Clustering process took {0:.3f} seconds".format(elapsed_time))
    print("")
    findOptimalNumberOfClusters(X_train, total_num_clusters)