# 2.1 Document Clustering with Key Words Extraction (TF-IDF Weightage)

## 2.1.1 Data Import

#### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import re
import copy

import nltk
import gensim
import k_means
from nltk.stem.porter import PorterStemmer
from sklearn.cluster import KMeans 
import matplotlib.pyplot as plt

#### Loading Train and Test Dataset from CSV

In [2]:
# Shared reader options: with pandas' default NA markers switched off, only a
# truly empty field is parsed as NaN (strings such as "NA" stay as text).
_csv_read_options = {"keep_default_na": False, "na_values": [""]}

train_dataset = pd.read_csv("./TrainTest_Dataset/train_dataset.csv", **_csv_read_options)
test_dataset = pd.read_csv("./TrainTest_Dataset/test_dataset.csv", **_csv_read_options)

## 2.1.2 Train and Test Data Pre-Processing

#### Preparing Training Corpus and Pre-Processing Training Data

In [5]:
# Corpus reader over every ".txt" transcript in the training folder.
# The file pattern is a raw string so that "\." reaches the regex engine as an
# escaped dot without triggering Python's invalid-escape-sequence warning.
train_corpus = nltk.corpus.PlaintextCorpusReader("./TrainTest_Transcripts/Train", r".+\.txt")
train_fids = train_corpus.fileids()

In [6]:
# Preparing List of Stop Words and Stemming using NLTK Library
stop_list = nltk.corpus.stopwords.words("english")
stemmer = nltk.stem.porter.PorterStemmer()

# Hash-based copy of the stop words so each membership test is O(1);
# `stop_list` itself stays a list because the test pre-processing cell reads it.
_train_stop_words = set(stop_list)
# Compiled once: accepts tokens made up solely of the letters a-z.
_train_alpha_token = re.compile('^[a-z]+$')

# Pipeline: tokenize -> lower-case -> keep alphabetic tokens -> drop stop words -> stem.
# Every intermediate stage keeps its own name so it can be inspected.
trainData_tokenized = [train_corpus.words(fid) for fid in train_fids]
trainData_lowerCase = [[w.lower() for w in doc] for doc in trainData_tokenized]
trainData_removedPunct = [[w for w in doc if _train_alpha_token.search(w)] for doc in trainData_lowerCase]
trainData_removedStopwords = [[w for w in doc if w not in _train_stop_words] for doc in trainData_removedPunct]
trainData_processed = [[stemmer.stem(w) for w in doc] for doc in trainData_removedStopwords]

#### Preparing Test Corpus and Pre-Processing Test Data

In [7]:
# Corpus reader over every ".txt" transcript in the test folder.
# The file pattern is a raw string so that "\." reaches the regex engine as an
# escaped dot without triggering Python's invalid-escape-sequence warning.
test_corpus = nltk.corpus.PlaintextCorpusReader("./TrainTest_Transcripts/Test", r".+\.txt")
test_fids = test_corpus.fileids()

In [8]:
# Clean the test transcripts with the same pipeline as the training data,
# reusing `stop_list` and `stemmer` from the training pre-processing cell.

# Hash-based copy of the stop words so each membership test is O(1).
_test_stop_words = set(stop_list)
# Compiled once: accepts tokens made up solely of the letters a-z.
_test_alpha_token = re.compile('^[a-z]+$')

# Pipeline: tokenize -> lower-case -> keep alphabetic tokens -> drop stop words -> stem.
testData_tokenized = [test_corpus.words(fid) for fid in test_fids]
testData_lowerCase = [[w.lower() for w in doc] for doc in testData_tokenized]
testData_removedPunct = [[w for w in doc if _test_alpha_token.search(w)] for doc in testData_lowerCase]
testData_removedStopwords = [[w for w in doc if w not in _test_stop_words] for doc in testData_removedPunct]
testData_processed = [[stemmer.stem(w) for w in doc] for doc in testData_removedStopwords]

#### Converting Train Data into Sparse Vector with TF-IDF Weightage 

In [9]:
# Token <-> integer-id mapping built from the processed training documents
dict_train = gensim.corpora.Dictionary(trainData_processed)

# One bag-of-words vector per training document
bowVec_train = list(map(dict_train.doc2bow, trainData_processed))

# TF-IDF model fitted on the training bag-of-words vectors
tfidfModel_train = gensim.models.TfidfModel(bowVec_train)

# Re-weight every bag-of-words vector with the fitted TF-IDF model
SparseVec_train = [
    tfidfModel_train[bow]
    for bow in bowVec_train
]

In [10]:
# Bare expression: the notebook displays the repr of the fitted TF-IDF model object.
tfidfModel_train

<gensim.models.tfidfmodel.TfidfModel at 0x1a451322d0>

#### Extracting Top-N Weighted Words from each Document 

In [11]:
# Function to extract top-N words from Document
def Extract_Doc_Top_N_Tfidf_Words(orginal_list, TopNWords):
    """Keep only the TopNWords highest-weighted entries of every document.

    Parameters
    ----------
    orginal_list : list of list of tuple
        One inner list per document; each tuple is ranked on its second
        element (the call sites pass (word_id, tfidf_weight) pairs).
    TopNWords : int
        Maximum number of entries kept per document.

    Returns
    -------
    list of list of tuple
        A new list per document, sorted by weight in descending order (entries
        with equal weight keep their input order) and cut to at most
        TopNWords entries. The input lists are left untouched.
    """
    # sorted() builds a fresh list for each document, so the caller's data is
    # never mutated and no deep copy of the whole corpus is required.
    return [
        sorted(doc, key=lambda entry: entry[1], reverse=True)[:TopNWords]
        for doc in orginal_list
    ]

# Function to create new Dictionary based on the new Sparse Vector
def Create_New_Dictionary(sparseVec, original_dict):
    """Build a gensim Dictionary from the words referenced by sparseVec.

    Each entry's first element is looked up in original_dict (via .get) to
    recover its word, giving one token list per document,
    e.g. [['SMU', 'SU'], ['Graduate', 'Wow']]. Those token lists are then
    handed to gensim.corpora.Dictionary, whose result is returned.
    """
    docs_as_tokens = []
    for doc in sparseVec:
        tokens = []
        for entry in doc:
            tokens.append(original_dict.get(entry[0]))
        docs_as_tokens.append(tokens)
    return gensim.corpora.Dictionary(docs_as_tokens)

# Function to create new Sparse Vector based on the new dictionary 
def Create_New_Sparse_Vector(sparseVec, new_dictionary, old_dictionary):
    """Re-key every (word_id, weight) entry from old_dictionary ids to new_dictionary ids.

    For each entry the word is recovered from old_dictionary, then mapped to
    its id in new_dictionary; the weight is carried over unchanged. A word
    that is absent from new_dictionary yields None as its id.

    Returns a new list of documents; sparseVec itself is not modified.
    """
    # Reverse lookup {word: word_id} derived from new_dictionary's {word_id: word}
    word_to_new_id = {
        word: word_id
        for word, word_id in zip(new_dictionary.values(), new_dictionary.keys())
    }
    return [
        [(word_to_new_id.get(old_dictionary.get(entry[0])), entry[1]) for entry in doc]
        for doc in sparseVec
    ]

In [12]:
# Sparse Vector with top-N TF-IDF words from each document.
# Keep in mind that this contains Word ID of the previous Dictionary
extracted_SparseVec_train = Extract_Doc_Top_N_Tfidf_Words(SparseVec_train, 50)

# Creating new set of Dictionary
new_dict_train = Create_New_Dictionary(extracted_SparseVec_train, dict_train)

# Updating Sparse Vector's word ID
new_SparseVec_train = Create_New_Sparse_Vector(extracted_SparseVec_train, new_dict_train, dict_train)

# Dense array used to find the optimal K.
# corpus2dense returns a (num_terms, num_docs) matrix with documents as COLUMNS,
# whereas scikit-learn's KMeans treats every ROW as one sample. Transpose so that
# each row is a document; otherwise the elbow search would cluster terms.
train_nparray = gensim.matutils.corpus2dense(new_SparseVec_train, len(new_dict_train)).T

In [203]:
# Number of entries in the dictionary rebuilt from each document's top-weighted words.
print(len(new_dict_train))

31764


## 2.1.3 Data Clustering

### 2.1.3.1 Identifying Optimal K 
Adapted from: https://blog.cambridgespark.com/how-to-determine-the-optimal-number-of-clusters-for-k-means-clustering-14f27070048f

In [13]:
# Inertia (sum of squared distances) for k = 25 .. 49
Sum_of_squared_distances = []
K = range(25,50)
for k in K:
    print("running: ", k)
    # Fixed seed: KMeans initialisation is randomised, so without it the inertia
    # values change between runs and the separately computed ranges of k are
    # not comparable.
    km = KMeans(n_clusters=k, random_state=42)
    km = km.fit(train_nparray)
    Sum_of_squared_distances.append(km.inertia_)

running:  25
running:  26


KeyboardInterrupt: 

In [14]:
# Inertia (sum of squared distances) for k = 50 .. 60
Sum_of_squared_distances_50_60 = []
K = range(50,61)
for k in K:
    print("running: ", k)
    # Fixed seed: KMeans initialisation is randomised, so without it the inertia
    # values change between runs and the separately computed ranges of k are
    # not comparable.
    km = KMeans(n_clusters=k, random_state=42)
    km = km.fit(train_nparray)
    Sum_of_squared_distances_50_60.append(km.inertia_)

running:  50
running:  51
running:  52
running:  53
running:  54
running:  55
running:  56
running:  57
running:  58
running:  59
running:  60


In [21]:
# Inertia (sum of squared distances) for k = 1 .. 24
Sum_of_squared_distances_1_25 = []
K = range(1,25)
for k in K:
    print("running: ", k)
    # Fixed seed: KMeans initialisation is randomised, so without it the inertia
    # values change between runs and the separately computed ranges of k are
    # not comparable.
    km = KMeans(n_clusters=k, random_state=42)
    km = km.fit(train_nparray)
    Sum_of_squared_distances_1_25.append(km.inertia_)

running:  1
running:  2
running:  3
running:  4
running:  5
running:  6
running:  7
running:  8
running:  9
running:  10
running:  11
running:  12
running:  13
running:  14
running:  15
running:  16
running:  17
running:  18
running:  19
running:  20
running:  21
running:  22
running:  23
running:  24


In [None]:
# Inertia (sum of squared distances) for k = 1 .. 69 in a single sweep
Sum_of_squared_distances_1_70_top50 = []
K = range(1,70)
for k in K:
    print("running: ", k)
    # Fixed seed: KMeans initialisation is randomised, so without it the inertia
    # curve changes from run to run.
    km = KMeans(n_clusters=k, random_state=42)
    km = km.fit(train_nparray)
    Sum_of_squared_distances_1_70_top50.append(km.inertia_)

running:  1
running:  2
running:  3
running:  4
running:  5
running:  6
running:  7
running:  8
running:  9
running:  10
running:  11
running:  12
running:  13
running:  14
running:  15
running:  16
running:  17
running:  18
running:  19
running:  20
running:  21
running:  22
running:  23
running:  24
running:  25
running:  26
running:  27
running:  28
running:  29
running:  30
running:  31
running:  32
running:  33
running:  34
running:  35
running:  36
running:  37
running:  38
running:  39
running:  40
running:  41
running:  42
running:  43


In [None]:
# Stitch the three partial sweeps together in ascending order of k
# (1-24, then 25-49, then 50-60).
Sum_of_squared_distances_combined = (
    Sum_of_squared_distances_1_25
    + Sum_of_squared_distances
    + Sum_of_squared_distances_50_60
)
K = range(1,61)

# Elbow curve: inertia plotted against the number of clusters
fig, ax = plt.subplots()
ax.plot(K, Sum_of_squared_distances_combined, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Sum_of_squared_distances')
ax.set_title('Elbow Method For Optimal k')
plt.show()

In [19]:
# Top 15 words
# Hard-coded list of 60 inertia (SSE) values; the name indicates k = 1 .. 60.
# NOTE(review): presumably pasted from an earlier run that kept the top 15 words
# per document (the sweep cells in this notebook keep the top 50) - confirm
# before comparing these numbers with freshly computed ones.
SSE_1_to_60 = [1200.529,
 1196.3323342672006,
 1194.323539188254,
 1190.3827957081608,
 1189.5341851201583,
 1185.2916176838387,
 1186.2149705667132,
 1178.2161681608866,
 1183.124056918208,
 1179.7356436899272,
 1177.1693003649445,
 1171.1892138455205,
 1172.1641868334086,
 1169.621470092471,
 1171.169605025566,
 1167.7569279579495,
 1162.190902994428,
 1161.3990270622664,
 1166.6206731896218,
 1158.9378446083288,
 1158.555941227098,
 1157.8717206268443,
 1152.6693726678818,
 1155.0102177434894,
 1151.1929803453152,
 1150.7755448315022,
 1151.4212970926314,
 1149.5556119536752,
 1148.7469748856402,
 1144.3938185311195,
 1140.916625454774,
 1145.3388127402693,
 1135.314411028918,
 1142.1188291490348,
 1138.3379662130476,
 1134.0708192422483,
 1137.4154360797877,
 1131.2364474072092,
 1132.2894346857647,
 1130.9126004298303,
 1128.8152346117517,
 1125.6459607688628,
 1123.0622207715935,
 1122.6032038001406,
 1121.420629927655,
 1122.5251307805763,
 1120.778991419416,
 1118.6097317436756,
 1119.7536889152573,
 1117.3965153013648,
 1117.088696365621,
 1111.6940503527014,
 1110.276092021481,
 1111.6020542431054,
 1106.6224304310563,
 1102.7490013282088,
 1103.0655575868923,
 1105.021530971285,
 1102.2340467837746,
 1101.5132227916567]

### 2.1.3.2 Clustering Corpus based on Optimal-K of 48


In [None]:
# Cluster the training vectors with the imported `k_means` module, using the
# k = 48 named in this section's heading.
N_CLUSTERS = 48
num_tokens = len(new_dict_train.token2id)
clusters_train = k_means.k_means(new_SparseVec_train, num_tokens, N_CLUSTERS)

In [None]:
def covert_to_word_vector(vector, dictionary):
    """Replace the id in every (id, weight) entry of `vector` with its word.

    Parameters
    ----------
    vector : iterable of tuple
        Entries whose first element is an id and whose second element is the
        associated weight.
    dictionary : mapping
        Looked up with .get(id) to obtain the word; an unknown id yields None.

    Returns
    -------
    list of tuple
        (word, weight) pairs in the same order as `vector`.
    """
    # Each pair is built as ONE tuple and collected into the returned list.
    return [(dictionary.get(tup[0]), tup[1]) for tup in vector]

In [None]:
from collections import Counter

# For every cluster, print its highest-weighted (word, TF-IDF) pairs and the
# words that occur most often across its member documents.
for cluster_index, cluster in enumerate(clusters_train):
    # Cluster contains a list of index of new_SparseVec_train

    cluster_tfidf = []       # distinct (word, weight) pairs, in first-seen order
    seen_pairs = set()       # O(1) duplicate check instead of re-scanning the list
    word_counts = Counter()  # occurrences of each word across the cluster

    for sparseVec_index in cluster:
        word_vec = covert_to_word_vector(new_SparseVec_train[sparseVec_index], new_dict_train)
        for tup in word_vec:
            if tup not in seen_pairs:
                seen_pairs.add(tup)
                cluster_tfidf.append(tup)
            word_counts[tup[0]] += 1

    cluster_tfidf.sort(key = lambda x: x[1], reverse=True)

    # [word, count] pairs ordered by descending count; words with equal counts
    # are listed in first-seen order, which keeps the output deterministic.
    cluster_words = [[word, count] for word, count in word_counts.most_common()]

    print("Cluster ", cluster_index," :")
    print("Top TF-IDF Weighted Words :", cluster_tfidf[:20], )
    print("Top Frequency :", cluster_words[:20], '\n')


# 2.1.4 Document Classification 

## 2.1.4.1 Classifying Train Data

### Creating a Labelled Dictionary for Naive Bayes Classifier 

In [None]:
# One (feature dict, label) pair per cluster, labelled 'Topic <cluster index>'.
all_labeled_data = []

cluster_index = 0
for cluster in clusters_train:
    # Presence features: every word id found in any member document maps to 1
    # (Freq is 1 since this value does not matter - Lab 5)
    cluster_dict = {}
    for sparseVec_index in cluster:
        cluster_dict.update((tup[0], 1) for tup in new_SparseVec_train[sparseVec_index])

    cluster_name = 'Topic ' + str(cluster_index)
    all_labeled_data.append((cluster_dict, cluster_name))

    cluster_index += 1

### Creating Naive Bayes Classifier 

In [None]:
# Train NLTK's Naive Bayes classifier on the (feature dict, topic label) pairs
# held in all_labeled_data.
classifier = nltk.NaiveBayesClassifier.train(all_labeled_data)

In [None]:
# Presence-feature dict ({word_id: 1}) for the training document at index 3,
# laid out like the feature dicts the classifier was trained on.
text_dict = {tup[0]: 1 for tup in new_SparseVec_train[3]}


In [197]:
# Print the topic label the classifier assigns to text_dict.
print(classifier.classify(text_dict))

Topic 0


In [239]:
# Transcript to BOW
def get_result(transcript):
    """Print the whitespace-separated tokens of `transcript`; returns None.

    Work in progress: the bag-of-words conversion is still commented out.
    """
    tokens = transcript.split()
    print(tokens)
    #dict_train.doc2bow(doc)

get_result("he hereh. eh h heh e heh ")

['he', 'hereh.', 'eh', 'h', 'heh', 'e', 'heh']


## 2.1.4.2 Classifying Test Data

#### Converting Test Data into Sparse Vectors with TF-IDF Weightage

In [None]:
# Encode the processed test documents with the reduced training dictionary
bowVec_test = list(map(new_dict_train.doc2bow, testData_processed))

# NOTE(review): a fresh TF-IDF model is fitted on the test vectors here, so the
# IDF weights come from the test set rather than from tfidfModel_train -
# confirm this is intended.
tfidfModel_test = gensim.models.TfidfModel(bowVec_test)
SparseVec_test = [tfidfModel_test[bow] for bow in bowVec_test]

In [None]:
# Keep at most the 50 highest-weighted entries of every test document
new_SparseVec_test = Extract_Doc_Top_N_Tfidf_Words(SparseVec_test, 50)

# Convert documents into dict representation.
# Each document becomes {word_id: 1}; the TF-IDF weight is discarded.
testData_as_dict = [
    dict.fromkeys((word_id for (word_id, _weight) in doc_vec), 1)
    for doc_vec in new_SparseVec_test
]

In [None]:
#For each file, classify and print the label.
for i in range(len(test_fids)):
    print(test_fids[i], '-->', classifier.classify(testData_as_dict[i]))

# Document Retrieval

In [None]:
from gensim import similarities

# DOCUMENT RETRIEVAL????
# NOTE(review): `vecs1` and `dictionary` are not assigned anywhere in the visible
# notebook (the only `vecs1 = ...` line is commented out in the archived cell),
# so this cell raises NameError on a fresh kernel unless they are defined
# elsewhere - confirm which corpus and dictionary were intended.
# Similarity index over the vectors in vecs1, with len(dictionary) features.
similarity_index = similarities.SparseMatrixSimilarity(vecs1, len(dictionary))

# Query the index with the vector at position 100.
test_vector = vecs1[100]
sims = similarity_index[test_vector]
# (position, score) pairs ordered from highest to lowest score.
sorted_sims = sorted(enumerate(sims), key = lambda item: -item[1])

# The outer enumerate adds a rank, so each printed item is (rank, (position, score)).
print(list(enumerate(sorted_sims)))

## Archived - Ignore the Below

In [None]:
import numpy as np # linear algebra
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances
import string
import kmeans


import matplotlib.pyplot as plt
import matplotlib.cm as cm

%matplotlib inline
plt.style.use('fivethirtyeight')


# Load the cleaned metadata; only empty fields are parsed as NaN.
metadata_dataset = pd.read_csv("../Processed_Dataset/cleaned_dataset.csv", keep_default_na=False, na_values=[""])

# Raw transcript text, one entry per row of the metadata table
data = metadata_dataset['transcript']

# TF-IDF vectoriser capped at 8000 features, using the built-in English stop-word list.
# The call must be closed before the next statement, otherwise the cell does not parse.
tfidf = TfidfVectorizer(
    max_features = 8000,
    stop_words = 'english',
)

tfidf.fit(data)
text = tfidf.transform(data)

# Inspect the type of the fitted vectoriser (`tfidf` is the name defined above).
print(type(tfidf))

# def find_optimal_clusters(data, max_k):
#     iters = range(2, max_k+1, 2)
    
#     sse = []
#     for k in iters:
#         sse.append(MiniBatchKMeans(n_clusters=k, init_size=1024, batch_size=2048, random_state=20).fit(data).inertia_)
#         print('Fit {} clusters'.format(k))
        
#     f, ax = plt.subplots(1, 1)
#     ax.plot(iters, sse, marker='o')
#     ax.set_xlabel('Cluster Centers')
#     ax.set_xticks(iters)
#     ax.set_xticklabels(iters)
#     ax.set_ylabel('SSE')
#     ax.set_title('SSE by Cluster Center Plot')
    
# find_optimal_clusters(text, 150)

#vecs1 = [dictionary.doc2bow(doc) for doc in docs5]

#tf_idf = vecs1.fit_transform(data) 

In [None]:
# Print the Python type of `data` (assigned from metadata_dataset['transcript'] in the archived cell).
print(type(data))

In [None]:
# Scratch cell: loads scikit-learn's bundled breast-cancer dataset and prints its
# `.data` attribute. Nothing else in the visible notebook reads these names.
from sklearn.datasets import load_breast_cancer
breast = load_breast_cancer()
breast_data = breast.data

print(breast_data)