# Q.6 Clustering 

*   Akshay Bankar (2019201011)

> Import required libraries





In [0]:
import numpy as np
import pandas as pd
import glob
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics.cluster import homogeneity_score
from scipy.stats import norm
import random
from numpy.linalg import inv
from math import sqrt, log, exp, pi
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import random as sparse_random
from sklearn.random_projection import sparse_random_matrix
from scipy.spatial import distance



> Define a class that performs K-means clustering



In [0]:
class KMeans:
    """K-means clustering over row-wise data using Euclidean distance.

    Each initial centroid is a vector drawn uniformly from [1, 10) per
    dimension and then scaled to unit L2 norm.
    """

    def __init__(self, k, data, iterate, seed=None):
        """Store the data and draw the initial centroids.

        Parameters
        ----------
        k : int
            Number of clusters.
        data : array-like of shape (N, dim)
            One sample per row.
        iterate : int
            Number of assignment/update passes performed by ``run``.
        seed : int or None, optional
            When given, the initial centroids are drawn from a private
            ``np.random.RandomState(seed)`` so the initialisation is
            repeatable. When ``None`` (default) the global NumPy random
            state is used.
        """
        self.dim = np.shape(data)[1]  # assuming data stored row-wise
        self.k = k
        self.N = np.shape(data)[0]
        self.data = data
        self.iterations = iterate

        # np.random (module) and RandomState both expose .uniform().
        rng = np.random if seed is None else np.random.RandomState(seed)
        self.centroid = []
        for _ in range(self.k):
            centre = rng.uniform(1, 10, self.dim)
            self.centroid.append(np.divide(centre, np.linalg.norm(centre)))

        # -1 marks "not assigned yet"; stored on the instance so run() has
        # something to return when iterations == 0.
        self.pred_cluster = [-1] * self.N

    def cluster(self):
        """Run one assignment pass followed by one centroid update.

        Every row is assigned to its nearest centroid (ties go to the
        lowest cluster index), then each centroid is moved to the mean of
        the rows assigned to it. A centroid that received no rows keeps its
        previous position instead of being divided by zero.

        Returns
        -------
        list of int
            Cluster index for each row of ``self.data``.
        """
        pred_cluster = []
        sum_for_mean = np.zeros((self.k, self.dim))
        pred_count = np.zeros(self.k)
        for i in range(self.N):
            min_dist = float("inf")
            min_dist_cluster = -1
            for j in range(self.k):
                euc_dist = distance.euclidean(self.data[i], np.array(self.centroid[j]))
                if euc_dist < min_dist:
                    min_dist = euc_dist
                    min_dist_cluster = j
            sum_for_mean[min_dist_cluster] += self.data[i]
            pred_count[min_dist_cluster] += 1
            pred_cluster.append(min_dist_cluster)
        for i in range(self.k):
            # Skip empty clusters: 0/0 would turn the centroid into NaN and
            # the cluster could then never be selected again.
            if pred_count[i] > 0:
                self.centroid[i] = sum_for_mean[i] / pred_count[i]
        return pred_cluster

    def run(self):
        """Call ``cluster`` ``self.iterations`` times and return the last assignment.

        Prints a progress marker after every pass. With zero iterations the
        initial all ``-1`` assignment is returned.
        """
        for i in range(self.iterations):
            self.pred_cluster = self.cluster()
            print("Iteration ", i+1, " done", end = '\t')
        return self.pred_cluster



> get_data() reads every text file matching the given glob pattern and loads each file's text, together with the label parsed from its file name, into a pandas DataFrame.



In [0]:
def get_data(path):
    """Load every file matching ``path`` into a DataFrame of text and label.

    File names are expected to look like ``<prefix>_<label>.<ext>``; the
    integer after the first underscore is the 1-based label and is stored
    0-based. Non-ASCII characters in the text are replaced by a space.

    Parameters
    ----------
    path : str
        Glob pattern selecting the files to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``'text'`` and ``'label'``, one row per file, ordered by
        sorted file path. Empty (but with both columns) if nothing matches.

    Raises
    ------
    IndexError, ValueError
        If a matched file name does not follow the expected pattern.
    """
    cols = ['text', 'label']
    rows = []
    # glob returns files in arbitrary order; sort for a reproducible row order.
    for file in sorted(glob.glob(path)):
        # Context manager closes the handle even if read() raises.
        with open(file, mode='r', errors='ignore') as f:
            txt_data = f.read()
        txt_data = ''.join(ch if ord(ch) < 128 else ' ' for ch in txt_data)
        # Parse the label from the file name only, so '.' or '_' in the
        # directory part of the path cannot corrupt it.
        stem = os.path.basename(file).split('.')[0]
        label = int(stem.split('_')[1]) - 1
        rows.append([txt_data, label])
    # Build the frame in one go; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows, columns=cols)

In [0]:
# Glob pattern selecting the corpus files that get_data() should read.
DATA_GLOB = '/content/drive/My Drive/SMAI/Assignment-2/q6/dataset/*.txt'
df = get_data(DATA_GLOB)



> **Word Vectorization** : The general process of turning a collection of text documents into numerical feature vectors. There are many methods to convert text data into vectors that a model can understand, but by far the most popular is TF-IDF. The acronym stands for “Term Frequency – Inverse Document Frequency”, the two components of the score assigned to each word.

    Term Frequency: This summarizes how often a given word appears within a document.
    Inverse Document Frequency: This down scales words that appear a lot across documents.



> TF-IDF scores are word-frequency scores that try to highlight the more informative words, i.e. words that are frequent within a document but rare across the rest of the corpus.





In [0]:
# Fit a TF-IDF model on the corpus (lower-cased, English stop words removed)
# and materialise the resulting document-term matrix as a dense array.
tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
df_text = tfidf_vectorizer.fit_transform(df['text']).toarray()

In [0]:
# Ground-truth labels as a NumPy array, taken from the same frame (and so in
# the same row order) as the text that was vectorised.
label_np = df.loc[:, 'label'].to_numpy()



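> **Label Encoding** : Label encoding transforms a categorical target stored as strings into numerical values which the model can understand. Here the labels are already integers parsed from the file names, so no further encoding is applied. The cell below runs K-means with k = 5 clusters for 100 iterations.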



In [15]:
# KMeans draws its initial centroids from NumPy's random generator; seeding
# the global state first makes this cell reproducible on Restart & Run All.
SEED = 42
np.random.seed(SEED)

k = 5
n_iterations = 100  # was `iter`, which shadowed the built-in iter()
kmeans_obj = KMeans(k, df_text, n_iterations)
pred_cluster = kmeans_obj.run()

Iteration  1  done	Iteration  2  done	Iteration  3  done	Iteration  4  done	Iteration  5  done	Iteration  6  done	Iteration  7  done	Iteration  8  done	Iteration  9  done	Iteration  10  done	Iteration  11  done	Iteration  12  done	Iteration  13  done	Iteration  14  done	Iteration  15  done	Iteration  16  done	Iteration  17  done	Iteration  18  done	Iteration  19  done	Iteration  20  done	Iteration  21  done	Iteration  22  done	Iteration  23  done	Iteration  24  done	Iteration  25  done	Iteration  26  done	Iteration  27  done	Iteration  28  done	Iteration  29  done	Iteration  30  done	Iteration  31  done	Iteration  32  done	Iteration  33  done	Iteration  34  done	Iteration  35  done	Iteration  36  done	Iteration  37  done	Iteration  38  done	Iteration  39  done	Iteration  40  done	Iteration  41  done	Iteration  42  done	Iteration  43  done	Iteration  44  done	Iteration  45  done	Iteration  46  done	Iteration  47  done	Iteration  48  done	Iteration  49  done	Iteration  50  done	Iteration



> A clustering result satisfies homogeneity if all of its clusters contain only data points which are members of a single class.
This metric is independent of the absolute values of the labels: a permutation of the class or cluster label values won’t change the score value in any way.


    Homogeneity: each cluster contains only members of a single class.

    Completeness: all members of a given class are assigned to the same cluster.





In [18]:
# Compare the predicted cluster ids against the ground-truth labels.
homogeneity = homogeneity_score(label_np, pred_cluster)
print("Homogeneity Score : ", homogeneity)

Homogeneity Score :  0.72843769042196
