# Imports

In [76]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance, reused by the text-cleaning helper.
porter = PorterStemmer()
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# English stop words held in a set for constant-time membership checks.
stop_words=set(stopwords.words("english"))
##############################################################
import string
string.punctuation

[nltk_data] Downloading package stopwords to C:\Users\aman
[nltk_data]     raj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# Spam vs Ham Dataset

In [77]:
# Load the SMS corpus: tab-separated records, with the two column names
# supplied here rather than read from the file.
data = pd.read_csv(
    "./Datasets/SMSSpamCollection",
    names=['label', 'message'],
    sep='\t',
)
data.head(2)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...


In [78]:
# First message in full (head() shows it truncated) plus the frame's shape.
data['message'].iloc[0], data.shape

('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 (5572, 2))

In [79]:
# Encode the target as integers: 'spam' -> 1, everything else (ham) -> 0.
# The dtype guard keeps the cell idempotent: once the column is already
# numeric, comparing it with 'spam' again would silently turn every label
# into 0 on a second run.
if not pd.api.types.is_integer_dtype(data['label']):
    data['label'] = (data['label'] == 'spam').astype(int)
data.head(9)

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...


In [80]:
from sklearn.model_selection import train_test_split

# Features are the raw message strings; the target is the 0/1 spam flag.
X, y = data.message, data.label

# Hold out 33% of the messages for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Targets become plain NumPy arrays; the message splits stay as pandas Series.
y_train, y_test = np.array(y_train), np.array(y_test)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3733,), (1839,), (3733,), (1839,))

In [81]:
def text_process(mess):
    """Turn one message into a list of stemmed, stop-word-free tokens.

    Every character found in ``string.punctuation`` is removed, the remaining
    text is split on whitespace, tokens whose lowercase form is an English
    stop word are dropped, and the surviving tokens are Porter-stemmed.

    Parameters
    ----------
    mess : str
        Raw message text.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    # Strip punctuation in a single pass instead of a per-character list + join.
    nopunc = mess.translate(str.maketrans('', '', string.punctuation))
    # Test membership against the prebuilt module-level ``stop_words`` set
    # rather than calling ``stopwords.words('english')`` for every token, which
    # rebuilt the word list and scanned it linearly each time.
    return [porter.stem(word) for word in nopunc.split() if word.lower() not in stop_words]

## CountVectorizer 

In [82]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training messages only, then encode both
# splits as token-count matrices with that same vocabulary.
bow_transformer = CountVectorizer(analyzer=text_process)
bow_transformer.fit(X_train)
messages_bow_train, messages_bow_test = (
    bow_transformer.transform(split) for split in (X_train, X_test)
)

In [83]:
# Number of non-zero counts and the matrix shape. count_nonzero() works on the
# sparse matrix directly, so no dense copy is materialised just to count
# entries.
messages_bow_train.count_nonzero(), messages_bow_train.shape

(33657, (3733, 6595))

## TfidfTransformer

In [84]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the IDF weights on the training counts only, then re-weight both splits.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(messages_bow_train)
tfidf_train, tfidf_test = (
    tfidf_transformer.transform(counts)
    for counts in (messages_bow_train, messages_bow_test)
)

In [85]:
# Non-zero TF-IDF weights and matrix shape, counted on the sparse matrix
# itself instead of building a dense copy first.
tfidf_train.count_nonzero(), tfidf_train.shape

(33657, (3733, 6595))

In [86]:
# Same non-zero count as a single number, taken from the sparse matrix without
# densifying it.
tfidf_train.count_nonzero()

33657

In [87]:
# Dense ndarray versions of the TF-IDF features for the NumPy-based clustering
# code. toarray() returns an ndarray directly, avoiding the intermediate
# np.matrix from todense() and the extra copy np.array() made of it.
Vector_form_train = tfidf_train.toarray()
Vector_form_test = tfidf_test.toarray()
Vector_form_train.shape, Vector_form_test.shape, y_train.shape,  y_test.shape

((3733, 6595), (1839, 6595), (3733,), (1839,))

# Algorithm

## Using Library

In [133]:
from sklearn.cluster import AgglomerativeClustering

# Library reference: single-link clustering of five test vectors (rows 50-54)
# into four clusters.
model = AgglomerativeClustering(linkage='single', n_clusters=4)
model.fit(Vector_form_test[50:55])

AgglomerativeClustering(linkage='single', n_clusters=4)

In [134]:
# Cluster label the library assigned to each of the five samples.
model.labels_ # author's note: the manual version gives the same grouping as the library, for both single and complete link

array([0, 0, 3, 2, 1], dtype=int64)

## Manual

In [135]:
# Non-zero features across the five sample rows. Counting needs neither a
# copy of the slice nor a boolean temporary.
np.count_nonzero(Vector_form_test[50:55, :])

40

In [136]:
# Manual single-link agglomerative clustering of the same five test vectors
# (rows 50-54) that were passed to the library model.
dMatrix_temp =Vector_form_test[50:55,:].copy()
# no_groups keeps the merge history: column 0 gives every sample its own label
# (0..n-1) and each pass of the loop below appends one column of updated labels.
no_groups = np.arange(0,dMatrix_temp.shape[0]).reshape(-1,1)
# Stop once the newest column contains a single label, i.e. one cluster is left.
while len(set(no_groups[:,-1:].flatten()  )) != 1:
  ## keep combining
  map_labels = no_groups[:,-1].flatten() # it is 1D
  list_labels = list(set(map_labels))
  map_list = list(range(len(list_labels)))
  # dict_list: cluster label -> row/column position in distance_table;
  # rev_dict_list: position -> cluster label.
  dict_list = dict(zip(list_labels, map_list))
  rev_dict_list = dict(zip(map_list, list_labels))
  #
  distance_table = np.zeros((len(list_labels),len(list_labels)))
  for class_i in list_labels:
    for class_j in list_labels:
      # Visit each unordered pair once; both symmetric entries are filled below.
      if class_i < class_j:
        sub_Matrix_class_i = dMatrix_temp[map_labels==class_i]
        sub_Matrix_class_j = dMatrix_temp[map_labels==class_j]

        # single link: the cluster distance is the smallest Euclidean distance
        # between any member of class_i and any member of class_j.
        #"""
        min_val = None
        for vector in sub_Matrix_class_j:
          v=np.min(np.sqrt(np.sum(np.square(sub_Matrix_class_i-vector), axis=1)))
          if min_val is None:
            min_val  = v
          else:
            min_val = min(min_val,v)
        distance_table[dict_list[class_i],dict_list[class_j]] = min_val
        distance_table[dict_list[class_j],dict_list[class_i]] = min_val

        # Complete link
        # The triple-quoted string below is a disabled complete-link variant; it
        # is evaluated as a bare string expression and has no effect here.
        # NOTE(review): its update uses max(min_val, v); presumably this should
        # be max(max_val, v) - fix before enabling.
        """
        max_val = None
        for vector in sub_Matrix_class_j:
          v=np.max(np.sqrt(np.sum(np.square(sub_Matrix_class_i-vector), axis=1)))
          if max_val is None:
            max_val  = v
          else:
            max_val = max(min_val,v)
        distance_table[dict_list[class_i],dict_list[class_j]] = max_val
        distance_table[dict_list[class_j],dict_list[class_i]] = max_val
        #"""

  print("distance table")
  print(distance_table)
  print()
  print("REGROUP")

  # Let's regroup

  # Push the zero diagonal above every real distance so a cluster's distance
  # to itself is never selected as the minimum.
  max_v = np.max(distance_table)
  for i in range(distance_table.shape[0]):
    distance_table[i,i] = max_v +100
  temp_group=no_groups[:,-1:].flatten()
  min_v = np.min(distance_table)
  # Every pair whose distance equals the minimum is merged in this round, so
  # exact ties are merged together rather than one pair at a time.
  a,b = np.where(distance_table==min_v)
  sorted_list=sorted(list(zip(a,b)), key=lambda x: x[0])
  # Translate table positions back to cluster labels.
  reverse_map_list = [(rev_dict_list[i],rev_dict_list[j]) for i,j in sorted_list]
  # print("*"*40)
  for i,j in reverse_map_list:
    # temp_group is indexed by cluster label here. This relies on a live label
    # always being the index of one of its own members, which holds because
    # labels start as sample indices and a merge keeps the smaller label.
    val_i = temp_group[i]
    val_j = temp_group[j]
    min_ij = min(val_i, val_j)
    temp_group[temp_group==val_i] = min_ij
    temp_group[temp_group==val_j] = min_ij
  # Record this round's labels as a new history column.
  no_groups = np.hstack((no_groups, temp_group.reshape(-1,1)))
  # Mask the merged entries. The table is rebuilt at the top of the next pass,
  # so this only shows in the table left behind after the final pass.
  distance_table[distance_table==min_v] = max_v+100


distance table
[[0.         1.39585017 1.41421356 1.41421356 1.41421356]
 [1.39585017 0.         1.41421356 1.41421356 1.41421356]
 [1.41421356 1.41421356 0.         1.41421356 1.41421356]
 [1.41421356 1.41421356 1.41421356 0.         1.41421356]
 [1.41421356 1.41421356 1.41421356 1.41421356 0.        ]]

REGROUP
distance table
[[0.         1.41421356 1.41421356 1.41421356]
 [1.41421356 0.         1.41421356 1.41421356]
 [1.41421356 1.41421356 0.         1.41421356]
 [1.41421356 1.41421356 1.41421356 0.        ]]

REGROUP


In [137]:
# Distance table left over from the last pass of the loop; the diagonal and
# the merged (minimum-distance) entries were overwritten with max + 100.
distance_table

array([[101.41421356, 101.41421356, 101.41421356, 101.41421356],
       [101.41421356, 101.41421356,   1.41421356,   1.41421356],
       [101.41421356,   1.41421356, 101.41421356,   1.41421356],
       [101.41421356,   1.41421356,   1.41421356, 101.41421356]])

In [138]:
# Merge history: one row per sample, one column per round (column 0 is the
# initial one-cluster-per-sample labelling).
no_groups

array([[0, 0, 0],
       [1, 0, 0],
       [2, 2, 0],
       [3, 3, 0],
       [4, 4, 0]])

In [140]:
# Group the merge history by its label columns, latest round first.
# Column 1 shows samples 0 and 1 sharing label 0 while samples 2, 3 and 4
# still carry their own labels.
new1 = pd.DataFrame(no_groups)
new1.groupby(new1.columns[::-1].tolist()).sum()

2,1,0
0,0,0
0,0,1
0,2,2
0,3,3
0,4,4


## Complete link

In [141]:
from sklearn.cluster import AgglomerativeClustering

# Library reference: complete-link clustering of the same five test vectors
# (rows 50-54) into four clusters.
model = AgglomerativeClustering(linkage='complete', n_clusters=4)
model.fit(Vector_form_test[50:55])

AgglomerativeClustering(linkage='complete', n_clusters=4)

In [142]:
# Cluster label the library assigned to each of the five samples (complete link).
model.labels_ # author's note: the manual version gives the same grouping as the library, for both single and complete link

array([0, 0, 3, 2, 1], dtype=int64)

In [147]:
# Manual complete-link agglomerative clustering of the same five test vectors
# (rows 50-54) that were passed to the library model.
dMatrix_temp = Vector_form_test[50:55, :].copy()
# no_groups keeps the merge history: column 0 gives every sample its own label
# (0..n-1) and each pass of the loop below appends one column of updated labels.
no_groups = np.arange(0, dMatrix_temp.shape[0]).reshape(-1, 1)
# Stop once the newest column contains a single label, i.e. one cluster is left.
while len(set(no_groups[:, -1:].flatten())) != 1:
  ## keep combining
  map_labels = no_groups[:, -1].flatten()  # current label of every sample (1D)
  list_labels = list(set(map_labels))
  map_list = list(range(len(list_labels)))
  # dict_list: cluster label -> row/column position in distance_table;
  # rev_dict_list: position -> cluster label.
  dict_list = dict(zip(list_labels, map_list))
  rev_dict_list = dict(zip(map_list, list_labels))
  #
  distance_table = np.zeros((len(list_labels), len(list_labels)))
  for class_i in list_labels:
    for class_j in list_labels:
      # Visit each unordered pair once; both symmetric entries are filled below.
      if class_i < class_j:
        sub_Matrix_class_i = dMatrix_temp[map_labels == class_i]
        sub_Matrix_class_j = dMatrix_temp[map_labels == class_j]

        # Complete link: the cluster distance is the largest Euclidean distance
        # between any member of class_i and any member of class_j.
        max_val = None
        for vector in sub_Matrix_class_j:
          v = np.max(np.sqrt(np.sum(np.square(sub_Matrix_class_i - vector), axis=1)))
          if max_val is None:
            max_val = v
          else:
            # Keep the running maximum. The previous max(min_val, v) read a
            # variable this cell never assigns (NameError on a fresh kernel,
            # stale single-link value otherwise) and dropped the maximum
            # accumulated so far.
            max_val = max(max_val, v)
        distance_table[dict_list[class_i], dict_list[class_j]] = max_val
        distance_table[dict_list[class_j], dict_list[class_i]] = max_val

  print("distance table")
  print(distance_table)
  print()
  print("REGROUP")

  # Let's regroup

  # Push the zero diagonal above every real distance so a cluster's distance
  # to itself is never selected as the minimum.
  max_v = np.max(distance_table)
  for i in range(distance_table.shape[0]):
    distance_table[i, i] = max_v + 100
  temp_group = no_groups[:, -1:].flatten()
  min_v = np.min(distance_table)
  # Every pair whose distance equals the minimum is merged in this round, so
  # exact ties are merged together rather than one pair at a time.
  a, b = np.where(distance_table == min_v)
  sorted_list = sorted(list(zip(a, b)), key=lambda x: x[0])
  # Translate table positions back to cluster labels.
  reverse_map_list = [(rev_dict_list[i], rev_dict_list[j]) for i, j in sorted_list]
  for i, j in reverse_map_list:
    # temp_group is indexed by cluster label here. This relies on a live label
    # always being the index of one of its own members, which holds because
    # labels start as sample indices and a merge keeps the smaller label.
    val_i = temp_group[i]
    val_j = temp_group[j]
    min_ij = min(val_i, val_j)
    temp_group[temp_group == val_i] = min_ij
    temp_group[temp_group == val_j] = min_ij
  # Record this round's labels as a new history column.
  no_groups = np.hstack((no_groups, temp_group.reshape(-1, 1)))
  # Mask the merged entries. The table is rebuilt at the top of the next pass,
  # so this only shows in the table left behind after the final pass.
  distance_table[distance_table == min_v] = max_v + 100


distance table
[[0.         1.39585017 1.41421356 1.41421356 1.41421356]
 [1.39585017 0.         1.41421356 1.41421356 1.41421356]
 [1.41421356 1.41421356 0.         1.41421356 1.41421356]
 [1.41421356 1.41421356 1.41421356 0.         1.41421356]
 [1.41421356 1.41421356 1.41421356 1.41421356 0.        ]]

REGROUP
distance table
[[0.         1.41421356 1.41421356 1.41421356]
 [1.41421356 0.         1.41421356 1.41421356]
 [1.41421356 1.41421356 0.         1.41421356]
 [1.41421356 1.41421356 1.41421356 0.        ]]

REGROUP
distance table
[[0.         1.41421356]
 [1.41421356 0.        ]]

REGROUP


In [148]:
# Distance table left over from the last pass of the loop; the diagonal and
# the merged (minimum-distance) entries were overwritten with max + 100.
distance_table

array([[101.41421356, 101.41421356],
       [101.41421356, 101.41421356]])

In [149]:
# Merge history: one row per sample, one column per round (column 0 is the
# initial one-cluster-per-sample labelling).
no_groups

array([[0, 0, 0, 0],
       [1, 0, 0, 0],
       [2, 2, 0, 0],
       [3, 3, 3, 0],
       [4, 4, 0, 0]])

In [150]:
# Group the merge history by its label columns, latest round first.
# Column 1 shows samples 0 and 1 sharing label 0 while samples 2, 3 and 4
# still carry their own labels - the same four-way partition as the library's
# array([0, 0, 3, 2, 1], dtype=int64).
new1 = pd.DataFrame(no_groups)
new1.groupby(new1.columns[::-1].tolist()).sum()

3,2,1,0
0,0,0,0
0,0,0,1
0,0,2,2
0,0,4,4
0,3,3,3


## Done