# Clusterização de emails
### André Almeida, 164047
### Igor Torrente, 169820

In [1]:
### Bibliotecas utilizadas ###
import os
import sys
import time

import matplotlib.pyplot as plt
import nltk
import numpy as np
from numpy import genfromtxt
from scipy.spatial.distance import cdist
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale

## III. Tratamento de dados

#### Utilizando a base `data.csv`

In [2]:
# Load the feature matrix from the CSV database (data.csv).
# The path is relative to the notebook's working directory.
database = np.genfromtxt(fname='Database/data.csv', delimiter=',')

OSError: Database/data.csv not found.

#### Funções de pré-processamento para tratamentos alternativos

In [4]:
def getFiles(path, data_size=19393):
    """Load a numbered series of text files into a list of strings.

    Reads ``<path>0.txt``, ``<path>1.txt``, ... up to index ``data_size - 1``,
    in that order. ``path`` is a string prefix (directory plus file-name stem)
    to which the index and the ``.txt`` extension are appended.

    Returns:
        A list holding the full contents of each file, one entry per index.
        Errors raised by ``open`` (e.g. a missing file) are not caught here.
    """
    def read_one(index):
        # Each file is opened and closed on its own so no handle outlives its read.
        with open(path + str(index) + ".txt", 'r') as handle:
            return handle.read()

    return [read_one(index) for index in range(data_size)]

def getData(stopwords=False, stem=None):
    """Load one of the pre-generated text corpora through ``getFiles``.

    Args:
        stopwords: truthy selects the ``sw`` variant of the corpus, falsy the
            ``original`` variant.
        stem: ``None`` for the un-stemmed corpus, or one of ``"lanc"``,
            ``"porter"``, ``"snow"``, ``"wn"`` for the matching stemmed corpus.

    Returns:
        The list of documents returned by ``getFiles`` for the selected
        corpus, or ``None`` when ``stem`` is not one of the values above.
    """
    print("stopwords:", stopwords, " stem:", stem)

    # Directory layout: data/no-stem/<variant>/<variant>N.txt
    #                   data/stem/<variant>/<stem>/<stem>N.txt
    variant = "sw" if stopwords else "original"

    if stem == None:
        return getFiles("data/" + "no-stem/" + variant + "/" + variant)

    if stem in ("lanc", "porter", "snow", "wn"):
        return getFiles("data/" + "stem/" + variant + "/" + stem + "/" + stem)

    # Unrecognised stemmer name: nothing is loaded.
    return None

def printClusterTerms(km, vectorizer):
    """Print the size and the ten highest-weighted terms of every cluster.

    Args:
        km: fitted clustering model exposing ``labels_``, ``cluster_centers_``
            and ``n_clusters``.
        vectorizer: fitted vectorizer whose feature names label the columns of
            ``km.cluster_centers_``.

    One line is printed per cluster: its index, the number of documents
    assigned to it, and up to ten terms ordered by decreasing centroid weight.
    """
    # Newer scikit-learn releases expose get_feature_names_out(); older ones
    # only have get_feature_names(). Use whichever the vectorizer provides.
    names_getter = getattr(vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    terms = names_getter()

    labels = np.asarray(km.labels_)
    # argsort is ascending, so reverse each row to get the heaviest terms first.
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    for i in range(km.n_clusters):
        doc_count = np.count_nonzero(labels == i)
        top_terms = [terms[ind] for ind in order_centroids[i, :10]]
        print("Cluster {:>2} ({:>4} docs): ".format(i, doc_count), end='')
        print(''.join(' {}'.format(term) for term in top_terms))
        
def trainCustomSets(data, k):
    """Vectorize ``data`` with TF-IDF, cluster it with K-Means and report the result.

    Args:
        data: the documents handed to ``TfidfVectorizer.fit_transform``.
        k: number of clusters.

    Returns:
        The value of ``KMeans.score`` on the TF-IDF matrix.

    Side effects:
        Rebinds the module-level name ``km`` to the fitted model, prints the
        score, and prints each cluster's top terms via ``printClusterTerms``.
    """
    global km
    print("Number of clusters:", k)

    # min_df / max_df bound the document frequency of the terms that are kept.
    vectorizer = TfidfVectorizer(min_df=2, max_df=800)
    tfidf_matrix = vectorizer.fit_transform(data)

    model = KMeans(n_clusters=k, random_state=0, n_jobs=-1)
    km = model.fit(tfidf_matrix)

    score = km.score(tfidf_matrix)
    print("scr: ", score)
    printClusterTerms(km, vectorizer)
    return score

#### Função que aplica PCA

In [5]:
def applyPCA(perc, database):
    """Fit a PCA on ``database`` and return the data projected onto it.

    Args:
        perc: forwarded unchanged as ``n_components`` to ``PCA``.
        database: the data matrix to fit and transform.

    Returns:
        The transformed data produced by ``PCA.transform``.

    Also prints the sum of ``explained_variance_ratio_`` over the components
    that were kept.
    """
    reducer = PCA(n_components=perc)
    reducer.fit(database)
    projected = reducer.transform(database)

    # NOTE(review): the label reads "data removed/reduced", but the value is
    # the summed explained-variance ratio of the kept components — confirm
    # the intended wording.
    print("Dados removidos/reduzidos: ", sum(reducer.explained_variance_ratio_))

    return projected

## IV. Soluções propostas

### _A. K-Means sem PCA_

In [None]:
# Clustering: K-Means on the raw feature matrix (no PCA) for k = 2..200.
# For every k three metrics are collected (KMeans.score, Calinski-Harabasz,
# silhouette), printed, and appended to Results/kmeansSemPCA.txt.
#
# NOTE(review): the `n_jobs` argument of KMeans and
# `metrics.calinski_harabaz_score` are not available in recent scikit-learn
# releases — confirm the installed version before re-running this cell.

scr_vector = []
calinski_scr_vector = []
silhoette_scr_vector = []

# The context manager closes the results file even if a fit fails mid-sweep.
with open("Results/kmeansSemPCA.txt", "w") as file:
    for x in range(2, 201):

        km = KMeans(n_clusters=x, random_state=0, n_jobs=(-1)).fit(database)
        scr = km.score(database)
        calinski_scr = metrics.calinski_harabaz_score(database, km.labels_)
        # The silhouette is estimated on a random subsample of 8000 points;
        # seeding the sampler keeps the value reproducible between runs.
        silhoette_scr = metrics.silhouette_score(database, km.labels_, metric='euclidean',
                                                 sample_size=8000, random_state=0)

        scr_vector.append(scr)
        calinski_scr_vector.append(calinski_scr)
        silhoette_scr_vector.append(silhoette_scr)
        print(str(scr), "  ",str(calinski_scr), "  ", str(silhoette_scr))

        file.write("Kmeans: N°: "+ str(x) +" - score= "+ str(scr) +" calinski= "+ str(calinski_scr)+" silhoette="+str(silhoette_scr)+"\n")
        # Flush after every k so partial results survive an interrupted run.
        file.flush()

    file.write("\n\nscore: " + str(scr_vector))
    file.write("\n\ncalinski: " + str(calinski_scr_vector))
    file.write("\n\nsilhoette: " + str(silhoette_scr_vector))

#### Gráficos 

In [None]:
# One line chart per metric collected by the K-Means sweep without PCA:
# score, Calinski score and silhouette score, each against the cluster count.
for values, y_label, chart_title in (
    (scr_vector, 'Score', 'K-Means score vs Nº cluster'),
    (calinski_scr_vector, 'calinski score', 'Calinski score vs Nº cluster'),
    (silhoette_scr_vector, 'silhoette score', 'silhoette score vs Nº cluster'),
):
    plt.plot(range(2,201), values)
    plt.ylabel(y_label)
    plt.xlabel('Nº clusters')
    plt.title(chart_title)
    plt.show()

### _B. K-Means com PCA_

In [None]:
# Apply PCA to the database, passing 0.85 as the component setting.
pca_db = applyPCA(perc=0.85, database=database)

In [None]:
# Clustering: K-Means on PCA-reduced data, for five PCA settings and
# k = 2..200. The three metrics of every (PCA setting, k) pair are stored in
# one matrix per metric (row = PCA setting, column = k - 2), printed, and
# appended to Results/kmeansComPCA.txt.
#
# NOTE(review): the `n_jobs` argument of KMeans and
# `metrics.calinski_harabaz_score` are not available in recent scikit-learn
# releases — confirm the installed version before re-running this cell.

pca_values = [0.85, 0.90, 0.93, 0.95, 0.97]

# Each metric needs its OWN matrix. A chained assignment (a = b = c = [...])
# would bind all three names to the same list of lists, so every metric would
# overwrite the others and all three plots would show the last one written.
matrix_scr = [[0]*199 for _ in pca_values]
matrix_calinski_scr = [[0]*199 for _ in pca_values]
matrix_silhoette_scr = [[0]*199 for _ in pca_values]

# The context manager closes the results file even if a fit fails mid-sweep.
with open("Results/kmeansComPCA.txt", "w") as file:
    for row, x in enumerate(pca_values):
        pca_db = applyPCA(x, database)
        for y in range(2,201):
            km = KMeans(n_clusters=y, random_state=0, n_jobs=(-1)).fit(pca_db)
            scr = km.score(pca_db)
            calinski_scr = metrics.calinski_harabaz_score(pca_db, km.labels_)
            # The silhouette is estimated on a random subsample of 8000 points;
            # seeding the sampler keeps the value reproducible between runs.
            silhoette_scr = metrics.silhouette_score(pca_db, km.labels_, metric='euclidean',
                                                     sample_size=8000, random_state=0)

            matrix_scr[row][y-2] = scr
            matrix_calinski_scr[row][y-2] = calinski_scr
            matrix_silhoette_scr[row][y-2] = silhoette_scr
            print(str(scr), "  ",str(calinski_scr), "  ", str(silhoette_scr))

            file.write("Kmeans: PCA:"+ str(x) +" N°:"+ str(y)+"  - score= "+ str(scr) +" calinski= "+ str(calinski_scr)+" silhoette="+str(silhoette_scr)+"\n")
            # Flush after every fit so partial results survive an interrupted run.
            file.flush()

    file.write("\n\nscore: " + str(matrix_scr))
    file.write("\n\ncalinski: " + str(matrix_calinski_scr))
    file.write("\n\nsilhoette: " + str(matrix_silhoette_scr))

#### Gráficos 

In [None]:
# One chart per metric collected by the K-Means sweep with PCA. Every chart
# overlays the five PCA settings (rows 0..4 of the metric's matrix) against
# the cluster count.
for matrix, y_label, chart_title in (
    (matrix_scr, 'Score', 'K-Means score vs Nº cluster'),
    (matrix_calinski_scr, 'calinski score', 'Calinski score vs Nº cluster'),
    (matrix_silhoette_scr, 'silhoette score', 'silhoette score vs Nº cluster'),
):
    for pca_row in range(5):
        plt.plot(range(2,201), matrix[pca_row])
    plt.ylabel(y_label)
    plt.xlabel('Nº clusters')
    plt.legend(['PCA = 0.85', 'PCA = 0.90', 'PCA = 0.93', 'PCA = 0.95', 'PCA = 0.97' ], loc=4)
    plt.title(chart_title)
    plt.show()

### _C. K-Means nas bases geradas_

In [None]:
# K-Means on the generated text corpora (stop-word variant, one corpus per
# stemmer) for the two candidate cluster counts, 47 and 80. The score of
# every (cluster count, stemmer) pair is appended to
# Results/kmeansCustomSet.txt.
scr_vector = []

# The context manager closes the results file even if a run fails part-way.
with open("Results/kmeansCustomSet.txt", "w") as file:
    for x in [47, 80]:
        for y in ["wn", "lanc", "snow", "porter"]:
            scr = trainCustomSets(getData(True, y), x)

            scr_vector.append(scr)

            # One record per line, naming the stemmer so that runs sharing a
            # cluster count can be told apart in the results file.
            file.write("Kmeans: N°: "+ str(x) +" stem: "+ y +" - score= "+ str(scr) +"\n")
            file.flush()

    file.write("\n\nscore: " + str(scr_vector))

### Avaliação das prováveis quantidades de clusters

In [6]:
def plotclusters(km, clusters, data=None):
    """Draw the K-Means decision regions over a 2-D projection of the data.

    Args:
        km: fitted model exposing ``predict`` and ``cluster_centers_``; it is
            called on two-column points, so it must have been fitted on 2-D
            data.
        clusters: cluster count, used only in the plot title.
        data: two-column array of points to draw. Defaults to the
            module-level ``reduced_data``, which is what earlier versions of
            this function always used.
    """
    if data is None:
        data = reduced_data

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    # The grid spans at least 2 units per axis (1 unit of padding on each
    # side), so at h = .001 it holds at least 2000 x 2000 points.
    h = .001

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1
    y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Obtain the cluster label of each point in the mesh from the given model.
    Z = km.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')

    plt.plot(data[:, 0], data[:, 1], 'k.', markersize=2)
    # Plot the centroids as white dots
    centroids = km.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='.', s=10, linewidths=3,
                color='w', zorder=10)
    plt.title('K-means clustering with '+ str(clusters)+ ' clusters\nCentroids are marked with white dots')
    # Trim most of the 1-unit padding that was added around the data above.
    plt.xlim(x_min+0.85, x_max - 0.9)
    plt.ylim(y_min+0.9, y_max - 0.85)
    plt.xticks(())
    plt.yticks(())
    plt.show()

In [62]:
# Helper used to obtain each cluster's medoid and, when called again with the
# medoid itself as the reference, the point closest to that medoid.
def getMedoid(allset, centroid):
    """Return the point of ``allset`` nearest to ``centroid`` and its index.

    Points at distance exactly zero are skipped, so calling this with a
    reference that is itself a member of ``allset`` yields its nearest *other*
    point. On ties the lowest index wins.

    Args:
        allset: sequence of points, one per row (a 1-D sequence is treated as
            a list of scalars).
        centroid: the reference point, with the same number of coordinates as
            a row of ``allset``.

    Returns:
        ``(medoid, mindice)``: the selected point and its position in
        ``allset``.

    Raises:
        ValueError: if ``allset`` is empty or contains no point at a finite,
            non-zero distance from ``centroid``.
    """
    points = np.asarray(allset)
    if points.size == 0:
        raise ValueError("getMedoid: 'allset' is empty")

    # One row per candidate, so a single cdist call measures all of them
    # instead of one call per point.
    rows = points.reshape(len(points), -1)
    distances = cdist(np.atleast_2d(centroid), rows)[0]

    # Same candidates as a strict `curdist < inf and curdist != 0` scan.
    candidates = np.isfinite(distances) & (distances > 0)
    if not candidates.any():
        raise ValueError("getMedoid: no point at a non-zero distance from the reference")

    # argmin returns the first minimum, matching a left-to-right strict '<' scan.
    mindice = int(np.argmin(np.where(candidates, distances, np.inf)))
    medoid = points[mindice].copy()
    return medoid, mindice

In [8]:
# Fit the two candidate models on the full feature matrix: 47 and 80 clusters.
km = KMeans(
    n_clusters=47, random_state=0, n_jobs=-1
).fit(database)
km2 = KMeans(
    n_clusters=80, random_state=0, n_jobs=-1
).fit(database)

In [53]:
# Print arrays in full (no "..." summarisation) so that str(array) written to
# the result files contains every value. sys.maxsize is used as the threshold
# because NaN is rejected by current NumPy versions.
np.set_printoptions(threshold=sys.maxsize)

In [64]:
# For each centroid of the 47-cluster model, write the medoid (the data point
# nearest to the centroid) to one file and, to a second file, the medoid's
# index together with the index of the data point nearest to that medoid.
# The context managers close both files even if a lookup fails part-way.
with open("Results/Medoids47Clusters.txt", "w") as file, \
        open("Results/MaisProximos47Clusters.txt", "w") as file2:

    file2.write("medoid_id   closest_id\n")
    for x in km.cluster_centers_:
        medoid_position, medoid_id = getMedoid(database, x)
        # getMedoid skips zero-distance points, so searching from the medoid
        # does not return the medoid itself.
        closest_position, closest_id = getMedoid(database, medoid_position)
        file.write(str(medoid_position)+"\n")
        file2.write(str(medoid_id)+" , "+str(closest_id)+"\n")

In [38]:
# For each centroid of the 80-cluster model, write the medoid (the data point
# nearest to the centroid) to one file and its index to a second file.
# The context managers close both files even if a lookup fails part-way.
#
# NOTE(review): unlike the 47-cluster cell, this one writes no header and no
# "closest to the medoid" id — confirm whether the two outputs should match.
with open("Results/Medoids80Clusters.txt", "w") as file, \
        open("Results/MaisProximos80Clusters.txt", "w") as file2:

    for x in km2.cluster_centers_:
        currmedois, closest = getMedoid(database, x)
        file.write(str(currmedois)+"\n\n")
        file2.write(str(closest)+"\n")

In [None]:
# Reduce the feature matrix to two dimensions and refit K-Means on the
# projection with the chosen cluster counts (47 and 80).
# Note: this rebinds `km` and `km2`, replacing the models fitted earlier on
# the full-dimensional data.
reduced_data = PCA(n_components=2).fit_transform(database)
km = KMeans(
    n_clusters=47, random_state=0, n_jobs=-1
).fit(reduced_data)
km2 = KMeans(
    n_clusters=80, random_state=0, n_jobs=-1
).fit(reduced_data)

In [None]:
# Plot the Voronoi-style cluster diagrams for 47 and 80 clusters.
plotclusters(km=km, clusters=47)
plotclusters(km=km2, clusters=80)