In [9]:
# third parties module
import pandas as pd
import numpy as np
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# local module
from utils import load_data

In [10]:
# load data
data = load_data ('text')
# show some data
print ('\n'.join (np.unique (data['cluster'])))
print (data.head (5))

alt.atheism
comp.graphics
comp.os.ms-windows.misc
comp.sys.ibm.pc.hardware
comp.sys.mac.hardware
comp.windows.x
misc.forsale
rec.autos
rec.motorcycles
rec.sport.baseball
rec.sport.hockey
sci.crypt
sci.electronics
sci.med
sci.space
soc.religion.christian
talk.politics.guns
talk.politics.mideast
talk.politics.misc
talk.religion.misc
                                                text                cluster
0  I was wondering if anyone out there could enli...              rec.autos
1  A fair number of brave souls who upgraded thei...  comp.sys.mac.hardware
2  well folks, my mac plus finally gave up the gh...  comp.sys.mac.hardware
3  \nDo you have Weitek's address/phone number?  ...          comp.graphics
4  From article <C5owCB.n3p@world.std.com>, by to...              sci.space


In [11]:
# remove stop word and basic preprocessing
stopword = nltk.corpus.stopwords.words ('english')
punctuation = string.punctuation + '@<?'
def preprocessing (text):
    # first lowering
    text = text.lower ()
    # replace punctuation
    for punc in punctuation:
        text.replace (punc, ' ')
    # replace stopwords
    words = nltk.word_tokenize (text)
    words = [w for w in words if w not in stopword and len (w) >= 3]
    
    return ' '.join (words)

data['token'] = data['text'].apply (preprocessing)    

In [12]:
# TF-IDF top 1000 --> normalize
vectorizer = TfidfVectorizer (max_features=1000)
features = vectorizer.fit_transform (data['token'])
print (features.shape)

(11314, 1000)


In [13]:
# clustering using K-Means
km = KMeans (n_clusters=10)
print ("Clustering")
km.fit (features)

Clustering


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [14]:
# find label by terms used
centroid_idx = km.cluster_centers_.argsort ()[:,::-1]
terms = vectorizer.get_feature_names ()
for k in range (10):
    label = []
    for t_idx in centroid_idx[k,:5]:
        label += [terms[t_idx]]
    print ("Cluster {}, top centroid : {}".format (k, ' '.join (label)))

Cluster 0, top centroid : edu soon university cs email
Cluster 1, top centroid : would people one like think
Cluster 2, top centroid : god jesus bible christ one
Cluster 3, top centroid : drive scsi drives hard disk
Cluster 4, top centroid : file files windows directory format
Cluster 5, top centroid : game team year games players
Cluster 6, top centroid : use card video monitor system
Cluster 7, top centroid : thanks please anyone mail advance
Cluster 8, top centroid : one new like get know
Cluster 9, top centroid : windows window dos program use


In [15]:
print (km.labels_[:10])
print (data.loc[7, 'cluster'])

[7 7 6 8 8 8 7 3 6 4]
comp.sys.ibm.pc.hardware
