In [1]:
import os
import pandas as pd
import glob

# Load every '*.data' topic file under ~/temp/topics into one DataFrame with a
# 'name' column (file stem) and a 'text' column (the file rendered as a string).
HOME = os.path.expanduser('~')
# Build paths with os.path.join so the separator matches the host platform.
path = os.path.join(HOME, 'temp', 'topics')
# NOTE(review): glob returns files in arbitrary order, and the row indices shown
# in later outputs depend on that order.
files = glob.glob(os.path.join(path, '*.data'))
names = []
texts = []

for file in files:
    df = pd.read_table(file, index_col=None, header=0, encoding='latin1')

    # basename() strips the directory part on any platform (splitting on '/'
    # fails when glob returns backslash-separated paths); everything after the
    # first '.' is dropped, exactly as before.
    name = os.path.basename(file).split('.')[0]
    names.append(name)
    texts.append(df.to_string())

docs = pd.DataFrame({'name': names, 'text': texts})
docs

Unnamed: 0,name,text
0,satellite_garmin_nuvi_255W_gps,...
1,comfort_toyota_camry_2007,...
2,gas_mileage_toyota_camry_2007,...
3,screen_netbook_1005ha,...
4,features_windows7,...
5,mileage_honda_accord_2008,...
6,keyboard_netbook_1005ha,...
7,video_ipod_nano_8gb,...
8,screen_ipod_nano_8gb,...
9,interior_honda_accord_2008,...


In [2]:
from nltk.stem import WordNetLemmatizer
import nltk
import string

# Translation table for str.translate(): maps the code point of every character
# in string.punctuation to None, which deletes that character.
remove_punct_dict = {ord(symbol): None for symbol in string.punctuation}
# Single WordNet lemmatizer instance shared by the tokenizer helpers.
lemmar = WordNetLemmatizer()

def LemTokens(tokens):
    """Return a list holding the lemmatized form of each token, in order."""
    return list(map(lemmar.lemmatize, tokens))

def LemNormalize(text):
    """Lower-case ``text``, delete punctuation, tokenize it and lemmatize the tokens."""
    stripped = text.lower().translate(remove_punct_dict)
    words = nltk.word_tokenize(stripped)
    return LemTokens(words)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, tokenized by the lemmatizing LemNormalize helper.
# NOTE(review): the recorded output shows a fragment of a stop_words consistency
# warning; presumably the lemmatized tokens do not line up with the built-in
# English stop list. Confirm before relying on the stop-word filtering.
tf = TfidfVectorizer(
    tokenizer=LemNormalize,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.05,
    max_df=0.85,
)
feat = tf.fit_transform(docs['text'])

  'stop_words.' % sorted(inconsistent))


In [4]:
from sklearn.cluster import KMeans

# Cluster the TF-IDF matrix into five groups; fit() returns the estimator itself.
km = KMeans(n_clusters=5, max_iter=10000, random_state=0).fit(feat)
labels, centers = km.labels_, km.cluster_centers_
# Attach each document's cluster id to the frame and display it.
docs['labels'] = labels
docs

Unnamed: 0,name,text,labels
0,satellite_garmin_nuvi_255W_gps,...,1
1,comfort_toyota_camry_2007,...,4
2,gas_mileage_toyota_camry_2007,...,4
3,screen_netbook_1005ha,...,2
4,features_windows7,...,1
5,mileage_honda_accord_2008,...,4
6,keyboard_netbook_1005ha,...,2
7,video_ipod_nano_8gb,...,0
8,screen_ipod_nano_8gb,...,2
9,interior_honda_accord_2008,...,4


In [5]:
# Documents assigned to cluster 0, ordered by name.
docs.loc[docs['labels'].eq(0)].sort_values(by='name')

Unnamed: 0,name,text,labels
28,battery-life_amazon_kindle,...,0
40,battery-life_ipod_nano_8gb,...,0
17,battery-life_netbook_1005ha,...,0
49,performance_netbook_1005ha,...,0
38,sound_ipod_nano_8gb,headphone jack i got a clear case for it a...,0
7,video_ipod_nano_8gb,...,0


In [6]:
# Documents assigned to cluster 1, ordered by name.
docs.loc[docs['labels'].eq(1)].sort_values(by='name')

Unnamed: 0,name,text,labels
30,accuracy_garmin_nuvi_255W_gps,...,1
20,directions_garmin_nuvi_255W_gps,...,1
24,display_garmin_nuvi_255W_gps,...,1
4,features_windows7,...,1
0,satellite_garmin_nuvi_255W_gps,...,1
33,screen_garmin_nuvi_255W_gps,...,1
19,speed_garmin_nuvi_255W_gps,...,1
37,updates_garmin_nuvi_255W_gps,...,1
48,voice_garmin_nuvi_255W_gps,...,1


In [7]:
# Documents assigned to cluster 2, ordered by name.
docs.loc[docs['labels'].eq(2)].sort_values(by='name')

Unnamed: 0,name,text,labels
12,buttons_amazon_kindle,...,2
14,eyesight-issues_amazon_kindle,...,2
41,fonts_amazon_kindle,...,2
6,keyboard_netbook_1005ha,...,2
35,navigation_amazon_kindle,...,2
8,screen_ipod_nano_8gb,...,2
3,screen_netbook_1005ha,...,2
22,size_asus_netbook_1005ha,...,2
42,speed_windows7,...,2


In [8]:
# Documents assigned to cluster 3, ordered by name.
docs.loc[docs['labels'].eq(3)].sort_values(by='name')

Unnamed: 0,name,text,labels
10,bathroom_bestwestern_hotel_sfo,...,3
25,food_holiday_inn_london,...,3
11,food_swissotel_chicago,...,3
16,free_bestwestern_hotel_sfo,...,3
26,location_bestwestern_hotel_sfo,...,3
36,location_holiday_inn_london,...,3
39,parking_bestwestern_hotel_sfo,...,3
13,price_amazon_kindle,...,3
32,price_holiday_inn_london,...,3
21,room_holiday_inn_london,...,3


In [9]:
# Documents assigned to cluster 4, ordered by name.
docs.loc[docs['labels'].eq(4)].sort_values(by='name')

Unnamed: 0,name,text,labels
23,comfort_honda_accord_2008,...,4
1,comfort_toyota_camry_2007,...,4
2,gas_mileage_toyota_camry_2007,...,4
9,interior_honda_accord_2008,...,4
47,interior_toyota_camry_2007,...,4
5,mileage_honda_accord_2008,...,4
43,performance_honda_accord_2008,...,4
27,quality_toyota_camry_2007,...,4
44,seats_honda_accord_2008,...,4
45,transmission_toyota_camry_2007,...,4


In [10]:
# Re-cluster the same TF-IDF matrix into three groups; fit() returns the estimator.
km = KMeans(n_clusters=3, max_iter=10000, random_state=0).fit(feat)
labels, centers = km.labels_, km.cluster_centers_

# Overwrite the previous assignment with the new labels and list rows by cluster.
docs['labels'] = labels
docs.sort_values(by='labels')

Unnamed: 0,name,text,labels
44,seats_honda_accord_2008,...,0
23,comfort_honda_accord_2008,...,0
45,transmission_toyota_camry_2007,...,0
27,quality_toyota_camry_2007,...,0
47,interior_toyota_camry_2007,...,0
9,interior_honda_accord_2008,...,0
2,gas_mileage_toyota_camry_2007,...,0
1,comfort_toyota_camry_2007,...,0
5,mileage_honda_accord_2008,...,0
43,performance_honda_accord_2008,...,0


In [11]:
# Centroid matrix of the fitted model; print its shape followed by the values.
centers = km.cluster_centers_
print(centers.shape, centers, sep=' ')

(3, 4611) [[0.         0.00092551 0.         ... 0.         0.         0.        ]
 [0.01005322 0.         0.         ... 0.00706287 0.         0.        ]
 [0.         0.00099499 0.00174637 ... 0.         0.00183397 0.00144581]]


In [12]:
def get_cluster_details(model, data, names, nums, top_n=10):
    """Summarise each cluster by its heaviest centroid features and its members.

    Parameters
    ----------
    model : object exposing ``cluster_centers_`` (one row per cluster).
    data : DataFrame with a ``'labels'`` column and a ``'name'`` column.
    names : sequence indexed by feature position, giving each feature's name.
    nums : number of clusters to describe (ids ``0 .. nums - 1``).
    top_n : how many of the highest-valued centroid features to keep.

    Returns
    -------
    dict mapping cluster id to a dict with the keys ``'cluster'``,
    ``'top_features'``, ``'top_features_value'`` and ``'filenames'``.
    """
    centers = model.cluster_centers_
    # Feature indices per cluster, ordered from largest to smallest centroid value.
    ranked = centers.argsort()[:, ::-1]

    def describe(cluster_id):
        top_idx = ranked[cluster_id, :top_n]
        members = data.loc[data['labels'] == cluster_id, 'name']
        return {
            'cluster': cluster_id,
            'top_features': [names[j] for j in top_idx],
            'top_features_value': centers[cluster_id, top_idx].tolist(),
            'filenames': members.values.tolist(),
        }

    return {cluster_id: describe(cluster_id) for cluster_id in range(nums)}

In [13]:
def print_cluster_details(details):
    """Print a short report for each cluster in ``details``.

    For every entry this prints the cluster id, its ``'top_features'`` list,
    at most the first seven ``'filenames'``, and a separator line.
    """
    separator = '===================================='
    for cluster_id in details:
        info = details[cluster_id]
        print('### Cluster {0}'.format(cluster_id))
        print('Top Features: ', info['top_features'])
        # Only the first seven file names are shown.
        print('File Name: ', info['filenames'][:7])
        print(separator)

In [14]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tf, 'get_feature_names_out'):
    fnames = list(tf.get_feature_names_out())
else:
    fnames = tf.get_feature_names()
# Describe every cluster of the fitted model instead of a hard-coded count.
details = get_cluster_details(model=km, data=docs, names=fnames, nums=km.n_clusters, top_n=10)
print_cluster_details(details)

### Cluster 0
Top Features:  ['interior', 'seat', 'mileage', 'comfortable', 'gas', 'gas mileage', 'transmission', 'car', 'performance', 'quality']
File Name:  ['comfort_toyota_camry_2007', 'gas_mileage_toyota_camry_2007', 'mileage_honda_accord_2008', 'interior_honda_accord_2008', 'comfort_honda_accord_2008', 'quality_toyota_camry_2007', 'performance_honda_accord_2008']
### Cluster 1
Top Features:  ['screen', 'battery', 'keyboard', 'battery life', 'life', 'kindle', 'direction', 'video', 'size', 'voice']
File Name:  ['satellite_garmin_nuvi_255W_gps', 'screen_netbook_1005ha', 'features_windows7', 'keyboard_netbook_1005ha', 'video_ipod_nano_8gb', 'screen_ipod_nano_8gb', 'buttons_amazon_kindle']
### Cluster 2
Top Features:  ['room', 'hotel', 'service', 'staff', 'food', 'location', 'bathroom', 'clean', 'price', 'parking']
File Name:  ['bathroom_bestwestern_hotel_sfo', 'food_swissotel_chicago', 'rooms_swissotel_chicago', 'free_bestwestern_hotel_sfo', 'rooms_bestwestern_hotel_sfo', 'room_holid