In [128]:
import string
import collections
 
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
 
 
def process_text(text, stem=True):
    """ Tokenize text and stem words removing punctuation """
    transtable = {ord(s):None for s in string.punctuation}
    transtable[ord('/')] = u' '
    text = text.translate(transtable)
    tokens = word_tokenize(text)
 
    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(t) for t in tokens]
 
    return tokens
 
 
def cluster_texts(texts, clusters=10):
    """ Transform texts to Tf-Idf coordinates and cluster texts using K-Means """
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=stopwords.words('english'),
                                 max_df=0.6,
                                 min_df=0.05,
                                 lowercase=True)
 
    tfidf_model = vectorizer.fit_transform(texts)
    km_model = KMeans(n_clusters=clusters)
    km_model.fit(tfidf_model)
    
    order_centroids = km_model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(clusters):
        print('Cluster %d:' % i)
    for ind in order_centroids[i, :10]:
        print('%s' % terms[ind])
 
    clustering = collections.defaultdict(list)
 
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
 
    return clustering

http://stackoverflow.com/questions/23175809/typeerror-translate-takes-one-argument-2-given-python

In [58]:
process_text(" To$kenize text/removing pu&nctua^tion ",stem=False)

['Tokenize', 'text', 'removing', 'punctuation']

In [113]:
cluster_texts(['hello','goodbye','stay here'])

defaultdict(list, {0: [1], 1: [2], 2: [0]})

In [63]:
ls ..

Demo_code.ipynb         demo.ipynb              [34mpdf[m[m/
Jobsnew.csv             doc_cluster.pkl         resources.md
[34mcode[m[m/                   doc_cluster.pkl_01.npy  the_plan......md
datahack.yml            doc_cluster.pkl_02.npy


In [67]:
import pandas as pd

data = pd.read_csv('../Jobsnew.csv',delimiter='\t',encoding='latin-1')
data

Unnamed: 0,job_id,title,raw_location,location_id,subclasses,salary_type,salary_min,salary_max,raw_job_type,abstract,Segment
0,0184446dde70c6f730c33b6d9581c70a,Delicatessen Supervisor,"Edmonton, QLD 4869",geonamezip:AU-4869-edmonton,6046.0,yearly,40000,55000,Full Time,We are looking for suitable applicants to fill...,1
1,0011b96d99fbb502ec9b375a9ce0a441,Brickies Labourer,"Maroubra, Eastern Suburbs",geonameid:2158651,,yearly,55000,70000,Full-time,Brickies Labourer required for a small team wo...,Unknown
2,019535cfa0fbaca079fc81bb4d107c91,Full Time Head Chef - Newly Renovated Hotel,"Yarraville, Maribyrnong Area",geonameid:2142457,1549.0,yearly,60000,80000,Full-time,We are looking for someone to take up to the n...,1
3,019b6d4cb5d02fe9a6b65022dc23e215,Relationship Manager / Trainer - Conveyancing ...,", , Victoria, Perth,",gadminid:AU-07,6004.0,yearly,70000,95000,,We are seeking experienced conveyancing /settl...,3
4,019bfe646511a972314a66b2ac50de89,"Operation MNGR to$100K+S, Mon-Fri-Days, Close ...",Sydney Region,geonameid:2147714,1549.0,yearly,65000,90000,,This is fundamentally a nuts and bolts role th...,1
5,a624132faf6bf845ac25718aafa3d6bc,Transport Scheduling Coordinator - Concrete In...,"Parafield Gardens, Salisbury Area",geonameid:8349238,,yearly,45000,60000,,"Due to an internal promotion, our client, a we...",Unknown
6,1085a3ef3205d54a7b49638fea566445,experienced cook wanted for TripAdvisor No.1 R...,"Lane Cove, Lane Cove Area",geonameid:2160625,1549.0,yearly,45000,60000,Full-time,,1
7,a62c1392dce4acd5b9664560d7fac9c5,Bar Supervisor | Popular Dynamic Venue | Oppor...,"CBD, Inner West & Eastern Suburbs, Sydney, New...",geonameid:6619280,1332.0,yearly,45000,60000,Full Time,"Due to demand, the company has an amazing oppo...",1
8,10919ef958de64f39bc0b63b11ffc156,CARE SERVICE EMPLOYEE - SUPPORT STREAM,"Australia, Canberra, Australia Capital Territory",geonameid:2172517,6164.0,yearly,35000,50000,,If you are someone that strives to put a smile...,1
9,a68ba7ac691690d3e2ca5f94606611b9,CONSTRUCTION & GENERAL LABOURERS,Sydney Region,geonameid:2147714,,hourly,0,0,,We are looking for construction & general labo...,1


In [89]:

data['tokenise'] = [process_text(str(a)) for a in data.abstract.tolist()]

In [132]:
cluster_texts(data.tokenise[0])

Cluster 0:
Cluster 1:
Cluster 2:
Cluster 3:
Cluster 4:
Cluster 5:
Cluster 6:
Cluster 7:
Cluster 8:
Cluster 9:
delicatessen


defaultdict(list,
            {0: [0,
              1,
              2,
              3,
              4,
              5,
              6,
              7,
              8,
              9,
              10,
              12,
              13,
              14,
              16,
              17,
              18,
              19,
              20,
              21,
              22,
              23,
              24,
              25,
              26,
              27,
              28,
              29,
              30,
              31,
              32,
              33,
              34,
              36,
              37,
              38,
              39,
              40,
              41],
             1: [11, 15, 35]})