In [23]:
import string
import collections
 
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
 
 
def process_text(text, stem=True):
    """ Tokenize text and stem words removing punctuation """
    transtable = {ord(s):None for s in string.punctuation}
    transtable[ord('/')] = u' '
    text = text.translate(transtable)
    tokens = word_tokenize(text)
 
    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(t) for t in tokens]
 
    return tokens
 
 
def cluster_texts(texts, clusters=10):
    """ Transform texts to Tf-Idf coordinates and cluster texts using K-Means """
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=stopwords.words('english'),
                                 max_df=0.9,
                                 min_df=0.2,
                                 lowercase=True)
 
    tfidf_model = vectorizer.fit_transform(texts)
    km_model = KMeans(n_clusters=clusters)
    km_model.fit(tfidf_model)
    
    order_centroids = km_model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(clusters):
        print('Cluster %d:' % i)
    for ind in order_centroids[i, :10]:
        print('%s' % terms[ind])
 
    clustering = collections.defaultdict(list)
 
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
 
    return clustering

http://stackoverflow.com/questions/23175809/typeerror-translate-takes-one-argument-2-given-python

In [2]:
process_text(" To$kenize text/removing pu&nctua^tion ",stem=False)

['Tokenize', 'text', 'removing', 'punctuation']

In [3]:
import pandas as pd

data = pd.read_csv('../Jobsnew.csv',delimiter='\t',encoding='latin-1')

In [4]:
texts = data.abstract.tolist()

In [5]:
type(texts)

list

In [6]:
tokenisedtext = [process_text(str(text)) for text in texts]

In [7]:
tokenisedtext

[['We',
  'are',
  'look',
  'for',
  'suitabl',
  'applic',
  'to',
  'fill',
  'the',
  'posit',
  'of',
  'Delicatessen',
  'Assist',
  'Manag',
  'and',
  'Delicatessen',
  'Supervisor',
  'at',
  'our',
  'Edmonton',
  'store',
  'The',
  'applic',
  'must',
  'have',
  'at',
  'least',
  '1',
  'year',
  'manag',
  'supervisori',
  'experi',
  'in',
  'a',
  'larg',
  'Delicatessen',
  'Depart',
  'includ',
  'strong',
  'team',
  'leadership',
  'qualiti'],
 ['Bricki',
  'Labour',
  'requir',
  'for',
  'a',
  'small',
  'team',
  'work',
  'with',
  'the',
  'owner',
  'A',
  'good',
  'environ',
  'where',
  'we',
  'are',
  'happi',
  'to',
  'guid',
  'and',
  'teach',
  'our',
  'labour',
  'All',
  'our',
  'Bricklay',
  'have',
  'first',
  'been',
  'labour',
  'and',
  'treat',
  'them',
  'well',
  'Must',
  'have',
  'a',
  'white',
  'card',
  'ABN',
  'work',
  'boot',
  'and',
  'some',
  'experi',
  'To',
  'discuss',
  'the',
  'posit',
  'call',
  'or',
  'text'

In [17]:


joinedabstracts = []
for wordlist in tokenisedtext:
    #print(wordlist)
    backtogether = ''.join([word+' ' for word in wordlist])
    #joinedwords.join([word+' ' for word in wordlist])
    #print(backtogether)
    joinedabstracts.append(backtogether)    
    
print(joinedabstracts[0:21])
        

['We are look for suitabl applic to fill the posit of Delicatessen Assist Manag and Delicatessen Supervisor at our Edmonton store The applic must have at least 1 year manag supervisori experi in a larg Delicatessen Depart includ strong team leadership qualiti ', 'Bricki Labour requir for a small team work with the owner A good environ where we are happi to guid and teach our labour All our Bricklay have first been labour and treat them well Must have a white card ABN work boot and some experi To discuss the posit call or text Megan on 6344 click to reveal or Mark 6344 click to reveal ', 'We are look for someon to take up to the next level and who ha previous work as a Head Chef in a pub environ You will have a strong passion for produc amaz pub grub with present to match ', 'We are seek experienc conveyanc settlement clerk who are keen to take on a new challeng in their career Step away from file manag complet you will have the opportun to use your peopl skill to hit the road and build

In [24]:
cluster_texts(joinedabstracts[0:50])

Cluster 0:
Cluster 1:
Cluster 2:
Cluster 3:
Cluster 4:
Cluster 5:
Cluster 6:
Cluster 7:
Cluster 8:
Cluster 9:
applic
team
experi
posit
look
work
thi
requir
role


defaultdict(list,
            {0: [11, 15, 16, 24, 30],
             1: [3, 6, 10, 13, 31, 32, 34, 35, 37, 42, 45, 46, 49],
             2: [5, 14, 22, 47],
             3: [2, 23, 36],
             4: [28, 43],
             5: [4, 9, 17, 19, 20, 26, 29, 38],
             6: [7, 12, 18],
             7: [8, 33, 40, 44],
             8: [1, 25, 27, 41],
             9: [0, 21, 39, 48]})

In [19]:
joinedabstracts[:4]

['We are look for suitabl applic to fill the posit of Delicatessen Assist Manag and Delicatessen Supervisor at our Edmonton store The applic must have at least 1 year manag supervisori experi in a larg Delicatessen Depart includ strong team leadership qualiti ',
 'Bricki Labour requir for a small team work with the owner A good environ where we are happi to guid and teach our labour All our Bricklay have first been labour and treat them well Must have a white card ABN work boot and some experi To discuss the posit call or text Megan on 6344 click to reveal or Mark 6344 click to reveal ',
 'We are look for someon to take up to the next level and who ha previous work as a Head Chef in a pub environ You will have a strong passion for produc amaz pub grub with present to match ',
 'We are seek experienc conveyanc settlement clerk who are keen to take on a new challeng in their career Step away from file manag complet you will have the opportun to use your peopl skill to hit the road and bu

In [None]:
ls ..

In [None]:
import pandas as pd

data = pd.read_csv('../Jobsnew.csv',delimiter='\t',encoding='latin-1')
data

In [None]:

data['tokenise'] = [process_text(str(a)) for a in data.abstract.tolist()]

In [None]:
cluster_texts(data.tokenise[0])

In [None]:
mylist = ['hello','how','wre', 'you']

In [None]:
new_bag = ''

new_bag.join([x+' ' for x in mylist])

In [None]:
new_bag