In [99]:
# imports required
import pandas as pd
import numpy as np
import sklearn
import nltk, re
from nltk.stem import *

In [104]:
# tweets.csv contains the data
T_tweets = pd.read_csv('tweets.csv')
# how the actual data looks like
print(T_tweets.head())

# Dropping all the columns except the 'text' column
T_tweets.drop(T_tweets.columns[[0,1]], axis=1, inplace=True)

# cleaning data
T_tweets = T_tweets['text'].str.lower()
T_tweets = T_tweets.replace(['((www\.[^\s]+)|(https?://[^\s]+))'],['URL'],regex=True)
T_tweets = T_tweets.replace(['[\s]+'], [' '],regex=True)
T_tweets = T_tweets.replace(['#([^\s]+)'], ['\1'],regex=True)

T_tweets.head()

                   id                  created_at  \
0  834019605102358528  ['2017-02-21', '12:39:03']   
1  834019562114936834  ['2017-02-21', '12:38:53']   
2  834016888372023296  ['2017-02-21', '12:28:16']   
3  834004809422356483  ['2017-02-21', '11:40:16']   
4  833913312471560192  ['2017-02-21', '05:36:41']   

                                                text  
0  Siddalinghaiah Jn towards Richmond circle on V...  
1  Peak hour traffic at Shanthala Jn towards Cott...  
2  Peak hour traffic at Police thimmaiah Crl towa...  
3  Peak hour traffic at K.R circle towards Hudson...  
4  #TrafficAlert Protest at Town-hall by a politi...  


0    siddalinghaiah jn towards richmond circle on v...
1    peak hour traffic at shanthala jn towards cott...
2    peak hour traffic at police thimmaiah crl towa...
3    peak hour traffic at k.r circle towards hudson...
4     protest at town-hall by a political party. p...
Name: text, dtype: object

In [107]:
# to work with lists instead of the dataframes
tweets = []
for i in T_tweets:
    tweets.append(i)

In [108]:
# loading nltk stopwords that doesn't have any significance in searching, such as i, me, my, etc.
stopwords = nltk.corpus.stopwords.words('english')

# load nltk's SnowballStemmer as variabled 'stemmer'
stemmer = SnowballStemmer("english")

len(stopwords)

153

In [109]:
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems



def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [110]:
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in tweets:
    print(i)
    allwords_stemmed = tokenize_and_stem(i) # for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) # extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)


siddalinghaiah jn towards richmond circle on vittal mallya road
peak hour traffic at shanthala jn towards cotton pet main rd, maharani college jn towards kr circle on seshadri rd &amp; ..
peak hour traffic at police thimmaiah crl towards chalukya circle on rajbhavan rd, sbm circle towards majestic on kg road &amp; hal twrds domlur
peak hour traffic at k.r circle towards hudson circle on nrupathunga road, town hall jn from s.j.p road &amp; dalmia jn from jd mara jn on bg rd
 protest at town-hall by a political party. please avoid all roads leading to townhall. congestion expected from 11am to 1pm.
please avoid town hall surrounding roads due to jds protest URL
slow moving traffic at tin factory towards kr puram, k.h double road shanti nagar towards richmond circle.
good morning to all btp friends. follow traffic rules for your safety.
 . . URL
peak hour traffic at h.a.l towards domlur, basaweshwara circle towards cauvery theater circle, madiwala from dairy circle.
peak hour traffic at 

In [111]:
print(len(totalvocab_stemmed))
print(len(totalvocab_tokenized))

112183
112183


In [112]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')
print(vocab_frame.head())

there are 112183 items in vocab_frame
                         words
siddalinghaiah  siddalinghaiah
jn                          jn
toward                 towards
richmond              richmond
circl                   circle


In [113]:
# Note that the result of this block takes a while to show
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(tweets) #fit the vectorizer to synopses

# (100, 563) means the matrix has 100 rows and 563 columns
print(tfidf_matrix.shape)
terms = tfidf_vectorizer.get_feature_names()
len(terms)

CPU times: user 7.65 s, sys: 24 ms, total: 7.67 s
Wall time: 14.9 s
(7794, 10)


10

In [114]:
# doing the actual k-means culstering
from sklearn.cluster import KMeans

num_clusters = 4

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 2.66 s, sys: 28 ms, total: 2.68 s
Wall time: 6.28 s


In [85]:
from sklearn.externals import joblib

#uncomment the below to save your model 
#since I've already run my model I am loading from the pickle

joblib.dump(km,  'doc_cluster.pkl')

km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()
# clusters show which cluster (0-4) each of the 100 synoposes belongs to
print(len(clusters))
print(clusters)

7794
[3, 0, 0, 0, 3, 2, 3, 1, 2, 0, 0, 0, 2, 3, 2, 3, 2, 3, 1, 0, 0, 3, 0, 3, 0, 3, 3, 0, 0, 2, 3, 2, 0, 0, 3, 0, 0, 3, 1, 3, 1, 2, 2, 1, 3, 3, 2, 1, 0, 3, 1, 3, 0, 3, 0, 3, 0, 3, 0, 0, 3, 2, 2, 2, 2, 2, 3, 3, 2, 3, 0, 3, 0, 3, 0, 0, 2, 3, 3, 3, 2, 3, 3, 3, 0, 2, 0, 1, 3, 0, 0, 0, 3, 2, 3, 1, 2, 0, 0, 0, 2, 3, 2, 3, 2, 3, 1, 0, 0, 3, 0, 3, 0, 3, 3, 0, 0, 2, 3, 2, 0, 0, 3, 0, 0, 3, 1, 3, 1, 2, 2, 1, 3, 3, 2, 1, 0, 3, 1, 3, 0, 3, 0, 3, 0, 3, 0, 0, 3, 2, 3, 2, 0, 0, 3, 0, 1, 3, 2, 0, 0, 3, 0, 0, 0, 3, 1, 2, 2, 2, 2, 2, 1, 3, 0, 3, 0, 2, 1, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2, 2, 2, 3, 0, 3, 0, 3, 0, 0, 2, 3, 3, 3, 2, 3, 3, 3, 0, 2, 0, 1, 3, 0, 0, 0, 3, 2, 3, 1, 2, 0, 0, 0, 2, 3, 2, 3, 2, 3, 1, 0, 0, 3, 0, 3, 0, 3, 3, 0, 0, 2, 3, 2, 0, 0, 3, 0, 0, 3, 1, 3, 1, 2, 2, 1, 3, 3, 2, 1, 0, 3, 1, 3, 0, 3, 0, 3, 0, 3, 0, 0, 3, 2, 3, 2, 0, 0, 3, 0, 1, 3, 2, 0, 0, 3, 0, 0, 0, 3, 1, 2, 2, 2, 2, 2, 1, 3, 0, 3, 0, 2, 1, 2, 3, 2, 2, 2, 3, 3, 3, 0, 3, 0, 3, 2, 0, 3, 0, 0, 0, 2, 1, 1, 0, 0, 0, 0, 2

In [86]:
ranks = [int(x) for x in range(1, len(tweets)+1)]

films = { 'rank': ranks, 'tweets': tweets, 'cluster': clusters}

frame = pd.DataFrame(films, index = [clusters])

print(frame) # here the ranking is still 0 to 99

frame['cluster'].value_counts() #number of films per cluster (clusters from 0 to 4)

    cluster  rank                                             tweets
3         3     1  Siddalinghaiah Jn towards Richmond circle on V...
0         0     2  Peak hour traffic at Shanthala Jn towards Cott...
0         0     3  Peak hour traffic at Police thimmaiah Crl towa...
0         0     4  Peak hour traffic at K.R circle towards Hudson...
3         3     5  #TrafficAlert Protest at Town-hall by a politi...
2         2     6  Please avoid TOWN HALL surrounding roads due t...
3         3     7  Slow moving traffic at tin factory towards kr ...
1         1     8  Good Morning to all BTP Friends. Follow traffi...
2         2     9               .          . https://t.co/gClryTdH1R
0         0    10  Peak hour traffic at H.A.L towards Domlur, Bas...
0         0    11  Peak hour traffic at Hoody circle towards Maha...
0         0    12  Peak hour traffic at Chalukya circle from Poli...
2         2    13  Take a #note Before starting auto journey clic...
3         3    14  Slow moving tra

2    2456
0    2404
3    2217
1     717
Name: cluster, dtype: int64

In [87]:
grouped = frame['rank'].groupby(frame['cluster']) # groupby cluster for aggregation purposes

grouped.mean()

cluster
0    3757.562396
1    3738.057183
2    4147.997964
3    3823.304014
Name: rank, dtype: float64

In [88]:
from __future__ import print_function

print("Top terms per cluster:")

#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d tweets:" % i, end='\n')
    for title in frame.ix[i]['tweets'].values.tolist():
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace

Top terms per cluster:
Cluster 0 words: b'jn', b'circle', b'peak', b'peak', b'hour', b'peak',

Cluster 0 tweets:
 Peak hour traffic at Shanthala Jn towards Cotton pet main Rd, Maharani college Jn towards KR circle on Seshadri Rd &amp; .., Peak hour traffic at Police thimmaiah Crl towards Chalukya circle on Rajbhavan Rd, SBM circle towards Majestic on KG road &amp; HAL twrds Domlur, Peak hour traffic at K.R circle towards Hudson circle on Nrupathunga road, Town hall Jn from S.J.P road &amp; Dalmia Jn from JD mara Jn on Bg Rd, Peak hour traffic at H.A.L towards Domlur, Basaweshwara circle towards Cauvery theater circle, Madiwala from Dairy circle., Peak hour traffic at Hoody circle towards Mahadevapura, Freedom park jn towards K.R circle, Town hall jn from S.J.P road., Peak hour traffic at Chalukya circle from Police thimmaiah circle, K.R circle towards Hudson circle, Krishna floor mil jn towards Okalipura., Peak hour traffic at Hulimavu jn towards Gottigere jn, Modi hospital jn towards 