In [1]:
# imports required
import pandas as pd
import numpy as np
import sklearn
import nltk, re
from nltk.stem import *

In [2]:
# tweets.csv contains the data
T_tweets = pd.read_csv('tweets.csv')
# how the actual data looks like
print(T_tweets.head())

# Dropping all the columns except the 'text' column
T_tweets.drop(T_tweets.columns[[0,1]], axis=1, inplace=True)

# cleaning data
T_tweets = T_tweets['text'].str.lower()
T_tweets = T_tweets.replace(['((www\.[^\s]+)|(https?://[^\s]+))'],['URL'],regex=True)
T_tweets = T_tweets.replace(['[\s]+'], [' '],regex=True)
T_tweets = T_tweets.replace(['#([^\s]+)'], ['\1'],regex=True)

T_tweets.head()

                   id                  created_at  \
0  834019605102358528  ['2017-02-21', '12:39:03']   
1  834019562114936834  ['2017-02-21', '12:38:53']   
2  834016888372023296  ['2017-02-21', '12:28:16']   
3  834004809422356483  ['2017-02-21', '11:40:16']   
4  833913312471560192  ['2017-02-21', '05:36:41']   

                                                text  
0  Siddalinghaiah Jn towards Richmond circle on V...  
1  Peak hour traffic at Shanthala Jn towards Cott...  
2  Peak hour traffic at Police thimmaiah Crl towa...  
3  Peak hour traffic at K.R circle towards Hudson...  
4  #TrafficAlert Protest at Town-hall by a politi...  


0    siddalinghaiah jn towards richmond circle on v...
1    peak hour traffic at shanthala jn towards cott...
2    peak hour traffic at police thimmaiah crl towa...
3    peak hour traffic at k.r circle towards hudson...
4     protest at town-hall by a political party. p...
Name: text, dtype: object

In [3]:
# to work with lists instead of the dataframes
tweets = []
for i in T_tweets:
    tweets.append(i)

In [4]:
# loading nltk stopwords that doesn't have any significance in searching, such as i, me, my, etc.
stopwords = nltk.corpus.stopwords.words('english')

# load nltk's SnowballStemmer as variabled 'stemmer'
stemmer = SnowballStemmer("english")

len(stopwords)

153

In [5]:
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems



def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [6]:
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in tweets:
    allwords_stemmed = tokenize_and_stem(i) # for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) # extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)


In [7]:
print(len(totalvocab_stemmed))
print(len(totalvocab_tokenized))

112183
112183


In [8]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')
print(vocab_frame.head())

there are 112183 items in vocab_frame
                         words
siddalinghaiah  siddalinghaiah
jn                          jn
toward                 towards
richmond              richmond
circl                   circle


In [9]:
# Note that the result of this block takes a while to show
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(tweets) #fit the vectorizer to synopses

# (100, 563) means the matrix has 100 rows and 563 columns
print(tfidf_matrix.shape)
terms = tfidf_vectorizer.get_feature_names()
len(terms)

CPU times: user 7.78 s, sys: 24 ms, total: 7.81 s
Wall time: 7.81 s
(7794, 10)


10

In [10]:
# doing the actual k-means culstering
from sklearn.cluster import KMeans

num_clusters = 4

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 2.8 s, sys: 0 ns, total: 2.8 s
Wall time: 2.8 s


In [11]:
from sklearn.externals import joblib

#uncomment the below to save your model 
#since I've already run my model I am loading from the pickle

joblib.dump(km,  'doc_cluster.pkl')

km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()
# clusters show which cluster (0-4) each of the 100 synoposes belongs to
print(len(clusters))
#print(clusters)

7794


In [12]:
ranks = [int(x) for x in range(1, len(tweets)+1)]

films = { 'rank': ranks, 'tweets': tweets, 'cluster': clusters}

frame = pd.DataFrame(films, index = [clusters])

print(frame.head())

frame['cluster'].value_counts() #number of films per cluster (clusters from 0 to 4)

   cluster  rank                                             tweets
3        3     1  siddalinghaiah jn towards richmond circle on v...
2        2     2  peak hour traffic at shanthala jn towards cott...
2        2     3  peak hour traffic at police thimmaiah crl towa...
2        2     4  peak hour traffic at k.r circle towards hudson...
0        0     5   protest at town-hall by a political party. p...


1    2833
2    2404
0    1303
3    1254
Name: cluster, dtype: int64

In [13]:
grouped = frame['rank'].groupby(frame['cluster']) # groupby cluster for aggregation purposes

grouped.mean()

cluster
0    3959.650806
1    4082.456760
2    3757.562396
3    3683.341308
Name: rank, dtype: float64

In [14]:
from __future__ import print_function

print("Top terms per cluster:")

#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d tweets:" % i, end='\n')
    for title in frame.ix[i]['tweets'].head().tolist():
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace

Top terms per cluster:
Cluster 0 words: b'traffic', b'road', b'circle', b'url', b'peak', b'peak',

Cluster 0 tweets:
  protest at town-hall by a political party. please avoid all roads leading to townhall. congestion expected from 11am to 1pm., slow moving traffic at tin factory towards kr puram, k.h double road shanti nagar towards richmond circle., good morning to all btp friends. follow traffic rules for your safety.,  , ", today cubbon park will be closed, no vehicles will be allowed inside cubbon park, till monday 8am.,

Cluster 1 words: b'url', b'traffic', b'road', b'circle', b'hour', b'peak',

Cluster 1 tweets:
 please avoid town hall surrounding roads due to jds protest URL,  . . URL, take a  before starting auto journey click driver details (display card). it will be helpful if u forget anyth URL,  URL, today's the "no honking monday". lets avoid honking unnecessarily and turn this campaign into a reality. " URL,

Cluster 2 words: b'jn', b'circle', b'peak', b'peak', b'hour