In this notebook I am going to cluster text documents with k-means algorithm 

In [24]:
#importing the data
import pandas as pd

tweets = pd.read_csv('Tweets.csv', delimiter = ',')
num_rows, num_feature = tweets.shape

print('row number: ', num_rows)
print('feature number: ', num_feature)
print('names of features: ', list(tweets))
#among 15 features the one I'll be working with is 'text' column

row number:  14640
feature number:  15
names of features:  ['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence', 'airline', 'airline_sentiment_gold', 'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord', 'tweet_created', 'tweet_location', 'user_timezone']


In [25]:
print(tweets.head(5)) #let's have a look at the content of all columns

             tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence         airline  \
0            NaN                        NaN  Virgin America   
1            NaN                     0.0000  Virgin America   
2            NaN                        NaN  Virgin America   
3     Bad Flight                     0.7033  Virgin America   
4     Can't Tell                     1.0000  Virgin America   

  airline_sentiment_gold        name negativereason_gold  retweet_count  \
0                    NaN     cairdin                 NaN              0   
1                    NaN    jnar

In [26]:
# cleaning, tokenizing and stemming of text so that the data will be appropriate for the k-means alorithm to work
import pymorphy2
from nltk.tokenize import word_tokenize
import re

morph = pymorphy2.MorphAnalyzer()

def clear_data(df):
    df['text'] = df['text'].apply(lambda text: text.strip().split('|')[-1].split('http')[0].strip().lower())
    df['text'] = df['text'].apply(lambda text: re.sub('[\?!:)(«»@#$_1234567890#—ツ►๑۩۞۩•*”˜˜”*°°*`virginamerica&]', '', text))
    return df

def tokenize_and_stem(df):
    df['text'] = df['text'].apply(lambda text: word_tokenize(text))
    df['text'] = df['text'].apply(lambda text: [morph.parse(word)[0].normal_form for word in text])
    df['text'] = df['text'].apply(lambda text: ' '.join(text))
    return df

tweets_clean = clear_data(tweets)
tweet_data = tokenize_and_stem(tweets_clean)
tweet_data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,wht dhpbu sd .,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,plus you ' ddd ols to th xp ... tky .,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,dd't tody ... ust d to tk oth tp,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,t 's lly ss to blst oboxous `` ttt '' you usts...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,d t 's lly b bd th bout t,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [27]:
# mow I will compute tf-idf matrix, after the removal of stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
stop_en ='stopwords-en.txt' #the list is taken from some github repository 

words = []
for line in open(stop_en):
    line = line.split('\n') 
    line = line[0]
    words.append(line)

words = frozenset(words)

print('Vectorization of documents')

vec = TfidfVectorizer(ngram_range=(1,1),stop_words=(words))
X = vec.fit_transform(tweet_data['text'])
data = vec.fit_transform(tweet_data['text']).toarray()

num_docs, num_feature = X.shape
print('documents number : ', num_docs, 'feature number: ', num_feature)

Vectorization of documents


  'stop_words.' % sorted(inconsistent))


documents number :  14640 feature number:  5725


In [28]:
# let's see what will be used as features in the algorithm
myfeats=vec.get_feature_names()
print(myfeats) # those are the words (or some manifestation of words..)

['bb', 'bbbd', 'bbbsdks', 'bbd', 'bbdos', 'bbklstff', 'bbkthlutodu', 'bbl', 'bbs', 'bbts', 'bbtt', 'bby', 'bbyfood', 'bdbdbd', 'bdbusss', 'bdbussss', 'bddlls', 'bdl', 'bdloff', 'bdly', 'bdlypot', 'bdlypt', 'bdo', 'bdod', 'bdofoss', 'bdot', 'bdploypobl', 'bdpoly', 'bdpsho', 'bds', 'bdsd', 'bdssyb', 'bdsutdd', 'bdt', 'bdustos', 'bdustoxp', 'bdutd', 'bdwbst', 'bdwdth', 'bdwtfobsutdw', 'bdy', 'bdyb', 'bdyl', 'bf', 'bff', 'bffs', 'bfjky', 'bfl', 'bfo', 'bfs', 'bft', 'bfts', 'bftz', 'bh', 'bhd', 'bhdtjh', 'bhlf', 'bhloptyshp', 'bho', 'bhous', 'bhs', 'bhst', 'bht', 'bhxoffl', 'bj', 'bjokf', 'bjt', 'bk', 'bkd', 'bkdow', 'bkfst', 'bkho', 'bkk', 'bklo', 'bklys', 'bkods', 'bkok', 'bkpk', 'bkpks', 'bks', 'bkt', 'bktodlt', 'bktowt', 'bkup', 'bkupt', 'bkupts', 'bkupty', 'bkw', 'bkwds', 'bky', 'bkyd', 'bl', 'blbl', 'blbo', 'bld', 'bldodsh', 'bldsdd', 'bldw', 'blf', 'blfst', 'blfstpot', 'blh', 'blk', 'blkby', 'blkd', 'blkhstoyoth', 'blkld', 'blklst', 'blklstt', 'blksh', 'blkt', 'blkts', 'bll', 'bllb',

In [29]:
# now the clustering procedure can be carried out
from sklearn.cluster import KMeans

num_clusters = 7 # setting the number of clusters

# setting clustering parameters
km = KMeans(n_clusters = num_clusters, 
            init="random", # set random choice of clusters' centers
            max_iter=600, # number of iterations
            algorithm = 'auto')

# fit the algorithm to the data
km.fit(data)

print('clusterization is over')

print(km.labels_) #display an array of cluster numbers for each row
# there are 14640 lines in our collection - that is, 14640 objects 

print('size of km.labels_:', len(km.labels_))
print('Enertia: ', km.inertia_) # display the sum of squared distances of samples to their closest cluster center

clusterization is over
[5 2 5 ... 5 5 2]
size of km.labels_: 14640
Enertia:  13613.131375802448


In [16]:
# let's briefly evaluate what documents are included in this or that cluster.
import collections
clustering = collections.defaultdict(list)

for i, label in enumerate(km.labels_):
        clustering[label].append(i)

print('Content of a document № 1:')
print(tweets['text'][1])
print('Content of a document № 2:')
print(tweets['text'][2])
print('Content of a document № 3:')
print(tweets['text'][3])
print('Content of a document № 4:')
print(tweets['text'][4])
print('Content of a document № 5:')
print(tweets['text'][5])
print('Content of a document № 6:')
print(tweets['text'][6])
print('Content of a document № 7:')
print(tweets['text'][7])

Content of a document № 1:
plus you ' ddd ols to th xp ... tky .
Content of a document № 2:
dd't tody ... ust d to tk oth tp
Content of a document № 3:
t 's lly ss to blst oboxous ttt `` you usts ' fs p ; thy h lttl ous
Content of a document № 4:
d t 's lly b bd th bout t
Content of a document № 5:
sously would py flht fo sts tht dd't h ths ply . t 's lly th oly bd th bout fly
Content of a document № 6:
ys , ly y t fly x ths “ wo wo ’ t o wy
Content of a document № 7:
lly ssd p oppotuty fo wthout hts pody , th .


well, the words are really strange, but they were like this by the moment they were processed

In [30]:
# here, the matrix with 'centroids' is be sorted and then the words closest to the center of the cluster are displayed
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
print(order_centroids.shape)
print(order_centroids)

# finally, getting a list of top features, that is words, for each cluster
print("Displaying top words from each cluster:")

terms = vec.get_feature_names()
for i in range(num_clusters):
    print('--------------------')
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(terms[ind])

(7, 5725)
[[1032 1061 2824 ... 1016 3033 3817]
 [3817 1130 4373 ... 5103 5125 1032]
 [4373 1130 1000 ...  887 3283 1032]
 ...
 [1761 1130 4373 ... 1269 1989  487]
 [5115 1130 1000 ... 1095 1731 1032]
 [2017 1000 1007 ... 1032 4435 1889]]
Displaying top words from each cluster:
--------------------
Cluster 0:
flk
flt
ou
jtblu
ths
stop
dd
xx
pls
buh
--------------------
Cluster 1:
southwst
fo
th
flht
thk
thks
ot
fly
wth
ll
--------------------
Cluster 2:
th
fo
flht
utd
ot
ws
tht
pl
ll
wth
--------------------
Cluster 3:
uswys
fo
th
flht
hold
ot
wth
ll
us
ths
--------------------
Cluster 4:
jtblu
fo
th
flht
thks
ot
thk
wth
tht
wll
--------------------
Cluster 5:
utd
fo
flht
ot
thks
th
ths
tht
wth
thk
--------------------
Cluster 6:
lld
flht
flhtld
flhtd
fo
ws
flhts
th
southwst
hold


the top words are reeeeally weird and don't make sense at all! I would rather do LDA ((