# Part 2: Tweets Clustering Using k-means

Dataset: https://archive.ics.uci.edu/dataset/438/health+news+in+twitter

File: bbchealth.txt

References:


## Preprocessing

In [93]:
import requests

data = requests.get('https://raw.githubusercontent.com/anthea97/TweetsClustering/main/bbchealth.txt')
tweets = data.text.split('\n')

In [94]:
tweets[:5]

['585978391360221184|Thu Apr 09 01:31:50 +0000 2015|Breast cancer risk test devised http://bbc.in/1CimpJF\r',
 '585947808772960257|Wed Apr 08 23:30:18 +0000 2015|GP workload harming care - BMA poll http://bbc.in/1ChTBRv\r',
 "585947807816650752|Wed Apr 08 23:30:18 +0000 2015|Short people's 'heart risk greater' http://bbc.in/1ChTANp\r",
 "585866060991078401|Wed Apr 08 18:05:28 +0000 2015|New approach against HIV 'promising' http://bbc.in/1E6jAjt\r",
 "585794106170839041|Wed Apr 08 13:19:33 +0000 2015|Coalition 'undermined NHS' - doctors http://bbc.in/1CnLwK7\r"]

In [95]:
print("Number of tweets: ", len(tweets))

Number of tweets:  3929


In [96]:
only_tweets = []

for tweet in tweets:
  sections = tweet.split('|')
  only_tweets.append(sections[2])

only_tweets[:5]


['Breast cancer risk test devised http://bbc.in/1CimpJF\r',
 'GP workload harming care - BMA poll http://bbc.in/1ChTBRv\r',
 "Short people's 'heart risk greater' http://bbc.in/1ChTANp\r",
 "New approach against HIV 'promising' http://bbc.in/1E6jAjt\r",
 "Coalition 'undermined NHS' - doctors http://bbc.in/1CnLwK7\r"]

In [97]:
#Remove the tweet id and timestamp
#Remove any word that starts with the symbol @ e.g. @AnnaMedaris
#Remove any hashtag symbols e.g. convert #depression to depression
#Remove any URL
#Convert every word to lowercase

import re

for i, tweet in enumerate(only_tweets):
  tweet = re.sub(r'\B@\w+\b', '', tweet)
  tweet = re.sub(r'#', '', tweet)
  tweet = re.sub(r'\n', '', tweet)
  tweet = re.sub(r'\r', '', tweet)
  tweet = re.sub(r'http\S+|www\S+', '', tweet)

  tweet = tweet.lower()
  only_tweets[i] = tweet

only_tweets[:10]


['breast cancer risk test devised ',
 'gp workload harming care - bma poll ',
 "short people's 'heart risk greater' ",
 "new approach against hiv 'promising' ",
 "coalition 'undermined nhs' - doctors ",
 'review of case against nhs manager ',
 "video: 'all day is empty, what am i going to do?' ",
 "video: 'overhaul needed' for end-of-life care ",
 "care for dying 'needs overhaul' ",
 'video: nhs: labour and tory key policies ']

In [98]:
#Convert each tweet to an unordered set of words
processed_tweets = [set(tweet.split()) for tweet in only_tweets]
processed_tweets[:10]

[{'breast', 'cancer', 'devised', 'risk', 'test'},
 {'-', 'bma', 'care', 'gp', 'harming', 'poll', 'workload'},
 {"'heart", "greater'", "people's", 'risk', 'short'},
 {"'promising'", 'against', 'approach', 'hiv', 'new'},
 {"'undermined", '-', 'coalition', 'doctors', "nhs'"},
 {'against', 'case', 'manager', 'nhs', 'of', 'review'},
 {"'all",
  'am',
  'day',
  "do?'",
  'empty,',
  'going',
  'i',
  'is',
  'to',
  'video:',
  'what'},
 {"'overhaul", 'care', 'end-of-life', 'for', "needed'", 'video:'},
 {"'needs", 'care', 'dying', 'for', "overhaul'"},
 {'and', 'key', 'labour', 'nhs:', 'policies', 'tory', 'video:'}]

## K-Means Algorithm From Scratch

In [99]:
import numpy as np
import statistics

class KMeans:
  def __init__(self, k = 3, max_iter = 10):
    self.k = k
    self.max_iter = max_iter
    self.centroids = None

  def jaccard_distance(self, set_a, set_b):
    num = set_a.intersection(set_b)
    den = set_a.union(set_b)

    return 1-(len(num)/len(den))

  def clusterize(self, data, centroids):
      cluster_dict = {}

      for doc in data:
        distances = []
        for centroid in centroids:
          distances.append(self.jaccard_distance(doc, centroid))

        min_distance = distances.index(min(distances))
        cluster_dict.setdefault(min_distance, [])
        cluster_dict[min_distance].append(doc)

      return cluster_dict

  def calculate_centroid(self, data):
    candidates_dict = {}

    for doc_i in data:
      distances = []
      for doc_j in data:
        distances.append(self.jaccard_distance(doc_i, doc_j))
      #find the average of all jaccard distances for doc_i
      # note: getting the same result with statistics.mean and sum - using sum for faster processing
      avg_distance = sum(distances)
      candidates_dict.setdefault(tuple(doc_i), 1)
      candidates_dict[tuple(doc_i)] = avg_distance

    #Return the candidate that has the least average jaccard distance
    centroid = min(candidates_dict, key = candidates_dict.get)

    return set(centroid)

  def fit(self, data):
    #Randomly select k clusters
    data = np.array(data)
    random_indices = np.random.choice(len(data), self.k, replace=False)
    self.centroids = data[random_indices]

    for _ in range(self.max_iter):
      new_centroids = []
      clusters = self.clusterize(data, self.centroids)

      for doc_set in clusters:
        new_centroids.append(self.calculate_centroid(clusters[doc_set]))

      if np.array_equal(self.centroids, new_centroids):
        break

      self.centroids = new_centroids

    return new_centroids, clusters


  def SSE(self, clusters, centroids):
    sse = 0

    for (key, value) in clusters.items():
      centroid = centroids[key]
      cluster_sum = 0
      for x in value:
        d = self.jaccard_distance(x, centroid)
        dist = d**2
        cluster_sum += dist

      sse += cluster_sum
    return sse


  def cluster_size(self, clusters):
    length = []
    for i, cluster in enumerate(clusters):
      print(len(clusters[i]))
      length.append(len(clusters[i]))

    return length







In [81]:
object = KMeans(k=5, max_iter = 20)

In [82]:
centroids, clusters = object.fit(processed_tweets)

In [83]:
sse = object.SSE(clusters, centroids)

In [84]:
print(sse)

3351.609247325381


In [85]:

object.cluster_size(clusters)

1762
829
424
537
377


[1762, 829, 424, 537, 377]

##

## Compute for different values of k

In [101]:
k_values = [3, 5, 10, 15, 20]
cluster_sizes = {}
sse_values = {}

In [106]:
for k in k_values:
  print("k = ", k, "start")
  model_k = KMeans(k=k)
  centroids, clusters = model_k.fit(processed_tweets)
  csize = model_k.cluster_size(clusters)
  cluster_sizes.setdefault(k, [])
  cluster_sizes[k] = csize
  sse = model_k.SSE(clusters, centroids)
  sse_values.setdefault(k, [])
  sse_values[k] = sse
  print("k = ", k, "end")


k =  3 start
2784
480
665
k =  3 end
k =  5 start
1791
804
601
426
307
k =  5 end
k =  10 start
1294
398
89
677
239
488
202
409
118
15
k =  10 end
k =  15 start
1333
340
81
138
241
47
307
31
461
528
101
72
70
49
130
k =  15 end
k =  20 start
1078
129
36
285
454
361
276
127
240
104
184
43
150
211
24
75
58
53
37
4
k =  20 end


In [108]:
import csv
from google.colab import drive

#comment if not running for the first time
drive.mount('/content/drive')

file_path = '/content/drive/My Drive/output.csv'

with open(file_path, 'w', newline='') as file:
    writer = csv.writer(file)

    writer.writerow(["k", "SSE", "Cluster Sizes"])

    for k in k_values:
        writer.writerow([k, sse_values[k], cluster_sizes[k]])

