This program uses K-Means to cluster tweets pertaining to USNewsHealth articles, trying several values of k and printing error metrics for each.

## Import Statements

In [None]:
import numpy as np
import pandas as pd
import string
import re

In [None]:
# Location of the raw tweet dump; the preview below suggests "|"-separated id, timestamp and text fields.
url = "https://raw.githubusercontent.com/arjund1999learn/DataSets/main/usnewshealth.txt"
# read_fwf parses the file as fixed-width text with no header row. As the preview shows,
# this yields integer-named columns 0-5, with "<year>|<tweet text>" ending up in column 5.
data = pd.read_fwf(url, header = None)
# Show the raw frame so the column layout can be checked before preprocessing.
display(data)

Unnamed: 0,0,1,2,3,4,5
0,586278450392133633|Thu,Apr,9,21:24:09,0,2015|Planning to hire a personal trainer? Read...
1,586260156155043843|Thu,Apr,9,20:11:28,0,2015|RT @AnnaMedaris: Any dads out their who s...
2,586248551811932160|Thu,Apr,9,19:25:21,0,2015|America's problem with diabetes in one ma...
3,586229697165586432|Thu,Apr,9,18:10:26,0,2015|Think water &amp; fiber will cure your co...
4,586215972731822080|Thu,Apr,9,17:15:53,0,"2015|About to lose it? Here, try one of these ..."
...,...,...,...,...,...,...
1395,550009352690868224|Tue,Dec,30,19:23:43,0,2014|RT @AnnaMedaris: Have you tried a #dance ...
1396,550002825393340417|Tue,Dec,30,18:57:46,0,2014|Going gray early? Here's how to stop it. ...
1397,549982055854247936|Tue,Dec,30,17:35:14,0,"2014|Sure, we all get nervous sometimes. But h..."
1398,549975811408003072|Tue,Dec,30,17:10:26,0,2014|RT @leonardkl: Millions have signed up fo...


## Preprocess Data

In [None]:
def _clean_tweet(text):
  """Normalize one raw tweet: drop the link tail, '#', mention tokens and the "rt" marker."""
  # keep only the text before the first link (http or https)
  text = re.split(r'https?://', str(text), maxsplit=1)[0]

  # remove the '#' but keep the hashtag word itself, then make lowercase
  text = text.replace('#', '').lower()

  # remove words that start with "@" / ".@" and the retweet marker "rt"
  words = [word for word in text.split()
           if not word.startswith((".@", "@")) and word != "rt"]
  return ' '.join(words)


def preprocess(data):
  """Turn the raw tweet frame into a de-duplicated frame of cleaned tweet text.

  Parameters
  ----------
  data : pandas.DataFrame
      Raw frame with columns 0-5, where column 5 holds "<year>|<tweet text>".
      The frame passed in is not modified.

  Returns
  -------
  pandas.DataFrame
      Columns 0-5 are removed and a "tweets" column is added holding the
      lowercased text with links, '#' characters, mention tokens and "rt"
      removed. Duplicate rows are dropped; the original index labels of the
      surviving rows are kept.
  """
  # split the year from the tweet text; n=1 keeps any "|" inside the tweet itself
  parts = data[5].str.split("|", n=1, expand=True)

  # remove id and time stamp (drop returns a new frame, leaving the caller's frame untouched)
  cleaned = data.drop(columns=[0, 1, 2, 3, 4, 5])

  # clean every tweet and store it under its final column name
  cleaned[6] = parts[1].map(_clean_tweet)
  cleaned = cleaned.rename(columns={6: "tweets"})

  # remove duplicates
  return cleaned.drop_duplicates()

## Compute Distance

In [None]:
def distance(tweet1, tweet2):
  """Jaccard distance between the word sets of two tweets.

  Punctuation is stripped from both tweets, each is split into a set of
  words, and the result is 1 - |intersection| / |union|.

  Parameters
  ----------
  tweet1, tweet2 : str
      Tweet texts to compare.

  Returns
  -------
  float
      0 when both tweets contain the same words, 1 when they share none.
      Two tweets that contain no words at all are treated as identical (0.0).
  """
  # remove punctuation
  strip_punctuation = str.maketrans('', '', string.punctuation)

  # make each tweet into a set of words; split() with no argument skips the runs
  # of spaces left behind by removed punctuation, so "" never counts as a word
  words1 = set(tweet1.translate(strip_punctuation).split())
  words2 = set(tweet2.translate(strip_punctuation).split())

  all_words = words1 | words2
  if not all_words:
    # neither tweet has any word: avoid dividing by zero
    return 0.0

  return 1 - len(words1 & words2) / len(all_words)

## Assign Cluster

In [None]:
def assign_cluster(centers, data):
  """Assign every tweet to the cluster of its nearest center.

  Parameters
  ----------
  centers : list of str
      Current center tweets; the position in the list is the cluster key.
  data : pandas.DataFrame
      Frame with a "tweets" column holding the tweet texts.

  Returns
  -------
  dict
      Maps each center position to a list that starts with the center itself,
      followed by the tweets closest to it. When several centers are equally
      close, the one with the lowest position wins.
  """
  # make one cluster per center, each starting with the center itself
  clusters = {position: [center] for position, center in enumerate(centers)}

  # built once up front: used to skip tweets that are themselves centers
  center_lookup = set(centers)

  # iterate through all tweets
  for curr_tweet in data["tweets"]:
    # a tweet that is a center is already in the cluster it started
    if curr_tweet in center_lookup:
      continue

    # compare distance to all centers and keep track of the closest one
    nearest, nearest_dist = 0, float("inf")
    for position, center in enumerate(centers):
      dist = distance(center, curr_tweet)
      if dist < nearest_dist:
        nearest, nearest_dist = position, dist

    clusters[nearest].append(curr_tweet)

  return clusters

## Find Center

In [None]:
def find_center(cluster):
  """Pick the tweet in ``cluster`` with the smallest total distance to all members.

  Parameters
  ----------
  cluster : list of str
      Tweets of one cluster; must contain at least one tweet.

  Returns
  -------
  str
      The tweet whose summed distance to every tweet in the cluster is lowest.
      On a tie the tweet that appears first in the list is kept.
  """
  # baseline: the first tweet and its total distance to the whole cluster
  best = cluster[0]
  best_total = 0
  for other in cluster:
    best_total += distance(best, other)

  # try every remaining tweet as a candidate center
  for candidate in cluster[1:]:
    running = 0

    for other in cluster:
      running += distance(candidate, other)
      # already no better than the best so far, so stop summing early
      if running >= best_total:
        break

    # a strictly smaller total makes this candidate the new center
    if running < best_total:
      best, best_total = candidate, running

  return best

## k_means Algorithm

In [None]:
def k_means(centers, data):
  """Alternate cluster assignment and center updates until the centers stop changing.

  Parameters
  ----------
  centers : list of str
      Starting center tweets.
  data : pandas.DataFrame
      Frame of tweets handed to ``assign_cluster``.

  Returns
  -------
  tuple
      ``(centers, clusters)``: the final centers and the clusters that were
      built from them.
  """
  while True:
    # group the tweets around the current centers
    clusters = assign_cluster(centers, data)

    # recompute one center per cluster, in cluster-key order
    updated = [find_center(members) for members in clusters.values()]

    # converged: the recomputed centers are the ones we already had
    if updated == centers:
      return centers, clusters

    centers = updated

## Calculate SSE

In [None]:
def SSE(centers, clusters):
    """Sum of squared distances from every tweet to the center of its cluster.

    Parameters
    ----------
    centers : list of str
        Center tweets; position ``k`` is the center of ``clusters[k]``.
    clusters : dict
        Maps each center position to the list of tweets in that cluster.

    Returns
    -------
    int or float
        The accumulated squared error (0 when there is nothing to sum).
    """
    total = 0
    # walk the clusters in center order
    for k, center in enumerate(centers):
        # add the squared distance of each member to its center
        for tweet in clusters[k]:
            total += distance(center, tweet) ** 2

    return total

## Print Table Entry

In [None]:
def print_table_entry(num_k, num_SSE, clusters):
    """Print one block of the results table for a single value of k.

    The first line carries the k value, the SSE and the size of the first
    cluster; each further cluster gets its own indented line. A blank line
    closes the block.

    Parameters
    ----------
    num_k : int
        Number of clusters used for this run.
    num_SSE : float
        Sum of squared errors for this run.
    clusters : dict
        Maps each cluster key to the list of tweets in that cluster.
    """
    for position, members in enumerate(clusters.values(), start=1):
        size_text = f"{position}: {len(members)} tweets"
        if position == 1:
            # first row also shows k and the SSE
            lead = str(num_k) + "\t\t" + str(num_SSE) + "\t\t"
        else:
            # later rows are indented to line up under the size column
            lead = "\t\t\t\t"
        print(lead + size_text)

    print()

## Main

In [None]:
# Initialize hyper-parameters
k_param = [2, 3, 5, 7, 10, 20, 30]

# Fixed seed so the randomly chosen starting centers, and therefore the table, are reproducible
SEED = 42
rng = np.random.default_rng(SEED)

# Process data. The cell rebinds `data` to the cleaned frame, so on a re-run the
# raw columns are gone; the guard skips preprocessing in that case.
if "tweets" not in data.columns:
    data = preprocess(data)
flattened_data = data.values.flatten()

# Print headers
print("Value of K\t", "SSE\t\t", "Size of Each Cluster", sep='')
print("_________________________________________________________")

# Iterate through several values of the K hyper-parameter
for num_k in k_param:
    # Randomly select num_k distinct centers in a single draw; drawing one tweet
    # at a time could pick the same tweet twice, because replace=False only
    # applies within one call.
    centers = rng.choice(flattened_data, size=num_k, replace=False).tolist()

    # Perform K_means clustering
    centers, clusters = k_means(centers, data)

    # Calculate SSE
    num_SSE = round(SSE(centers, clusters), 1)

    # Print table entry
    print_table_entry(num_k, num_SSE, clusters)

Value of K	SSE		Size of Each Cluster
_________________________________________________________
2		1175.0		1: 840 tweets
				2: 545 tweets

3		1127.7		1: 507 tweets
				2: 407 tweets
				3: 471 tweets

5		1099.6		1: 515 tweets
				2: 202 tweets
				3: 145 tweets
				4: 305 tweets
				5: 218 tweets

7		1087.3		1: 280 tweets
				2: 334 tweets
				3: 89 tweets
				4: 179 tweets
				5: 297 tweets
				6: 95 tweets
				7: 111 tweets

10		1073.2		1: 302 tweets
				2: 154 tweets
				3: 33 tweets
				4: 43 tweets
				5: 192 tweets
				6: 58 tweets
				7: 111 tweets
				8: 295 tweets
				9: 66 tweets
				10: 131 tweets

20		1043.2		1: 101 tweets
				2: 19 tweets
				3: 20 tweets
				4: 115 tweets
				5: 54 tweets
				6: 88 tweets
				7: 80 tweets
				8: 85 tweets
				9: 39 tweets
				10: 20 tweets
				11: 107 tweets
				12: 92 tweets
				13: 252 tweets
				14: 29 tweets
				15: 55 tweets
				16: 61 tweets
				17: 60 tweets
				18: 29 tweets
				19: 72 tweets
				20: 7 tweets

30		1001.0		1: 50 tw