In [41]:
import re
import random
import decimal

# Seed the RNG so the random.sample() draw of initial centroids further down
# is reproducible from run to run.
random.seed(0)

# Preprocess raw tweet lines: drops the timestamp field, URLs, @mentions
# and the "rt" retweet marker, turns #word into word, lowercases the text,
# and keeps the tweet id next to the resulting set of words
def preprocess(file):
    """Turn pipe-delimited tweet lines into a list of id/word-set records.

    Each line is split on "|". The first field is used as the tweet id and
    the last field as the tweet text. At most two splits are performed, so
    any "|" occurring after the second delimiter stays part of the text
    instead of truncating it.

    Args:
        file: An iterable of text lines, e.g. an open text file.

    Returns:
        A list of dicts ``{"id": <str>, "words": <set of str>}``, one per
        line that still contains at least one word after cleaning. Lines
        that end up with no words are skipped.
    """
    dataset = []
    for line in file:
        # maxsplit=2 keeps pipes inside the tweet text intact.
        fields = line.split("|", 2)
        tweet_id = fields[0]
        tweet = fields[-1].lower()
        tweet = re.sub(r"http\S+", "", tweet)      # strip URLs
        tweet = re.sub(r'#(\S+)', r'\1', tweet)    # "#word" -> "word"
        tweet = re.sub(r'@[\S]+', '', tweet)       # strip @mentions
        tweet = re.sub(r'\brt\b', ' ', tweet)      # strip standalone "rt"
        words = set(re.findall(r"[\w]+", tweet))
        if words:
            dataset.append({"id": tweet_id, "words": words})
    return dataset

# Computes Jaccard distance between two sets
def computeJaccardDistance(set1, set2):
    """Return the Jaccard distance ``1 - |A ∩ B| / |A ∪ B|`` of two sets.

    Args:
        set1: A set (anything providing ``union`` and ``intersection``).
        set2: The collection to compare against ``set1``.

    Returns:
        A float in [0, 1]: 0.0 for identical sets, 1.0 for disjoint ones.
        Two empty sets are treated as identical and yield 0.0 rather than
        raising ZeroDivisionError.
    """
    union_size = len(set1.union(set2))
    if union_size == 0:
        # Both inputs are empty; the ratio is undefined, so define the
        # distance between two identical (empty) sets as zero.
        return 0.0
    return 1 - len(set1.intersection(set2)) / union_size

# Re-selects the centroid of every cluster: the member tweet whose summed
# Jaccard distance to the tweets of its own cluster is the smallest
def updateCentroids(cluster, tweet_ids, tweets, k, centroids):
    """Return the list of new centroid tweet ids, one per cluster.

    Args:
        cluster: Cluster label of each tweet, aligned with ``tweets``.
        tweet_ids: Tweet ids, aligned with ``tweets``.
        tweets: Word set of each tweet.
        k: Number of clusters; labels 0 .. k-1 are processed.
        centroids: Current centroid tweet ids, indexed by cluster label.

    Returns:
        A list of k tweet ids. A cluster without members keeps its current
        centroid; ties on the summed distance go to the earliest member.
    """
    chosen_positions = []
    for label in range(k):
        members = [pos for pos, assigned in enumerate(cluster) if assigned == label]

        if not members:
            # Nothing was assigned to this cluster: keep its old centroid.
            chosen_positions.append(tweet_ids.index(centroids[label]))
            continue

        member_sets = [tweets[pos] for pos in members]
        # For each candidate, total distance to every member of the cluster.
        totals = [
            sum(computeJaccardDistance(candidate, other) for other in member_sets)
            for candidate in member_sets
        ]
        # min() returns the first minimal position, i.e. the earliest member.
        best = min(range(len(totals)), key=totals.__getitem__)
        chosen_positions.append(members[best])

    return [tweet_ids[pos] for pos in chosen_positions]
      
# Computes the sum of squared Jaccard distances between every tweet and
# the centroid of the cluster it belongs to
def computeSSE(cluster, centroids, tweet_ids, tweets, k):
    """Return the sum of squared error of a clustering.

    Args:
        cluster: Cluster label of each tweet, aligned with ``tweets``.
        centroids: Centroid tweet ids, indexed by cluster label.
        tweet_ids: Tweet ids, aligned with ``tweets``.
        tweets: Word set of each tweet.
        k: Number of clusters; labels 0 .. k-1 are processed.

    Returns:
        The accumulated squared distances; the integer 0 when no tweet
        carries a label in 0 .. k-1.
    """
    centroid_positions = [tweet_ids.index(centroid_id) for centroid_id in centroids]
    centroid_sets = [tweets[pos] for pos in centroid_positions]

    total = 0
    for label in range(k):
        member_sets = [tweets[pos] for pos, assigned in enumerate(cluster) if assigned == label]
        # Accumulate one term at a time, cluster by cluster, in tweet order.
        for words in member_sets:
            total += computeJaccardDistance(words, centroid_sets[label]) ** 2
    return total

# Reports how many tweets ended up in each cluster
def printClusterSize(cluster, k):
    """Print one "Cluster N: M tweets" line per cluster.

    Args:
        cluster: Cluster label of each tweet.
        k: Number of clusters; labels 0 .. k-1 are counted and shown with
            1-based numbering.
    """
    # Count everything first, then print, so output starts after counting.
    sizes = [sum(1 for label in cluster if label == wanted) for wanted in range(k)]
    for number, size in enumerate(sizes, start=1):
        print("Cluster " + str(number) + ": " + str(size) + " tweets")

# Runs the clustering loop: assign every tweet to its nearest centroid,
# re-pick the centroids, and repeat until the centroids stop changing
def kmeansClustering(k, centroids, tweet_ids, tweets):
    """Cluster the tweets and print the SSE and the cluster sizes.

    Args:
        k: Number of clusters.
        centroids: Initial centroid tweet ids. This list is overwritten in
            place with the new centroids after every non-final iteration.
        tweet_ids: Tweet ids, aligned with ``tweets``.
        tweets: Word set of each tweet.

    Returns:
        None. Results are reported on standard output only.
    """
    print("Clustering Tweets..")
    while True:
        centroid_positions = [tweet_ids.index(centroid_id) for centroid_id in centroids]
        centroid_sets = [tweets[pos] for pos in centroid_positions]

        # Label each tweet with the first centroid at minimum distance.
        assignments = []
        for words in tweets:
            gaps = [computeJaccardDistance(words, centre) for centre in centroid_sets]
            assignments.append(gaps.index(min(gaps)))

        updated = updateCentroids(assignments, tweet_ids, tweets, k, centroids)

        # Stop as soon as no centroid changed.
        if all(updated[label] == centroids[label] for label in range(k)):
            break
        centroids[:] = updated[:]

    print("\nClustering done\n")
    sse = computeSSE(assignments, centroids, tweet_ids, tweets, k)
    print('Sum of Squared Error: ' + str(sse))
    printClusterSize(assignments, k)
    

# Driver: load the tweets, draw the initial centroids at random and cluster.
# The cluster count is named once so the sample size and the k passed to
# the clustering routine cannot drift apart.
num_clusters = 25

# The context manager closes the file as soon as preprocessing has read it.
with open("foxnewshealth.txt") as tweetFile:
    dataset = preprocess(tweetFile)

# Parallel lists: tweet_ids[i] is the id of the word set tweets[i].
tweet_ids = [record["id"] for record in dataset]
tweets = [record["words"] for record in dataset]
initial_centroids = random.sample(tweet_ids, num_clusters)

kmeansClustering(num_clusters, initial_centroids, tweet_ids, tweets)


Clustering Tweets..

Clustering done

Sum of Squared Error: 1578.5526409467507
Cluster 1: 175 tweets
Cluster 2: 53 tweets
Cluster 3: 49 tweets
Cluster 4: 121 tweets
Cluster 5: 9 tweets
Cluster 6: 83 tweets
Cluster 7: 21 tweets
Cluster 8: 63 tweets
Cluster 9: 151 tweets
Cluster 10: 62 tweets
Cluster 11: 67 tweets
Cluster 12: 13 tweets
Cluster 13: 45 tweets
Cluster 14: 119 tweets
Cluster 15: 220 tweets
Cluster 16: 59 tweets
Cluster 17: 18 tweets
Cluster 18: 176 tweets
Cluster 19: 104 tweets
Cluster 20: 32 tweets
Cluster 21: 178 tweets
Cluster 22: 8 tweets
Cluster 23: 17 tweets
Cluster 24: 108 tweets
Cluster 25: 48 tweets
