In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.2.0-py3-none-any.whl (390 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m390.6/390.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.11.2-py3-none-any.whl (225 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.3/225.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.9.1 (from optuna)
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.11.2 cmaes-0.10.0 colorlog-6.7.0 optuna-3.2.0


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
import math
import seaborn as sns
import logging
import random
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

In [None]:
class KMEANS:
    """K-means-style clustering of tweets under the Jaccard distance.

    Each cluster is represented by one of its own tweets: after every
    assignment pass, the representative becomes the member tweet with the
    smallest mean Jaccard distance to the rest of its cluster.

    Attributes:
        url: Location handed to ``pandas.read_csv`` (path, URL or file-like).
        bbchealth_data_list: Cleaned tweets, one lowercase string per tweet.
        kmeans_ctroid_fin: Mapping ``centroid tweet -> list of member tweets``
            produced by the most recent assignment pass of ``algorithm``.
        power: Exponent applied to each distance in
            ``Within_Cluster_Sum_Of_Squares`` (2 gives a sum of squares).
    """

    # Executed once, when the class body runs: route INFO-level log records
    # to KMEANS.log using a bare-message format.
    logging.basicConfig(filename = "KMEANS.log", level = logging.INFO, format='%(message)s')

    def __init__(self, url):
        """Store the data location and initialise empty clustering state."""
        self.url = url
        self.bbchealth_data_list = []
        self.kmeans_ctroid_fin = {}
        self.power = 2

    def dataPreprocessing(self):
        """Load the pipe-delimited tweet file and normalise the tweet text.

        Only the ``Tweet`` column is kept. Every tweet is split on whitespace,
        tokens starting with ``@``, ``#``, ``http://`` or ``https://`` are
        removed, and the remaining tokens are lowercased and re-joined with
        single spaces. Rows whose tweet field is missing are skipped.

        Returns:
            tuple: ``(cleaned_tweets, power)`` where ``cleaned_tweets`` is the
            list also stored on ``self.bbchealth_data_list``.
        """
        column_names = ["ID", "Timestamp", "Tweet"]
        bbchealth_data = pd.read_csv(self.url, delimiter='|', names=column_names)

        # A row with an empty tweet field is read as NaN, which has no
        # .split(); drop such rows instead of crashing on them.
        raw_tweets = bbchealth_data["Tweet"].dropna().astype(str)

        skipped_prefixes = ('@', '#', 'http://', 'https://')
        self.bbchealth_data_list = [
            ' '.join(
                word.lower()
                for word in tweet.split()
                if not word.startswith(skipped_prefixes)
            )
            for tweet in raw_tweets
        ]

        return self.bbchealth_data_list, self.power

    def algorithm(self, intial_cluster_points, max_iterations=None):
        """Alternate assignment and centroid updates until centroids stop changing.

        Args:
            intial_cluster_points: List of tweets used as the starting centroids.
            max_iterations: Optional cap on the number of assign/update rounds.
                ``None`` (the default) iterates until the centroid list is
                unchanged between two consecutive rounds.

        The clusters of the last assignment pass are stored on
        ``self.kmeans_ctroid_fin``; nothing is returned.
        """
        modified_cp = []
        completed_rounds = 0

        while modified_cp != intial_cluster_points:
            if max_iterations is not None and completed_rounds >= max_iterations:
                break
            if modified_cp:
                intial_cluster_points = modified_cp

            kmeans_ctroid = self._create_kmeans_ctroid(intial_cluster_points)
            kmeans_ctroid = self._cluster_points(kmeans_ctroid)

            # New centroid points
            modified_cp = self._update_cluster_points(kmeans_ctroid)
            self.kmeans_ctroid_fin = kmeans_ctroid
            completed_rounds += 1

    def _create_kmeans_ctroid(self, intial_cluster_points):
        """Return ``{centroid: []}`` with a separate empty list per centroid.

        Duplicate centroid strings collapse into a single key.
        """
        return {cluster_t: [] for cluster_t in intial_cluster_points}

    def _cluster_points(self, kmeans_ctroid):
        """Append every tweet to the member list of its nearest centroid.

        Ties go to the centroid that comes first in ``kmeans_ctroid``.
        The mapping is modified in place and also returned.
        """
        if not kmeans_ctroid:
            # No centroid to assign to; previously this raised KeyError('').
            return kmeans_ctroid

        # Tokenise each centroid once instead of once per tweet.
        centroid_tokens = {
            cluster_t: self._tokens(cluster_t) for cluster_t in kmeans_ctroid
        }

        for tweet in self.bbchealth_data_list:
            tweet_tokens = self._tokens(tweet)
            # min() returns the first minimal element, which keeps the
            # "earliest centroid wins a tie" rule.
            nearest = min(
                kmeans_ctroid,
                key=lambda cluster_t: self.JaccardiDistance(
                    tweet_tokens, centroid_tokens[cluster_t]
                ),
            )
            kmeans_ctroid[nearest].append(tweet)

        return kmeans_ctroid

    def _update_cluster_points(self, kmeans_ctroid):
        """Pick the new representative tweet of every cluster.

        For each cluster the member with the smallest mean Jaccard distance to
        all members (itself included) is chosen; the first such member wins a
        tie. A cluster without members keeps its current centroid rather than
        degenerating to an empty-string centroid.

        Returns:
            list: One centroid per key of ``kmeans_ctroid``, in key order.
        """
        modified_cp = []
        for centroid, tweets in kmeans_ctroid.items():
            if not tweets:
                modified_cp.append(centroid)
                continue

            # Tokenise each member once; the pairwise loop below is quadratic.
            token_sets = [self._tokens(tweet) for tweet in tweets]
            member_count = len(token_sets)

            best_index = min(
                range(member_count),
                key=lambda index: sum(
                    self.JaccardiDistance(token_sets[index], other)
                    for other in token_sets
                ) / member_count,
            )
            modified_cp.append(tweets[best_index])

        return modified_cp

    @staticmethod
    def _tokens(tweet):
        """Return the token set of ``tweet``.

        Strings are split on whitespace into words; sets are used as-is; any
        other iterable is converted with ``set()``.
        """
        if isinstance(tweet, (set, frozenset)):
            return tweet
        if isinstance(tweet, str):
            return set(tweet.split())
        return set(tweet)

    def JaccardiDistance(self, tweetA, tweetB):
        """Jaccard distance ``1 - |A ∩ B| / |A ∪ B|`` between two tweets.

        String arguments are compared as sets of whitespace-separated words
        (not as sets of characters); non-string iterables are compared as sets
        of their elements.

        Returns:
            float: A value in ``[0, 1]``. Two empty tweets are treated as
            identical and yield ``0.0`` instead of dividing by zero.
        """
        tokens_a = self._tokens(tweetA)
        tokens_b = self._tokens(tweetB)

        union_size = len(tokens_a | tokens_b)
        if union_size == 0:
            return 0.0

        return 1 - len(tokens_a & tokens_b) / union_size

    def Within_Cluster_Sum_Of_Squares(self):
        """Sum ``distance(centroid, tweet) ** self.power`` over all clusters.

        Uses the clusters stored on ``self.kmeans_ctroid_fin``; returns ``0``
        when no clustering has been stored yet.
        """
        return sum(
            self.JaccardiDistance(centroid, tweet) ** self.power
            for centroid, tweets in self.kmeans_ctroid_fin.items()
            for tweet in tweets
        )


In [None]:
DATA_URL = "https://raw.githubusercontent.com/YaswanthAd/KNN_Tweets/main/bbchealth.txt"
SEED = 42  # base seed for the optuna sampler and for centroid initialisation

# Cleaned tweets, filled on first use so the file is downloaded only once.
_TWEETS_CACHE = []


def _load_tweets():
    """Return the cleaned tweets, downloading and preprocessing them only once."""
    if not _TWEETS_CACHE:
        tweets, _ = KMEANS(DATA_URL).dataPreprocessing()
        _TWEETS_CACHE.extend(tweets)
    return _TWEETS_CACHE


def objective(trial):
    """Optuna objective: cluster the tweets with a suggested ``k``.

    The initial centroids are drawn with a generator seeded from ``SEED`` and
    the trial number, so a given trial is reproducible. The size of every
    resulting cluster is stored in the trial's ``cluster_sizes`` user
    attribute so it can be reported later without clustering again.

    Args:
        trial: The optuna trial supplying the hyper-parameter ``k``.

    Returns:
        The within-cluster sum of powered Jaccard distances for this trial.
    """
    tweets = _load_tweets()
    Kmeans_instance = KMEANS(DATA_URL)
    Kmeans_instance.bbchealth_data_list = tweets

    k = trial.suggest_int('k', 5, 100, step=5)

    # Sample from distinct, non-empty tweets: identical strings would collapse
    # into one centroid key and silently yield fewer than k clusters.
    candidates = [tweet for tweet in dict.fromkeys(tweets) if tweet]
    rng = random.Random(SEED + trial.number)
    cluster_initalizers = rng.sample(candidates, k)
    Kmeans_instance.algorithm(cluster_initalizers)

    trial.set_user_attr(
        'cluster_sizes',
        [len(members) for members in Kmeans_instance.kmeans_ctroid_fin.values()],
    )

    return Kmeans_instance.Within_Cluster_Sum_Of_Squares()

if __name__ == '__main__':
    study = optuna.create_study(
        direction = 'minimize',
        sampler = optuna.samplers.TPESampler(seed=SEED),
    )
    study.optimize(objective, n_trials=20)

    for trial in study.trials:
        # Failed or pruned trials have no value and no recorded clusters.
        if trial.state != optuna.trial.TrialState.COMPLETE:
            continue
        logging.info("-------------------------------------------------------------\n")
        logging.info("K: {}, SSE: {}".format(trial.params['k'], trial.value))
        # Report the clusters that produced this trial's SSE; clustering again
        # here with new random centroids would log unrelated cluster sizes.
        for count, size in enumerate(trial.user_attrs['cluster_sizes'], start=1):
            logging.info("{} : {} tweets".format(count, size))

    best_params = study.best_params
    logging.info("Best Parameters: {}".format(best_params))
    logging.info("Best SSE: {}".format(study.best_value))


[I 2023-08-04 23:24:36,523] A new study created in memory with name: no-name-8a6cd53c-873e-40fe-95aa-4b842862ea40
[I 2023-08-04 23:24:52,368] Trial 0 finished with value: 174.9251613443523 and parameters: {'k': 90}. Best is trial 0 with value: 174.9251613443523.
[I 2023-08-04 23:25:13,875] Trial 1 finished with value: 233.52246465182574 and parameters: {'k': 25}. Best is trial 0 with value: 174.9251613443523.
[I 2023-08-04 23:25:37,733] Trial 2 finished with value: 276.73514991480334 and parameters: {'k': 10}. Best is trial 0 with value: 174.9251613443523.
[I 2023-08-04 23:26:22,357] Trial 3 finished with value: 321.85075953706735 and parameters: {'k': 5}. Best is trial 0 with value: 174.9251613443523.
[I 2023-08-04 23:26:41,064] Trial 4 finished with value: 220.7994657242106 and parameters: {'k': 30}. Best is trial 0 with value: 174.9251613443523.
[I 2023-08-04 23:26:55,153] Trial 5 finished with value: 172.04264792590084 and parameters: {'k': 100}. Best is trial 5 with value: 172.042