In [None]:
# Display and Data Manipulation libraries
import pandas as pd
import numpy as np
import random as rd
import os
import re
from IPython.display import display_html
from itertools import chain, cycle

# Data Visualization libraries
import matplotlib.pyplot as plt
from matplotlib.pyplot import bar
import seaborn as sns
from wordcloud import WordCloud
from nltk.stem.porter import *
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Preprocessing Module libraries
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from collections import defaultdict
from ast import literal_eval

# Clustering Module libraries
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans 

# Classification Module libraries
from sklearn import model_selection, naive_bayes, svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split

# Evaluation Module libraries
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics import confusion_matrix, pairwise_distances
from scipy import stats

# Model Implementation libraries
import joblib

# PRE-MODULES

## METHODS & CONSTANTS

In [None]:
# Set pandas display options
def set_pandas_display_options() -> None:
    """Raise the pandas display limits so wide/long frames are shown untruncated."""
    settings = {
        'display.max_columns': 1000,
        'display.max_rows': 1000,
        'display.max_colwidth': 199,
        'display.width': 1000,
    }
    for option_name, option_value in settings.items():
        pd.set_option(option_name, option_value)
set_pandas_display_options()

# Display dataframes side by side
def display_side_by_side(*args, titles=cycle([''])):
    """Render several DataFrames next to each other in the notebook output.

    Parameters
    ----------
    *args : pandas.DataFrame
        Frames to render, left to right.
    titles : iterable of str, optional
        One heading per frame. When fewer titles than frames are supplied,
        the remaining frames get the '</br>' filler as heading.

    Returns
    -------
    None
        The HTML is sent to the notebook via ``display_html``.
    """
    html_str = ''
    for df, title in zip(args, chain(titles, cycle(['</br>']))):
        html_str += '<th style="text-align:center"><td style="vertical-align:top">'
        html_str += f'<h4 style="text-align: center;">{title}</h4>'
        # Patch only the opening <table> tag. Replacing every occurrence of
        # the word "table" also rewrote cell contents (e.g. "vegetable") and
        # the closing tag.
        html_str += df.to_html().replace('<table', '<table style="display:inline"', 1)
        html_str += '</td></th>'
    display_html(html_str, raw=True)
    
# Convert list to list of lists
def extract(lst):
    """Wrap every element of ``lst`` in its own one-item list."""
    wrapped = []
    for element in lst:
        wrapped.append([element])
    return wrapped

# Convert list of lists to list (similar to NumPy flatten())
def flatten(lst):
    """Concatenate the items of every sub-sequence of ``lst`` into one flat list (one level deep)."""
    flat = []
    for sublist in lst:
        for item in sublist:
            flat.append(item)
    return flat

In [None]:
# Define classes and their colors
# One row per sentiment class: (numeric target, display name, plot colour)
_sentiment_table = (
    (-1.0, 'Negative', 'Red'),
    (0.0, 'Neutral', 'Blue'),
    (1.0, 'Positive', 'Green'),
)
classes = [label for _, label, _ in _sentiment_table]
targets = [score for score, _, _ in _sentiment_table]
targets_str = [str(score) for score in targets]
colors = [color for _, _, color in _sentiment_table]

# Define no. of clusters and classifiers, and metrics
n_clusters = 6
n_classifiers = 4
clustering_metrics = [
    'Silhouette Coefficient',
    'Calinski-Harabasz Index',
    'Davies-Bouldin Index',
]
classifier_metrics = [
    'Accuracy',
    'Precision',
    'Recall',
    'F1 Score',
]

# File path
PATH = 'datasets/'

## DATA INITIALIZATION

In [None]:
# Read .CSV and rename its columns to the names used by the rest of the notebook
dataset_csv = os.path.join(PATH, 'covid19_tweets_dataset_translated.csv')
renamed_columns = {'TWEETS': 'text_original', 'SENTIMENT': 'SENTIMENT SCORE', 'Translated_Tweets': 'text'}
df = pd.read_csv(dataset_csv)
df = df.rename(columns=renamed_columns)
# Keep only the first 9160 rows and the first three columns
df = df.iloc[0:9160, 0:3]

In [None]:
# Drop the null values if there is any, then renumber the rows 0..n-1.
# Later cells pair rows by position (enumerate-based writes, concatenation
# with freshly built frames, assignment of cluster labels), so the index must
# not keep the gaps left behind by the dropped rows.
df = df.dropna().reset_index(drop=True)
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that would add a stray "None" line).
df.info()

# Produce a copy of the dataset
df2 = df.copy()

In [None]:
# Extract hashtags list from dataset
hashtags = df2.text_original.str.extractall(r'(\#\w+)').reset_index(drop=True)
hashtags_list = [entry[1:] for entry in hashtags[0].str.lower()] # lower and remove hashtag symbol
hashtags_list = [re.sub('[^A-Za-z0-9]+', '', entry) for entry in hashtags_list] # remove symbols
# Drop empty and digit-only entries, de-duplicate AFTER normalisation (two
# different raw hashtags can collapse to the same text) and sort so the saved
# file is identical from run to run (plain set iteration order is not).
hashtags_list = sorted({entry for entry in hashtags_list if entry and not entry.isdigit()})

In [None]:
# Save hashtags list as .CSV
# One hashtag per line; the file is read back in the MODEL IMPLEMENTATION cell.
np.savetxt(os.path.join(PATH, 'hashtags_list.csv'), hashtags_list, fmt='% s', encoding='utf-8')

## DATA VISUALIZATION

In [None]:
# Count Plot Visualization
sentiment_column = df2['SENTIMENT SCORE']
# Absolute and relative frequency of every class, in the order given by `targets`
abs_values = sentiment_column.value_counts()[targets]
rel_values = sentiment_column.value_counts(normalize=True)[targets].values * 100
lbls = ['{} ({:.0f}%)'.format(count, share) for count, share in zip(abs_values, rel_values)]

bx = sns.countplot(x=sentiment_column, order=sentiment_column.value_counts()[targets].index, palette=colors)
bx.set_title('COVID19 TWEETS DATASET', fontsize=15)
bx.bar_label(container=bx.containers[0], labels=lbls)

# I. PREPROCESSING MODULE

In [None]:
# Remove whitespace between @ and name
def fix_mention(entry):
    """Return ``entry`` with every ``'@ '`` (at-sign followed by a space) collapsed to ``'@'``."""
    return re.compile(r'@ ').sub('@', entry)

# Remove hyperlinks with whitespace in between
def fix_hyperlink(entry):
    """Delete every match of the space-tolerant hyperlink pattern from ``entry``.

    The pattern accepts optional single whitespace characters between the parts
    of a URL (scheme, slashes, host, path).
    """
    spaced_link = re.compile(r'https?(\s|)([!:])(\s|)/(\s|)(/|)(\s|)\S+(\s|).(\s|)\S+(\s|)(/|)(\s|)\S+')
    return spaced_link.sub('', entry)

# Tagging words if it is noun, adjective, verb, or adverb
def tagging_map():
    """Map the first letter of a POS tag to a WordNet part of speech.

    'J' -> adjective, 'V' -> verb, 'R' -> adverb; any other key yields noun.
    """
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map.update({'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV})
    return tag_map

# Save feature names or vocabulary
def save_feature_names(feature_names):
    """Write ``feature_names`` to ``feature_names.csv`` under ``PATH``, one entry per line."""
    vocabulary_csv = os.path.join(PATH, 'feature_names.csv')
    np.savetxt(vocabulary_csv, feature_names, fmt='% s', encoding='utf-8')


# Preprocess class
class Preprocess:
    """Step-by-step text preprocessing of a DataFrame that has a 'text' column.

    Every step adds (or overwrites) a column on the wrapped frame in place and
    returns that same frame:
    'text' -> 'text_lower' -> 'text_cleaned' -> 'text_tokenized' -> 'text_preprocessed'.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe

    # Change all the text to lower case
    def lower(self):
        """Store the lower-cased 'text' column as 'text_lower'."""
        self.dataframe['text_lower'] = [entry.lower() for entry in self.dataframe['text']]
        return self.dataframe

    # Remove whitespace between mentions and hyperlinks with whitespace in between
    def cleaning_a(self):
        """Create 'text_cleaned' from 'text_lower' via fix_mention() and fix_hyperlink()."""
        self.dataframe['text_cleaned'] = [fix_mention(entry) for entry in self.dataframe['text_lower']]
        self.dataframe['text_cleaned'] = [fix_hyperlink(entry) for entry in self.dataframe['text_cleaned']]
        return self.dataframe

    # Remove mentions, hyperlinks, and symbols
    def cleaning_b(self):
        """Strip @mentions, regular hyperlinks and non [A-Za-z0-9_ ] characters from 'text_cleaned'."""
        self.dataframe['text_cleaned'] = [re.sub('@[A-Za-z0-9_]+', '', entry) for entry in
                                        self.dataframe['text_cleaned']]
        self.dataframe['text_cleaned'] = [re.sub('https?([!:])//\\S+', '', entry) for entry in
                                        self.dataframe['text_cleaned']]
        self.dataframe['text_cleaned'] = [re.sub('[^A-Za-z0-9_ ]+', '', entry) for entry in
                                        self.dataframe['text_cleaned']]
        return self.dataframe

    # Tokenization process
    def tokenization(self):
        """Tokenize 'text_cleaned' with word_tokenize into 'text_tokenized'."""
        self.dataframe['text_tokenized'] = [word_tokenize(entry) for entry in self.dataframe['text_cleaned']]
        return self.dataframe

    # Lemmatization using WordNetLemmatizer()
    def lemmatization(self, hashtags_list):
        """Drop stop words / unwanted tokens and lemmatize the rest.

        A token is kept when it is not an English stop word and is either purely
        alphabetic or listed in ``hashtags_list``. The surviving lemmas of every
        row are stored as the string form of a list in 'text_preprocessed'.

        Parameters
        ----------
        hashtags_list : iterable of hashable
            Tokens to keep even when they are not purely alphabetic.
        """
        tag_map = tagging_map()
        word_lemmatized = WordNetLemmatizer()
        # Built once: the stop-word list was previously reloaded for every
        # token, and both membership tests ran against lists.
        stop_words = set(stopwords.words('english'))
        kept_hashtags = set(hashtags_list)

        preprocessed = []
        for entry in self.dataframe['text_tokenized']:
            # Lemmatization of each token is based on tag_map
            lemmatized = [
                word_lemmatized.lemmatize(word, tag_map[tag[0]])
                for word, tag in pos_tag(entry)
                if word not in stop_words and (word.isalpha() or word in kept_hashtags)
            ]
            preprocessed.append(str(lemmatized))

        # Assign the whole column positionally. Writing row by row through
        # .loc[<enumerate position>] targets index LABELS, which misplaces the
        # values (and appends new rows) whenever the index is not 0..n-1.
        self.dataframe['text_preprocessed'] = preprocessed
        return self.dataframe

    def vectorization(self):
        """TF-IDF vectorize 'text_preprocessed' and append the sentiment column.

        Returns
        -------
        (pandas.DataFrame, array of str)
            The normalized TF-IDF matrix as a frame with one column per
            vocabulary term plus 'SENTIMENT SCORE', and the vocabulary itself.
        """
        tfidf = TfidfVectorizer()
        x = tfidf.fit_transform(self.dataframe['text_preprocessed'])
        feature_names = tfidf.get_feature_names_out()
        df_vectorized = pd.DataFrame(normalize(x).toarray(), columns=feature_names)
        # Take the labels from the wrapped frame (not from a notebook global)
        # and drop its index so that rows are paired by position with the
        # freshly built 0..n-1 feature frame.
        sentiment = self.dataframe['SENTIMENT SCORE'].reset_index(drop=True)
        df_vectorized = pd.concat([df_vectorized, sentiment], axis=1)
        return df_vectorized, feature_names


def preprocess(dataframe, hashtags_list):
    """Run lower-casing, both cleaning passes, tokenization and lemmatization on ``dataframe``.

    Returns the frame produced by ``Preprocess.lemmatization``.
    """
    pipeline = Preprocess(dataframe)
    for step in (pipeline.lower, pipeline.cleaning_a, pipeline.cleaning_b, pipeline.tokenization):
        step()
    return pipeline.lemmatization(hashtags_list)

def vectorize(dataframe):
    """Vectorize ``dataframe``, persist the vocabulary via save_feature_names() and return the feature frame."""
    vectorized, vocabulary = Preprocess(dataframe).vectorization()
    save_feature_names(vocabulary)
    return vectorized


In [None]:
# Preprocessing
# preprocess() adds the text_lower / text_cleaned / text_tokenized /
# text_preprocessed columns to df2 in place and returns the same frame.
df2 = preprocess(df2, hashtags_list)

# Vectorization
# Also writes the vocabulary to feature_names.csv (see save_feature_names).
df_vectorized = vectorize(df2)

# II. CLUSTERING MODULE

In [None]:
# Separate X and y
# X: every column except the label; y: the 'SENTIMENT SCORE' label column
X = df_vectorized.drop(['SENTIMENT SCORE'], axis=1)
y = df_vectorized['SENTIMENT SCORE']

# Show distribution of instances per cluster
def ModKMeansResults(labels):
    """Print how many instances carry each cluster label, followed by the overall total."""
    per_cluster = pd.DataFrame(labels).value_counts()
    total = sum(per_cluster.tolist())
    print(per_cluster)
    print(f'Total number of instances = {total}')

## MODIFIED K-MEANS CLUSTERING
1. PCA & PERCENTILE METHOD
2. WEIGHTED AVERAGE METHOD

In [None]:
# Get Percentile List
def PercentileList(ptiles, dataframe, n_clusters):
    """Return the upper percentile boundary of each of ``n_clusters`` buckets.

    Parameters
    ----------
    ptiles : float
        Width of one bucket in percent (e.g. 100 / n_clusters).
    dataframe : pandas.DataFrame
        Frame whose column ``0`` holds the values to split.
    n_clusters : int
        Number of buckets.

    Returns
    -------
    list of float
        ``[P(ptiles*1), P(ptiles*2), ..., P(ptiles*n_clusters)]`` of column 0,
        computed with NumPy's default (linear) percentile method.
    """
    percentile_list = []
    for x in range(1, n_clusters+1):
        # ptiles*x can exceed 100 by a rounding error, which np.percentile
        # rejects, so the requested percentile is clamped.
        q = min(ptiles*x, 100.0)
        # The deprecated `interpolation='linear'` keyword is omitted: linear
        # is the default method.
        percentile = np.percentile(a=dataframe[0], q=q)
        percentile_list.append(percentile)
    return percentile_list


# Cluster class
class Cluster:
    """Initial-centroid strategies for the modified K-Means clustering.

    ``X`` is the feature frame and ``n_clusters`` the number of clusters.
    """

    def __init__(self, X, n_clusters):
        self.X = X
        self.n_clusters = n_clusters

    # Principal Component Analysis (PCA) Method
    def PCAMethod(self, pca_random_state):
        """Project ``self.X`` onto two principal components (columns 0 and 1)."""
        pca = PCA(n_components=2, random_state=pca_random_state)
        return pd.DataFrame(pca.fit_transform(self.X))

    # Percentile Method
    def PercentileMethod(self, dataframe):
        """Split the rows of ``dataframe`` into ``n_clusters`` percentile buckets of column 0.

        Works for any number of clusters. A row goes to the first bucket whose
        upper boundary is >= its column-0 value; a value above the last
        boundary (possible through floating-point rounding of the final
        percentile) goes to the last bucket instead of being dropped.

        Returns
        -------
        list of pandas.DataFrame
            One frame per bucket, rows renumbered from 0.
        """
        ptiles = 1/self.n_clusters*100
        percentile_list = PercentileList(ptiles, dataframe, self.n_clusters)

        # side='left' yields the first boundary that is >= the value, i.e. the
        # first k with value <= percentile_list[k].
        bucket = np.searchsorted(percentile_list, dataframe[0].to_numpy(), side='left')
        bucket = np.minimum(bucket, self.n_clusters - 1)

        rows = dataframe.to_numpy()
        return [pd.DataFrame(rows[bucket == x].tolist()) for x in range(self.n_clusters)]

    # Weighted Average Method
    def WeightedAverage(self):
        """Return, per row of ``self.X``, the sum of the row divided by its number of non-zero entries.

        NOTE(review): a row that contains only zeros divides 0 by 0 — confirm
        such rows cannot reach this method.
        """
        return np.array(np.true_divide(self.X.sum(1), (self.X != 0).sum(1)))

    # Define Initial Centroids (PCA & Percentile)
    def InitialCentroidsPP(self, df_clusters):
        """Return the column-wise mean of every bucket frame as a list of centroid coordinates."""
        return [[df_clusters[x][y].mean() for y in range(len(df_clusters[x].columns))] for x in range(self.n_clusters)]

    # Define Initial Centroids (Weighted Average)
    def InitialCentroidsWA(self, row_means):
        """Pick one initial centroid value per cluster from ``row_means``.

        The values are sorted and split into ``n_clusters`` consecutive groups;
        for every group the member closest to the group mean is selected.

        Parameters
        ----------
        row_means : 1-D array-like of float

        Returns
        -------
        numpy.ndarray
            1-D array with ``n_clusters`` selected values, in ascending group order.

        Raises
        ------
        ValueError
            If there are fewer values than clusters.
        """
        row_means = np.asarray(row_means)
        if len(row_means) < self.n_clusters:
            raise ValueError('InitialCentroidsWA needs at least n_clusters values')

        sorted_means = np.sort(row_means, kind='mergesort')
        centroids = []
        for group in np.array_split(sorted_means, self.n_clusters):
            g_mean = sum(group)/len(group)
            # For one-dimensional data the Euclidean distance is the absolute difference
            nearest = np.argmin(np.abs(group - g_mean))
            # Take the value from the group itself. `nearest` is a position
            # inside this sorted group, so it must not be used to index the
            # unsorted `row_means`.
            centroids.append(group[nearest])
        return np.array(centroids)
    

# Modified K-Means (PCA & Percentile)
def ModKMeansPP(X, n_clusters, pca_random_state):
    """Build an unfitted KMeans seeded with the PCA & percentile centroids.

    Returns the KMeans estimator and the two-component PCA frame it should be fitted on.
    """
    # Initial centroids: mean of every percentile bucket of the PCA projection
    clusterer = Cluster(X, n_clusters)
    df_pca = clusterer.PCAMethod(pca_random_state)
    buckets = clusterer.PercentileMethod(df_pca)
    initial_centers = clusterer.InitialCentroidsPP(buckets)

    # K-Means Clustering Process with Proposed Initial Centroids
    modkmeans = KMeans(
        n_clusters=n_clusters,
        init=initial_centers,
        random_state=None,
        n_init=1,
        max_iter=1500,
    )
    return modkmeans, df_pca

# Modified K-Means (Weighted Average)
def ModKMeansWA(X, n_clusters):
    """Build an unfitted KMeans seeded with the weighted-average centroids.

    Returns the KMeans estimator and the row means reshaped to a single column,
    which is the data the estimator should be fitted on.
    """
    # Initial centroids through the Weighted Average Method
    clusterer = Cluster(X, n_clusters)
    row_means = clusterer.WeightedAverage()
    initial_centers = extract(clusterer.InitialCentroidsWA(row_means))

    # K-Means Clustering Process with Proposed Initial Centroids
    modkmeans = KMeans(n_clusters=n_clusters, init=initial_centers, n_init=1, max_iter=1500)
    return modkmeans, row_means.reshape(-1, 1)

## MODIFIED K-MEANS CLUSTERING

In [None]:
# Modified K-Means Clustering Implementation (PCA & Percentile)
# Uses the n_clusters constant (not a repeated literal) so this cell stays in
# step with the cluster dividing, labelling and plotting cells below.
modkmeans_pp, dataframe = ModKMeansPP(X, n_clusters=n_clusters, pca_random_state=42)
modkmeans = modkmeans_pp.fit(dataframe)
print('Modified K-Means Clustering (PCA & Percentile)')
ModKMeansResults(modkmeans.labels_)

# Modified K-Means Clustering Implementation (Weighted Average)
# X already is the vectorized frame without the 'SENTIMENT SCORE' column.
modkmeans_wa, means = ModKMeansWA(X, n_clusters=n_clusters)
modkmeans2 = modkmeans_wa.fit(means)
print('\nModified K-Means Clustering (Weighted Average)')
ModKMeansResults(modkmeans2.labels_)

## CLUSTER DIVIDING & LABELING

In [None]:
# Add cluster number to each instance in the Tweets Dataset
def Labels(df_vectorized, modkmeans_labels):
    """Store ``modkmeans_labels`` in the 'CLUSTER NUMBER' column of ``df_vectorized``.

    The frame is modified in place (an existing 'CLUSTER NUMBER' column is
    overwritten) and the same frame object is returned.
    """
    df_vectorized['CLUSTER NUMBER'] = modkmeans_labels
    return df_vectorized


# ClusterDivide class
class ClusterDivide:
    """Split a labelled feature frame into one feature frame and one target series per cluster.

    ``cluster_X`` and ``cluster_y`` are caller-supplied lists with at least
    ``n_clusters`` slots; their slots are overwritten in place.
    """

    def __init__(self, cluster_X, cluster_y, n_clusters):
        self.cluster_X = cluster_X
        self.cluster_y = cluster_y
        self.n_clusters = n_clusters

    # Divide the Tweets Dataset based on clusters
    def DivideCluster(self, df_vectorized):
        """Fill slot k of cluster_X with the rows of cluster k and slot k of cluster_y with their int32 sentiment."""
        for k in range(self.n_clusters):
            self.cluster_X[k] = df_vectorized[df_vectorized['CLUSTER NUMBER'] == k]
        for k, members in enumerate(self.cluster_X[:self.n_clusters]):
            self.cluster_y[k] = pd.Series(members['SENTIMENT SCORE'], dtype='int32')
        return self.cluster_X, self.cluster_y, df_vectorized

    # Drop the 'CLUSTER NUMBER' and 'SENTIMENT SCORE' columns
    def DropColumn(self, df_vectorized):
        """Remove the helper columns from every cluster frame and 'CLUSTER NUMBER' from a copy of ``df_vectorized``."""
        helper_columns = ['CLUSTER NUMBER', 'SENTIMENT SCORE']
        for k in range(self.n_clusters):
            self.cluster_X[k] = self.cluster_X[k].drop(columns=helper_columns)
        without_cluster = df_vectorized.drop(columns=['CLUSTER NUMBER'])
        return self.cluster_X, without_cluster

    # Display the clusters
    def DisplayCluster(self):
        """Print the number of rows held for every cluster."""
        for k in range(self.n_clusters):
            print(f'Cluster {k} -> {len(self.cluster_X[k])}')


class ClusterLabel:
    """Turn stringified token lists back into plain text and pair it with sentiment and cluster labels.

    ``series`` holds the string form of token lists; ``text`` is the list the
    detokenized sentences are appended to.
    """

    def __init__(self, series, text):
        self.series = series
        self.text = text

    # Detokenizer before using CountVectorizer
    def detokenization(self):
        """Parse every entry of ``series`` with literal_eval, detokenize it and append the result to ``text``."""
        detokenizer = TreebankWordDetokenizer()

        for serialized_tokens in self.series:
            tokens = literal_eval(serialized_tokens)
            self.text.append(detokenizer.detokenize(tokens))
        return self.text

    # Dataframe with TEXT, SENTIMENT SCORE, and CLUSTER NUMBER
    def create_dataframe(self, sentiment_score, modkmeans_labels):
        """Return a frame with the columns TEXT, SENTIMENT SCORE and CLUSTER NUMBER."""
        text_frame = pd.DataFrame(self.text).rename(columns={0: 'TEXT'})
        return text_frame.assign(**{
            'SENTIMENT SCORE': sentiment_score,
            'CLUSTER NUMBER': modkmeans_labels,
        })


# Cluster Dividing
def cluster_dividing(df_vectorized, modkmeans_labels):
    """Label ``df_vectorized`` with cluster numbers and split it per cluster.

    Uses the notebook-level ``n_clusters``. Returns the per-cluster feature
    frames, the per-cluster sentiment series and the vectorized frame without
    the 'CLUSTER NUMBER' column.
    """
    # One empty slot per cluster, filled by ClusterDivide
    slots_X = [[] for _ in range(n_clusters)]
    slots_y = [[] for _ in range(n_clusters)]

    # Process
    labelled = Labels(df_vectorized, modkmeans_labels)
    divider = ClusterDivide(slots_X, slots_y, n_clusters)
    cluster_X, cluster_y, labelled = divider.DivideCluster(labelled)
    cluster_X, labelled = divider.DropColumn(labelled)
    return cluster_X, cluster_y, labelled

# Cluster Labeling using CountVectorizer
def cluster_labeling(n_clusters, top_words, sentiment_score, modkmeans_labels):
    """Build the labelled text frame and the most frequent tokens of every cluster.

    Reads the notebook-level ``df2['text_preprocessed']`` column.

    Parameters
    ----------
    n_clusters : int
        Number of clusters to summarise.
    top_words : int
        Number of most frequent tokens kept per cluster.
    sentiment_score : sequence
        Values for the 'SENTIMENT SCORE' column of the labelled frame.
    modkmeans_labels : sequence
        Cluster number of every text, stored as 'CLUSTER NUMBER'.

    Returns
    -------
    (pandas.DataFrame, list of pandas.DataFrame)
        The frame with TEXT / SENTIMENT SCORE / CLUSTER NUMBER, and one frame
        per cluster with the columns 'Hashtag' (the token), 'Total' (its count)
        and '%' (its share of all token occurrences in the cluster).
    """
    # Initialize
    cv = CountVectorizer()
    cv_clusters = [pd.DataFrame() for x in range(n_clusters)]
    
    # Create dataframe for Cluster Labeling
    cluster_label = ClusterLabel(df2['text_preprocessed'], list())
    cluster_label.detokenization()
    dataframe = cluster_label.create_dataframe(sentiment_score, modkmeans_labels)

    # Get the frequency of each token from the text
    for x in range(n_clusters):
        # Document-term counts of the texts that belong to cluster x
        matrix = cv.fit_transform(dataframe['TEXT'][dataframe['CLUSTER NUMBER'] == x])
        counts = pd.DataFrame(matrix.toarray(), columns = cv.get_feature_names_out())
        # Append a 'Total' row holding the per-token sum over all documents
        counts.loc['Total', :] = counts.sum(axis=0)

        # Create the DataFrame
        # The last row is the 'Total' row added above
        freq = counts.iloc[len(counts)-1]
        cv_clusters[x] = pd.DataFrame(freq, dtype='int').sort_values(by = ['Total'], ascending = False)
        # '%' is computed over ALL tokens of the cluster, before truncating to top_words
        cv_clusters[x]['%'] = cv_clusters[x]['Total'].astype(float).transform(lambda x: x / x.sum() * 100).round(2)
        # The token column is named 'Hashtag' although it holds every CountVectorizer feature name
        cv_clusters[x] = cv_clusters[x].reset_index().rename(columns = {'index': 'Hashtag'}).iloc[0:top_words]
    return dataframe, cv_clusters

In [None]:
# Divide and label clusters
# Uses the labels of the PCA & Percentile model (modkmeans). cluster_dividing()
# writes 'CLUSTER NUMBER' into the frame it receives and returns a copy without it.
cluster_X, cluster_y, df_vectorized = cluster_dividing(df_vectorized, modkmeans.labels_)
# Top 5 tokens per cluster, used by the bar plots below
df_labelled, cv_clusters = cluster_labeling(n_clusters=n_clusters, top_words=5, 
    sentiment_score=df2['SENTIMENT SCORE'], modkmeans_labels=modkmeans.labels_)

## CLUSTER VISUALIZATION

In [None]:
# Bar Plot Visualization on Hashtags per cluster
# One horizontal bar chart per cluster, showing the tokens kept in cv_clusters
title_clusters = ['CLUSTER ' + str(x) for x in range(n_clusters)]
for x in range(n_clusters):
    ax = sns.barplot(data=cv_clusters[x], x='Total', y='Hashtag', orient='h')
    ax.set_title(title_clusters[x])
    # Print the count at the end of every bar
    ax.bar_label(ax.containers[0])
    plt.show()

In [None]:
# Count Plot Visualization on Sentiment Score per cluster
# Three plots per row; the number of rows follows n_clusters
# (2 rows / figsize (15, 10) for six clusters).
n_cols = 3
n_rows = -(-n_clusters // n_cols)  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 5 * n_rows), squeeze=False)
ax = list(axes.flat)

for x in range(n_clusters):
    try:
        # All three classes present: show them in the fixed `targets` order
        abs_values = cluster_y[x].value_counts()[targets]
        rel_values = cluster_y[x].value_counts(normalize=True)[targets].values * 100
        ax[x] = sns.countplot(x=cluster_y[x], ax=ax[x], order=abs_values.index, palette=colors)
    except KeyError:
        # At least one class is missing from this cluster. Without `order` the
        # bars of numeric data are drawn in ascending value order, so the
        # counts are sorted by class as well to keep every label on its bar.
        abs_values = cluster_y[x].value_counts().sort_index()
        rel_values = cluster_y[x].value_counts(normalize=True).sort_index().values * 100
        ax[x] = sns.countplot(x=cluster_y[x], ax=ax[x], palette=colors)
    lbls = [f'{p[0]} ({p[1]:.0f}%)' for p in zip(abs_values, rel_values)]
    ax[x].set_title('CLUSTER ' + str(x), fontsize=15)
    ax[x].bar_label(container=ax[x].containers[0], labels=lbls)

## CLUSTER LABELS
Cluster 0: Government Response<br>
Cluster 1: General Updates<br>
Cluster 2: News Updates<br>
Cluster 3: Case Projections and Predictions<br>
Cluster 4: COVID-19 Updates<br>
Cluster 5: COVID-19 Statistics

# III. CLASSIFICATION MODULE

## CLASSIFICATION METHODS

In [None]:
# Method for Base Estimators classification process
def BaseEstimators(classifiers, n_clusters, cluster_X, cluster_Y, results, random_state, dump):
    """Train and evaluate every base estimator on every cluster.

    Each cluster is split 70/30 into train and test data. Results are appended
    cluster by cluster and, inside a cluster, classifier by classifier, so the
    entry of classifier ``c`` on cluster ``k`` sits at index
    ``k * len(classifiers) + c``.

    Parameters
    ----------
    classifiers : list of (str, estimator)
        Named estimators; each one is re-fitted for every cluster.
    n_clusters : int
    cluster_X, cluster_Y : sequences
        Per-cluster features and targets.
    results : str
        'acc' -> accuracies only; 'eval' -> accuracies plus weighted precision,
        recall and F1; anything else ('all') -> test sets, predictions, accuracies.
    random_state : int or None
        Seed of the train/test split.
    dump : bool
        When true, every fitted model is saved to 'models/<name>_Cluster<k>.pkl'.

    Returns
    -------
    list or tuple
        See ``results``. All scores are percentages.
    """
    # Prepare lists
    Test_X_BE = []
    Test_Y_BE = []
    predictions = []
    scores = []
    ps_list = []
    rs_list = []
    fs_list = []

    if dump:
        # joblib.dump does not create missing directories
        os.makedirs('models', exist_ok=True)

    for x in range(n_clusters):
        # Prepare Train and Test Data sets
        Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(cluster_X[x], cluster_Y[x],
                test_size=0.3, random_state=random_state)
        Test_X_BE.append(Test_X)
        Test_Y_BE.append(Test_Y)

        # Process
        for name, model in classifiers:
            model.fit(Train_X, Train_Y)
            predict = model.predict(Test_X)
            # (y_true, y_pred) order, consistent with the other metric calls
            accuracy = accuracy_score(Test_Y, predict)*100
            predictions.append(predict)
            scores.append(accuracy)

            if results == 'eval':
                ps_list.append(precision_score(Test_Y, predict, labels=targets, zero_division=0, average='weighted')*100)
                rs_list.append(recall_score(Test_Y, predict, labels=targets, zero_division=0, average='weighted')*100)
                fs_list.append(f1_score(Test_Y, predict, labels=targets, zero_division=0, average='weighted')*100)

            if dump:
                joblib.dump(model, 'models/' + name + '_Cluster' + str(x) + '.pkl')

    # return accuracy only
    if results == 'acc':
        return scores
    # model evaluation
    elif results == 'eval':
        return scores, ps_list, rs_list, fs_list
    # default; results == 'all'
    else:
        return Test_X_BE, Test_Y_BE, predictions, scores

# Method to initialize ensemble model weights
def EnsembleWeights(scores, n_clusters, n_classifiers, scaler):
    """Derive the voting weights of every cluster from the base-estimator accuracies.

    Parameters
    ----------
    scores : sequence of float
        Accuracies ordered cluster by cluster, classifier by classifier
        (length ``n_clusters * n_classifiers``).
    n_clusters, n_classifiers : int
    scaler : object
        Anything with ``fit_transform(column_vector)``; it rescales the weights
        of one cluster at a time.

    Returns
    -------
    list of list of float
        ``n_clusters`` lists with ``n_classifiers`` weights each.

    Raises
    ------
    ValueError
        If ``scores`` does not hold exactly one score per (cluster, classifier).

    Notes
    -----
    If the scaler maps every weight of a cluster to zero (as a min-max scaler
    does when all classifiers score the same), equal weights are used instead:
    weights that sum to zero cannot be used for weighted voting.
    """
    if len(scores) != n_clusters*n_classifiers:
        raise ValueError('scores must contain n_clusters * n_classifiers entries')

    scaled_weights = []
    for x in range(0, n_clusters*n_classifiers, n_classifiers):
        cluster_scores = np.asarray(scores[x:x+n_classifiers], dtype=float)

        # Initial weights: share of every classifier in the summed accuracy
        total = cluster_scores.sum()
        if total > 0:
            initial_weights = cluster_scores / total
        else:
            # every classifier scored 0 -> no basis for a preference
            initial_weights = np.full(n_classifiers, 1.0 / n_classifiers)

        # Scaled weights
        scaled = np.asarray(scaler.fit_transform(initial_weights.reshape(-1, 1)), dtype=float).ravel()
        if np.isclose(scaled.sum(), 0.0):
            scaled = np.ones(n_classifiers)
        scaled_weights.append(scaled.tolist())

    # Show scaled weights
    for x in range(n_clusters):
        print('CLUSTER', str(x), 'scaled weights ->', scaled_weights[x])
    return scaled_weights

# Method for Ensemble Model process
def EnsembleModel(models, scaled_weights, n_clusters, cluster_X, cluster_Y, results, random_state, dump):
    """Train and evaluate one weighted hard-voting ensemble per cluster.

    Parameters
    ----------
    models : list of (str, estimator)
        Named base estimators handed to VotingClassifier.
    scaled_weights : sequence of sequences
        ``scaled_weights[k]`` holds the voting weights used for cluster ``k``.
    n_clusters : int
    cluster_X, cluster_Y : sequences
        Per-cluster features and targets.
    results : str
        'acc' -> accuracies only; 'eval' -> accuracies plus weighted precision,
        recall and F1; anything else ('all') -> test sets, predictions, accuracies.
    random_state : int or None
        Seed of the 70/30 train/test split.
    dump : bool
        When true, every fitted ensemble is saved to 'models/ENS_Cluster<k>.pkl'.

    Returns
    -------
    list or tuple
        See ``results``. All scores are percentages.
    """
    # Prepare lists
    Test_X_ENS = []
    Test_Y_ENS = []
    predictions_ENS = []
    scores_ENS = []
    ps_list = []
    rs_list = []
    fs_list = []

    if dump:
        # joblib.dump does not create missing directories
        os.makedirs('models', exist_ok=True)

    for x in range(n_clusters):
        # Prepare ensemble model with defined weights
        ensembles = VotingClassifier(estimators=models, weights=scaled_weights[x], voting='hard')

        # Prepare Train and Test Data sets
        Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(cluster_X[x], cluster_Y[x],
                test_size=0.3, random_state=random_state)
        Test_X_ENS.append(Test_X)
        Test_Y_ENS.append(Test_Y)

        # Fit and evaluate the ensemble method
        ensembles.fit(Train_X, Train_Y)
        predict = ensembles.predict(Test_X)
        # (y_true, y_pred) order, consistent with the other metric calls
        accuracy = accuracy_score(Test_Y, predict)*100
        predictions_ENS.append(predict)
        scores_ENS.append(accuracy)

        if results == 'eval':
            ps_list.append(precision_score(Test_Y, predict, labels=targets, zero_division=0, average='weighted')*100)
            rs_list.append(recall_score(Test_Y, predict, labels=targets, zero_division=0, average='weighted')*100)
            fs_list.append(f1_score(Test_Y, predict, labels=targets, zero_division=0, average='weighted')*100)

        if dump:
            joblib.dump(ensembles, 'models/ENS_Cluster' + str(x) + '.pkl')

    # return accuracy only
    if results == 'acc':
        return scores_ENS
    # model evaluation
    elif results == 'eval':
        return scores_ENS, ps_list, rs_list, fs_list
    # default; results == 'all'
    else:
        return Test_X_ENS, Test_Y_ENS, predictions_ENS, scores_ENS

## BASE ESTIMATORS TRAINED
('CNB', ComplementNB())<br>
('SVM', LinearSVC(random_state=42))<br>
('LR', LogisticRegression(multi_class='multinomial', random_state=42, solver='saga'))<br>
('RF', RandomForestClassifier(max_depth=120, random_state=42))<br>

## BASE ESTIMATORS

In [None]:
# Prepare classifier algorithms
# One instance per base estimator; SVM, LR and RF are seeded with random_state=42.
CNB = naive_bayes.ComplementNB()
SVM = svm.LinearSVC(C=1.0, penalty='l2', dual=True, random_state=42, max_iter=1000)
LR = LogisticRegression(C=1.0, penalty='l2', solver='saga', dual=False, 
        multi_class='multinomial', random_state=42, max_iter=100)
RF = RandomForestClassifier(max_depth=120, random_state=42)

# (name, estimator) pairs; the list length is expected to equal n_classifiers
classifiers = [('CNB', CNB), ('SVM', SVM), ('LR', LR), ('RF', RF)]
# Display names '<NAME>_CLUSTER_<k>' in cluster-major order: all classifiers of
# cluster 0 first, then cluster 1, ... (the order in which BaseEstimators appends its scores)
actual_classifiers = [(items[0] + '_CLUSTER_' + str(int(index/n_classifiers))) for index, items in enumerate(classifiers*n_clusters)]

In [None]:
# Fit and evaluate the base estimators CNB, SVM, LR, & RF
# results='all' returns the per-cluster test sets, every prediction array and the accuracies (in %)
Test_X_BE, Test_Y_BE, predictions, scores = BaseEstimators(classifiers, n_clusters, cluster_X, cluster_y,
        results='all', random_state=42, dump=False)

# Accuracies
# scores and actual_classifiers share the same cluster-major ordering
for x in range(len(actual_classifiers)):
    print(actual_classifiers[x] + '_accuracy -> %.4f' % scores[x])

## COMBINATION ENSEMBLE MODEL

In [None]:
# Prepare the scaler and the per-cluster ensemble model weights
# The weights are derived from the base-estimator accuracies in `scores`
scaler = MinMaxScaler()
scaled_weights = EnsembleWeights(scores, n_clusters, n_classifiers, scaler)

In [None]:
# Fit and evaluate the Ensemble Model
# Same test_size and random_state as the base estimators
Test_X_ENS, Test_Y_ENS, predictions_ENS, scores_ENS = EnsembleModel(classifiers, scaled_weights, n_clusters, cluster_X, cluster_y,
        results='all', random_state=42, dump=False)

# Accuracy
# One accuracy (in %) per cluster
for x in range(n_clusters):
    print('ENS_CLUSTER_' + str(x) + '_accuracy -> %.4f' % scores_ENS[x])

# IV. EVALUATION MODULE

## CLUSTERING MODELS

In [None]:
# PCA Method for Visualization
# Same projection settings (two components, random_state=42) as used for the clustering itself
df_pca = Cluster(X, n_clusters).PCAMethod(pca_random_state=42)
# Attach the labels; columns 0 and 1 remain the two principal components
df_pca['SENTIMENT SCORE'] = y

In [None]:
# Scatter Plot of Modified K-Means
# Points: the two PCA components, coloured by cluster label
plt.scatter(x=df_pca[0], y=df_pca[1], c=modkmeans.labels_)
# Black crosses: the fitted cluster centres
plt.scatter(x=modkmeans.cluster_centers_.T[0], y=modkmeans.cluster_centers_.T[1], c='Black', marker='x')
plt.title('Tweets Dataset Modified K-Means Clustering (PCA & Percentile)', fontsize=12)
plt.grid()

### SUMMARY OF METRICS

In [None]:
# Modified K-Means (PCA and Percentile) Silhouette Score, Calinski-Harabasz Index, and Davies-Bouldin Index
# Scored on the two PCA components only (the 'SENTIMENT SCORE' column of df_pca is excluded)
print('Modified K-Means (PCA and Percentile) Metrics')
print('Silhouette Score =', silhouette_score(df_pca[[0, 1]], labels=modkmeans.labels_))
print('Calinski-Harabasz Index =', calinski_harabasz_score(df_pca[[0, 1]], labels=modkmeans.labels_))
print('Davies-Bouldin Index =', davies_bouldin_score(df_pca[[0, 1]], labels=modkmeans.labels_))

# Modified K-Means (Weighted Average) Silhouette Score, Calinski-Harabasz Index, and Davies-Bouldin Index
# Scored on `means`, the single-column data the weighted-average model was fitted on
print('\nModified K-Means (Weighted Average) Metrics')
print('Silhouette Score =', silhouette_score(means, labels=modkmeans2.labels_))
print('Calinski-Harabasz Index =', calinski_harabasz_score(means, labels=modkmeans2.labels_))
print('Davies-Bouldin Index =', davies_bouldin_score(means, labels=modkmeans2.labels_))

## CLASSIFIER MODELS

In [None]:
# Prediction data for each base estimator
# `predictions` is cluster-major (index = cluster * n_classifiers + classifier).
# The comprehension reorders it to classifier-major (index = classifier * n_clusters + cluster)
# and the ensemble predictions (one per cluster) are appended as the last group.
predictions_BE = [predictions[x] for i in range(n_classifiers) for x in range(n_clusters*n_classifiers) if i == x % n_classifiers] + predictions_ENS
# Classifier names in the same group order, with 'ENS' last
predictions_NAMES = [classifiers[x][0] for x in range(n_classifiers)]
predictions_NAMES.append('ENS')

### CONFUSION MATRIX

In [None]:
def confusion_matrix_array(conf_matrix, Test_Y, predictions, n_clusters, classifier_n):
    """Fill ``conf_matrix`` with one confusion matrix per cluster for a single classifier.

    Parameters
    ----------
    conf_matrix : list
        List with at least ``n_clusters`` slots; overwritten in place.
    Test_Y : sequence
        True labels of the test set of every cluster.
    predictions : sequence
        Predictions in classifier-major order, i.e. the entry of classifier
        ``c`` on cluster ``k`` is at index ``c * n_clusters + k``.
    n_clusters : int
    classifier_n : int
        Position of the classifier inside ``predictions``.

    Returns
    -------
    list
        The same ``conf_matrix`` list.
    """
    # Read the data from the arguments; the notebook globals of the same
    # meaning were previously used and the parameters ignored.
    offset = n_clusters*classifier_n
    for x in range(n_clusters):
        conf_matrix[x] = confusion_matrix(np.array(Test_Y[x]), predictions[x+offset])
    return conf_matrix

def confusion_matrix_heatmap(conf_matrix, c_algo):
    """Draw the confusion matrices of one classifier as heatmaps on a 2 x 3 grid, one panel per cluster.

    Uses the notebook-level ``n_clusters`` and ``targets_str``.
    """
    fig, grid = plt.subplots(nrows=2, ncols=3, figsize=(14, 8.5), layout='tight', dpi=80)
    # Row-major list of the six axes
    cx = [axis for grid_row in grid for axis in grid_row]

    for x in range(n_clusters):
        panel = sns.heatmap(conf_matrix[x], annot=True, fmt='d', ax=cx[x])
        cx[x] = panel
        panel.xaxis.set_ticklabels(targets_str)
        panel.yaxis.set_ticklabels(targets_str)
        panel.set(xlabel='Predicted label', ylabel='True label')
        panel.set_title('Matrix_' + c_algo + '_CLUSTER_' + str(x), fontsize=15)

In [None]:
# Classifier Algorithms Confusion Matrix
# One slot per cluster; every slot is replaced by confusion_matrix_array()
conf_matrix = [[]] * n_clusters

# Confusion Matrix
# n_classifiers base estimators plus the ensemble (last entry of predictions_NAMES)
for x in range(n_classifiers+1):
    conf_matrix = confusion_matrix_array(conf_matrix, Test_Y_BE, predictions_BE, n_clusters, x)
    confusion_matrix_heatmap(conf_matrix=conf_matrix, c_algo=predictions_NAMES[x])

# POST-MODULES

## MODEL IMPLEMENTATION

In [None]:
# Input entry/entries of strings
user_input = input('Input your sentiment: ')
# `columns` must be an ordered list-like; a set literal ({'text'}) is not one
user_input = pd.DataFrame([user_input], columns=['text'])
display(user_input)

# Get hashtags list (written by the DATA INITIALIZATION section)
hashtags_testing = pd.read_csv(os.path.join(PATH, 'hashtags_list.csv'), sep=',', header=None)
hashtags_list_testing = flatten(hashtags_testing.values)

# Preprocess entry
process_input = Preprocess(user_input)
process_input.lower()
process_input.cleaning_a()
process_input.cleaning_b()
process_input.tokenization()
data_frame = process_input.lemmatization(hashtags_list_testing)

# Read vocabulary (written by save_feature_names during vectorization)
df_vocabulary = pd.read_csv(os.path.join(PATH, 'feature_names.csv'), sep=',', header=None)
vocabulary = flatten(df_vocabulary.values)

# Check if vocabulary is empty
if data_frame['text_preprocessed'][0] == '[]':
    print('Empty vocabulary, your input likely only contain stop words.')
else:
    # Tfidf Vectorizer
    # NOTE(review): the vectorizer is re-fitted on the single input, so its IDF
    # weights come from this one document rather than from the training corpus
    # - confirm this is intended, or persist and reuse the fitted vectorizer.
    tfidf_testing = TfidfVectorizer(vocabulary=vocabulary)
    x_testing = tfidf_testing.fit_transform(data_frame['text_preprocessed'])

    # Normalize data
    # NOTE: this rebinds `df`, which held the raw dataset in the cells above.
    df = pd.DataFrame(normalize(x_testing).toarray(), columns=tfidf_testing.get_feature_names_out())

    if (df.T[0] == 0.0).all():
        print('Invalid input or your sentiment contains unrecognized text.')

    else:
        # Load ensemble model
        # NOTE: joblib.load unpickles the file - only load model files you created yourself.
        for x in range(n_clusters):
            ensemble_testing = joblib.load('models/ENS_Cluster' + str(x) +'.pkl')

            # Predict sample
            print(f'ENS Cluster {str(x)} predicted {ensemble_testing.predict(df)}')