In [1]:
import os
import math as m
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

### Calculating TF-IDF Vectors

In [2]:
def load_docIDs():
    """Return the sorted integer IDs of the documents in ``ResearchPapers``.

    The ``ResearchPapers`` directory is looked up under the current working
    directory, and every ``<integer>.txt`` file in it contributes one ID.
    Entries without a ``.txt`` extension are ignored.

    Returns:
        list[int]: the document IDs in ascending order.

    Raises:
        FileNotFoundError: if the ``ResearchPapers`` directory does not exist.
        ValueError: if a ``.txt`` file name is not an integer.
    """
    # os.path.join keeps the path portable; a hard-coded backslash only
    # works on Windows.
    papers_dir = os.path.join(os.getcwd(), 'ResearchPapers')

    doc_ids = []
    for entry in os.listdir(papers_dir):
        # splitext removes the extension as a suffix; str.rstrip('.txt')
        # would instead strip any trailing '.', 't' or 'x' characters.
        stem, extension = os.path.splitext(entry)
        if extension != '.txt':
            continue
        doc_ids.append(int(stem))

    doc_ids.sort()
    return doc_ids

def load_stopwords():
    """Read the stopword list from ``Stopword-List.txt``.

    Each line of the file holds one stopword. Surrounding whitespace is
    removed and blank lines are skipped, so the result never contains
    empty strings.

    Returns:
        list[str]: the stopwords in file order.

    Raises:
        FileNotFoundError: if ``Stopword-List.txt`` is not in the current
            working directory.
    """
    # The context manager closes the file even if reading fails.
    with open('Stopword-List.txt', 'r') as f:
        stripped_lines = (line.strip() for line in f)
        return [word for word in stripped_lines if word]


def calc_TF(total_tokens):
    """Build the log-scaled term-frequency matrix.

    Args:
        total_tokens: a list of token lists. The list at position ``i``
            holds the tokens of the ``i``-th document ID returned by
            ``load_docIDs()``.

    Returns:
        pandas.DataFrame: one row per term and one column per document ID.
        A cell holds ``1 + log10(count)`` when the term occurs in the
        document and ``0`` otherwise.

    Raises:
        IndexError: if ``total_tokens`` has more entries than there are
            document IDs.
    """
    porter_stemmer = PorterStemmer()  # initialize the stemmer
    doc_ids = load_docIDs()  # position i in total_tokens <-> doc_ids[i]
    tf = {}  # term -> {docID: raw count}

    for i, tokens in enumerate(total_tokens):
        doc_id = doc_ids[i]
        for word in tokens:
            # NOTE(review): preprocessing() already stems its tokens, so
            # tokens coming from there are stemmed a second time here.
            word = porter_stemmer.stem(word).rstrip("'")  # drop trailing apostrophes
            if not word:
                # An empty term carries no information; indexing word[-1]
                # on it would raise IndexError.
                continue
            postings = tf.setdefault(word, {})
            postings[doc_id] = postings.get(doc_id, 0) + 1

    # Replace the raw counts with log-scaled weights. Only existing keys
    # are reassigned, so the dict size does not change while iterating.
    for postings in tf.values():
        for doc_id, count in postings.items():
            # log10 is exact for powers of ten, unlike log(x, 10).
            postings[doc_id] = 1 + m.log10(count)

    # DataFrame(dict of dicts) puts terms on the columns; transpose so that
    # terms are rows and docIDs are columns, then mark absent terms with 0.
    tf = pd.DataFrame(tf).transpose()
    tf.fillna(0, inplace=True)

    print("Term Frequency Weights created")
    return tf


def calc_IDF(tf):
    """Compute the inverse document frequency of every term.

    Args:
        tf: term-frequency DataFrame with one row per term and one column
            per document. A cell is non-zero exactly when the term occurs
            in that document.

    Returns:
        pandas.DataFrame: a single row (index ``0``) with one column per
        term, holding ``log10(N / df)``. ``N`` is the number of documents
        reported by ``load_docIDs()`` and ``df`` is the number of documents
        that contain the term.
    """
    n_docs = len(load_docIDs())

    # Count the non-zero cells of each row directly. Taking the first entry
    # of value_counts() assumes that 0 is the most frequent value, which is
    # wrong for terms that occur in most documents.
    doc_freq = (tf != 0).sum(axis=1)

    idf = {}
    for term, df_t in zip(tf.index, doc_freq):
        # A term with no occurrences has an all-zero TF row, so its weight
        # is 0 whatever the IDF is; this avoids dividing by zero.
        idf[term] = m.log10(n_docs / df_t) if df_t else 0.0

    idf = pd.DataFrame(idf, index=[0])  # one row, terms as columns

    print("Inverse Document Frequency Weights calculated")
    return idf


def _clean_pieces(token, strip_chars):
    """Yield the normalized sub-tokens of ``token``.

    The token is stripped of ``strip_chars`` at both ends and case-folded.
    If it still contains a ``.`` or ``-`` it is split there and every part
    is cleaned the same way.
    """
    token = token.strip(strip_chars).casefold()
    for separator in ('.', '-'):
        if separator in token:
            for piece in token.split(separator):
                yield from _clean_pieces(piece, strip_chars)
            return
    yield token


def preprocessing():
    """Tokenize, clean and stem every document in ``ResearchPapers``.

    For each document ID from ``load_docIDs()`` the matching ``<id>.txt``
    file is tokenized line by line. Stopwords and tokens longer than 45
    characters are discarded. The remaining tokens are stripped of leading
    and trailing digits and symbols, case-folded and split on ``.`` and
    ``-``. Only purely alphabetic pieces of at least two characters that
    are not stopwords are kept, and these are stemmed.

    Every token is processed exactly once. Deleting elements from the list
    while advancing an index over it would skip the element that follows
    each deletion.

    Returns:
        list[list[str]]: one list of stemmed terms per document, in the
        same order as ``load_docIDs()``.
    """
    doc = load_docIDs()  # get the docIDs
    stopwords = set(load_stopwords())  # set gives O(1) membership tests
    stemmer = PorterStemmer()  # create a stemmer object
    # Digits and symbols removed from both ends of a token.
    strip_chars = '0123456789!@#$%^&*()-_=+[{]}\\|;:\'",<.>/?`~'

    total_tokens = []  # total_tokens[i] holds the terms of doc[i]
    for doc_id in doc:
        with open('ResearchPapers/' + str(doc_id) + '.txt', 'r') as f:
            raw_tokens = [token for line in f for token in word_tokenize(line)]

        terms = []
        for token in raw_tokens:
            if token in stopwords or len(token) > 45:
                continue
            for piece in _clean_pieces(token, strip_chars):
                if piece.isalpha() and piece not in stopwords and len(piece) >= 2:
                    terms.append(stemmer.stem(piece))
        total_tokens.append(terms)

    return total_tokens


def calc_TFIDF(TF, IDF):
    """Combine term-frequency and IDF weights into TF-IDF document vectors.

    Args:
        TF: DataFrame with one row per term and one column per document.
        IDF: single-row DataFrame (index ``0``) with one column per term.

    Returns:
        pandas.DataFrame: float matrix with one row per document and one
        column per term, where each cell is ``TF[term, doc] * IDF[term]``.

    Raises:
        KeyError: if a term of ``TF`` has no column in ``IDF``.
    """
    # Look up one IDF weight per TF row. A missing-value (NaN) row label is
    # looked up under the column name 'nan'.
    idf_weights = np.array(
        [IDF.loc[0, 'nan' if pd.isna(term) else term] for term in TF.index],
        dtype=float,
    )

    # Scale every row by its term's weight in a single vectorized step. This
    # also keeps the result numeric instead of object-typed.
    weighted = TF.to_numpy(dtype=float) * idf_weights[:, np.newaxis]
    vector_space = pd.DataFrame(weighted, index=TF.index, columns=TF.columns)

    # One row per document, one column per term.
    vector_space = vector_space.transpose()

    print("TF-IDF Weights calculated")
    return vector_space


def save_weights():
    """Run the whole TF-IDF pipeline and write the result to ``tf-idf.csv``.

    The documents are preprocessed, then the TF weights, the IDF weights and
    finally the TF-IDF matrix are computed. The matrix is written to CSV
    together with its index.
    """
    term_freq = calc_TF(preprocessing())  # processed tokens -> TF weights
    inverse_doc_freq = calc_IDF(term_freq)  # TF weights -> IDF weights
    weights = calc_TFIDF(term_freq, inverse_doc_freq)

    weights.to_csv('tf-idf.csv')  # the index is written as the first column
    print("TF-IDF Weights saved")

In [3]:
# Build the TF-IDF weights only when no cached 'tf-idf.csv' exists yet.
if os.path.isfile('tf-idf.csv'):
    print("Weights are already calculated")
else:
    save_weights()

Term Frequency Weights created
Inverse Document Frequency Weights calculated
TF-IDF Weights calculated
TF-IDF Weights saved


### Text-Classification

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.cluster import contingency_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, silhouette_score, adjusted_rand_score
# Select the 'ggplot' matplotlib style sheet for the figures drawn below.
plt.style.use('ggplot')

In [5]:
def extract_weights():
    """Load the saved TF-IDF matrix from ``tf-idf.csv``.

    The first CSV column is used as the DataFrame index.
    """
    return pd.read_csv('tf-idf.csv', index_col=0)

In [6]:
def add_labels(tf_idf):
    """Attach the topic label of every document as a ``label`` column.

    Args:
        tf_idf: DataFrame indexed by document ID. The IDs may be integers
            (as read back from CSV) or their string form.

    Returns:
        pandas.DataFrame: the same ``tf_idf`` object, modified in place,
        with a ``label`` column holding 1-5. Documents that belong to no
        known topic get a missing value.
    """
    topic_docs = {
        1: ('1', '2', '3', '7'),  # "Explainable Artificial Intelligence"
        2: ('8', '9', '11'),  # "Heart Failure"
        3: ('12', '13', '14', '15', '16'),  # "Time Series Forecasting"
        4: ('17', '18', '21'),  # "Transformer Model"
        5: ('22', '23', '24', '25', '26'),  # "Feature Selection"
    }

    label_of = {}  # document ID (as str) -> topic label
    for label, doc_ids in topic_docs.items():
        for doc_id in doc_ids:
            label_of[doc_id] = label

    # Compare on the string form so that integer and string indexes both
    # match, and give each row its own label instead of overwriting the
    # whole column.
    tf_idf['label'] = [label_of.get(str(index)) for index in tf_idf.index]
    return tf_idf

In [7]:
def training_model(tf_idf):
    """Fit a 5-nearest-neighbours classifier on a labelled TF-IDF matrix.

    Args:
        tf_idf: DataFrame with the feature columns plus a ``label`` column.

    Returns:
        tuple: ``(model, X_test, y_test)``, the fitted classifier and the
        held-out 20% of the data for evaluation.
    """
    features = tf_idf.drop('label', axis=1)
    targets = tf_idf['label'].apply(int)  # labels as integers

    # 80% training / 20% testing, with a fixed seed so the split is repeatable
    split = train_test_split(features, targets, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split

    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)

    return knn, X_test, y_test

In [8]:
def evaluation_metric(model, X_test, y_test):
    """Score a fitted classifier on the held-out data.

    Args:
        model: fitted estimator exposing ``predict``.
        X_test: held-out features.
        y_test: true labels for ``X_test``.

    Returns:
        dict: ``accuracy``, ``recall``, ``f1`` and ``precision``, in that
        order. Everything except accuracy is weighted-averaged over classes.
    """
    predictions = model.predict(X_test)

    return {
        'accuracy': accuracy_score(y_test, predictions),
        'recall': recall_score(y_test, predictions, average='weighted'),
        'f1': f1_score(y_test, predictions, average='weighted'),
        'precision': precision_score(y_test, predictions, average='weighted'),
    }

### Text Clustering

In [9]:
def plot_cluster(df, max_k):
    """Draw an elbow plot for choosing the number of KMeans clusters.

    A KMeans model is fitted for every even cluster count from 2 up to
    ``max_k`` inclusive, and its inertia (sum of squared errors) is plotted
    against the cluster count.

    Args:
        df: feature matrix to cluster.
        max_k: largest cluster count to try.
    """
    cluster_counts = range(2, max_k + 1, 2)

    inertias = [
        KMeans(n_clusters=n_clusters, random_state=42).fit(df).inertia_
        for n_clusters in cluster_counts
    ]

    plt.plot(cluster_counts, inertias, marker='o')

In [10]:
def model_train(df, n_clusters=19):
    """Cluster the rows of ``df`` with KMeans.

    Args:
        df: feature matrix with one row per document. It is modified in
            place: a ``cluster`` column with the assigned cluster is added.
        n_clusters: number of clusters to form. Defaults to 19, the value
            this function used before the parameter existed.

    Returns:
        tuple: ``(kmeans, df)``, the fitted model and the same DataFrame
        with its new ``cluster`` column.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)  # fixed seed: repeatable runs
    y_predict = kmeans.fit_predict(df)  # fit the model and predict the clusters
    df['cluster'] = y_predict  # added only after fitting, so it is not a feature
    return kmeans, df

In [11]:
def purity_score(y_true, y_pred):
    """Return the purity of a clustering.

    Purity is the fraction of samples that belong to the majority true
    class of their cluster.

    Args:
        y_true: true class labels.
        y_pred: predicted cluster assignments.
    """
    table = contingency_matrix(y_true, y_pred)  # rows: classes, columns: clusters
    majority_per_cluster = np.max(table, axis=0)  # largest class count in each cluster
    return np.sum(majority_per_cluster) / np.sum(table)


def clust_metrics(df, y_predict):
    """Evaluate a clustering against the known topic labels.

    Args:
        df: DataFrame of document features, indexed by document ID. It may
            already carry a ``cluster`` column. ``add_labels`` is applied
            to it, which adds a ``label`` column.
        y_predict: cluster assignment of every row of ``df``.

    Returns:
        dict: ``"Purity"``, ``"Silhouette Score"`` and ``"Random Index"``
        (the adjusted Rand index), in that order.
    """
    df = add_labels(df)

    # The silhouette score must be computed on the features only. Leaving
    # the label and cluster columns in would leak the answers into the
    # distance computation.
    features = df.drop(columns=['label', 'cluster'], errors='ignore')

    return {
        "Purity": purity_score(df['label'], y_predict),
        "Silhouette Score": silhouette_score(features, y_predict),
        "Random Index": adjusted_rand_score(df['label'], y_predict),
    }

### GUI

In [19]:
import tkinter as tk
from tkinter import ttk, Canvas # for combobox

In [20]:
def classification_train_and_eval():
    """Train the KNN classifier on the saved weights and return its metrics.

    Returns:
        dict: the classification metrics produced by ``evaluation_metric``.
    """
    labelled = add_labels(extract_weights())
    model, X_test, y_test = training_model(labelled)
    return evaluation_metric(model, X_test, y_test)

# Function to perform clustering
def clustering_train_and_eval():
    """Cluster the saved weights with KMeans and return the metrics.

    Returns:
        dict: the clustering metrics produced by ``clust_metrics``.
    """
    _, clustered = model_train(extract_weights())
    return clust_metrics(clustered, clustered['cluster'])

def perform_classification():
    """Hide the selection window and show the classification metrics."""
    close_window()
    display_metrics("Classification")

def perform_clustering():
    """Hide the selection window and show the clustering metrics."""
    close_window()
    display_metrics("Clustering")

def close_window():
    """Hide the module-level ``root`` window without destroying it."""
    root.withdraw()

def display_metrics(algorithm):
    """Run the chosen algorithm and show its metrics in a new window.

    The module-level ``root`` window is hidden while the metrics are being
    computed and destroyed once the result window has been built. The call
    then blocks in the new window's event loop until that window is closed.

    Args:
        algorithm: ``"Classification"`` or ``"Clustering"``.

    Raises:
        ValueError: if ``algorithm`` is not one of the two supported names.
    """
    runners = {
        "Classification": classification_train_and_eval,
        "Clustering": clustering_train_and_eval,
    }
    # Validate before touching any window, so a bad name does not leave the
    # application hidden with nothing to display.
    if algorithm not in runners:
        raise ValueError(f"Unknown algorithm: {algorithm!r}")

    close_window()
    metrics = runners[algorithm]()

    new_window = tk.Tk()
    new_window.title(f"{algorithm} Metrics")

    # Display metrics in the new window, one label per row
    for row, (metric_name, metric_value) in enumerate(metrics.items()):
        ttk.Label(new_window, text=f"{metric_name}: {metric_value}").grid(row=row, column=0, padx=5, pady=2, sticky="w")

    # Add an exit button below the last metric
    ttk.Button(new_window, text="Exit", command=new_window.destroy).grid(row=len(metrics), column=0, padx=5, pady=5)
    root.destroy()

    new_window.mainloop()


# Main selection window; close_window() and display_metrics() use this
# module-level name.
root = tk.Tk()
root.title("Text Classification and Clustering")

# Container frame holding the label, the combobox and the button.
algorithm_frame = ttk.Frame(root)
algorithm_frame.pack(padx=10, pady=10)

# Row 0: caption and the algorithm chooser.
algorithm_label = ttk.Label(algorithm_frame, text="Select Algorithm:")
algorithm_label.grid(row=0, column=0, padx=5, pady=5, sticky="w")
# NOTE(review): no state= is passed, so the box is presumably editable and
# may hold text other than the two listed values -- confirm.
algorithm_selection = ttk.Combobox(algorithm_frame, values=["Classification", "Clustering"])
algorithm_selection.grid(row=0, column=1, padx=5, pady=5)
algorithm_selection.set("Classification")  # Set default selection

# Row 1: run the selected algorithm. perform_task is defined further down;
# the lambda only looks it up when the button is clicked.
perform_button = ttk.Button(algorithm_frame, text="Perform", command=lambda: perform_task(algorithm_selection.get()))
perform_button.grid(row=1, column=1, padx=5, pady=5)

def perform_task(selected_algorithm):
    """Dispatch the combobox selection to the matching handler.

    Any value other than ``"Classification"`` or ``"Clustering"`` is
    ignored.
    """
    handlers = {
        "Classification": perform_classification,
        "Clustering": perform_clustering,
    }
    handler = handlers.get(selected_algorithm)
    if handler is not None:
        handler()

# Start the Tk event loop of the selection window.
root.mainloop()