#  Functions for GitHub Repository: Automated-Paper-Clustering-for-Conferences

This notebook contains code for extracting, preprocessing, clustering, and visualizing papers from conference proceedings.


<pre>

## Function to Write Texts Extracted from Papers

This function extracts the text of each PDF file and writes it to a text file.

In [None]:
import os
from os import mkdir
from pdfminer.high_level import extract_text  # Ensure pdfminer is installed

# Function to write texts extracted from papers into text files
def write_texts(papers, folder_papers, folder_texts):
    """Extract the text of each PDF and save it as a UTF-8 ``.txt`` file.

    Args:
        papers: File names of the PDFs, relative to ``folder_papers``.
        folder_papers: Folder that contains the PDF files.
        folder_texts: Folder where the text files are written; it is created
            when it does not exist yet.

    Returns:
        The names from ``papers`` that could not be processed.
    """
    problem_files = []  # Names of the files that raised an error

    # exist_ok lets the function be re-run on a folder left by a previous run
    os.makedirs(folder_texts, exist_ok=True)

    for paper in papers:
        # Explicit paths instead of os.chdir(): the working directory stays
        # untouched, so relative folders keep resolving for every paper.
        source_path = os.path.join(folder_papers, paper)
        target_path = os.path.join(folder_texts, paper + '.txt')
        try:
            text = extract_text(source_path)  # pdfminer does the extraction
            with open(target_path, 'w', encoding='utf-8') as file:
                file.write(text)
        except Exception as e:
            # Best effort: report the failure and carry on with the next paper
            print(f"Error processing file {paper}: {str(e)}")
            problem_files.append(paper)

    return problem_files

## Function to Extract Paper Titles


This function extracts paper titles from text files.

In [None]:
# Function to extract paper titles from text files in a specified folder.
def papers_names(folder_texts):
    """Return the title of every text file found in ``folder_texts``.

    The title is taken to be everything before the first blank line of the
    file, with line breaks removed and surrounding whitespace stripped.

    Args:
        folder_texts: Folder that contains the extracted text files.

    Returns:
        A list with one title per file, in ``os.listdir()`` order.
    """
    files_names = []

    for file_name in os.listdir(folder_texts):
        with open(os.path.join(folder_texts, file_name), 'r', encoding='utf-8') as file:
            text = file.read()

        # partition() yields the whole text when there is no blank line.
        # Slicing with the result of find() would use -1 in that case and
        # silently drop the last character of the title.
        title_block = text.partition("\n\n")[0]
        files_names.append(title_block.replace("\n", "").strip())

    return files_names

## Text Preprocessing Functions


These functions preprocess the extracted text by removing noise and applying lemmatization.

In [None]:
# Text preprocessing helper
# True only when every character of the word is a lowercase letter a-z

def abc(w):
    """Tell whether ``w`` is made up solely of the letters a-z.

    An empty string has no offending character and therefore yields True.
    """
    return all('a' <= char <= 'z' for char in w)

In [None]:
import string
from string import digits

# Text preprocessing: drops the (cid:nº) markers, the word 'fig' and every word rejected by abc
# Digits are removed first so that each marker collapses to '(cid:)', which is then
# deleted before the punctuation is stripped and the marker stops being recognisable

def prep_text(text):
    """Lower-case ``text`` and keep only its plain a-z words.

    Returns the surviving words joined by single spaces.
    """
    drop_digits = str.maketrans('', '', digits)
    drop_punctuation = str.maketrans('', '', string.punctuation)

    lowered = text.lower()
    without_cid = lowered.translate(drop_digits).replace('(cid:)', '')  # 'CID' is an identifier
    cleaned = without_cid.translate(drop_punctuation)

    kept_words = []
    for word in cleaned.split():
        if word == 'fig':
            continue
        if abc(word):  # only words made of the letters a-z survive
            kept_words.append(word)
    return ' '.join(kept_words)

In [None]:
import nltk

# Run nltk.download('all') once, the first time NLTK is used in this environment, to fetch its resources;
# leave the line below commented out afterwards.
# nltk.download('all')

from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Function that maps NLTK POS tags to WordNet POS tags
def pos_tagger(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb
    respectively; any other tag falls back to noun.
    """
    # Name of the wordnet attribute selected by the first letter of the tag
    attribute_by_initial = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attribute = attribute_by_initial.get(nltk_tag[:1], 'NOUN')
    return getattr(wordnet, attribute)

# Function for lemmatizing text (reducing words to their base form)
def lemat(text):
    """Lemmatize every token of ``text`` and return them joined by spaces.

    The text is tokenized with NLTK, each token is POS-tagged, and the tag
    (converted by ``pos_tagger``) is passed to the WordNet lemmatizer.
    """
    lemmatizer = WordNetLemmatizer()
    # pos_tag returns (token, tag) pairs in the order of the tokens
    tagged = pos_tag(nltk.word_tokenize(text))
    return ' '.join(
        lemmatizer.lemmatize(token, pos_tagger(tag)) for token, tag in tagged
    )

## Function to Create Corpus


This function preprocesses the text and creates a corpus.

In [None]:
# Function to preprocess and create a corpus using either stemming or lemmatization
def txt_corpus(folder_texts, root_function):
    """Build the corpus: one preprocessed string per text file.

    Args:
        folder_texts: Folder that contains the extracted text files.
        root_function: Callable applied to each text after ``prep_text``
            (for example a stemming or a lemmatization function).

    Returns:
        A list of preprocessed texts, in ``os.listdir()`` order.
    """
    corpus = []

    # Files are read through explicit paths rather than after os.chdir():
    # the working directory is left alone and a relative folder still works.
    # The listing is deliberately left unsorted, as papers_names lists the
    # same folder with os.listdir() too and the rows must match its titles.
    for file_name in os.listdir(folder_texts):
        with open(os.path.join(folder_texts, file_name), 'r', encoding='utf-8') as file:
            text = file.read()
        text = prep_text(text)  # Remove noise
        corpus.append(root_function(text))  # Reduce words to their root form

    return corpus

## Function to Convert Corpus into TF-IDF Matrix


This function converts the corpus into a TF-IDF matrix.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF matrix of the corpus
# Each value weighs how important a word is in one text with respect to the whole corpus

def tfidf(corpus, parameters):
    """Vectorize ``corpus`` with a TfidfVectorizer configured by ``parameters``.

    ``parameters`` is a dict that is unpacked as keyword arguments of the
    vectorizer; the fitted-and-transformed matrix is returned.
    """
    return TfidfVectorizer(**parameters).fit_transform(corpus)

## Function to Plot Cosine Similarity Matrix


This function calculates and plots the cosine similarity matrix.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

def plot_cosine_similarity(matrix_tfidf):
    """Compute the pairwise cosine similarity, show it as a heatmap and return it."""
    similarities = cosine_similarity(matrix_tfidf)

    # Open a large figure first; the heatmap below is drawn with every cell annotated
    plt.subplots(figsize=(50, 50))
    sns.heatmap(similarities, annot=True, linewidths=.5, fmt='.2f', cmap="YlGnBu")
    plt.show()

    return similarities

## Function for Elbow Method


This function uses KMeans on the similarity matrix and plots Within-Cluster Sum of Squares (WCSS) to find optimal clusters.

In [None]:
# elbow_method fits KMeans on the similarity matrix for 1..n_clusters clusters
# and plots the WCSS of every fit so the "elbow" can be read off the curve.

def elbow_method(matrix_sim, n_clusters):
    """Plot the within-cluster sum of squares for 1 up to ``n_clusters`` clusters."""
    candidate_counts = range(1, n_clusters + 1)

    # inertia_ of a fitted KMeans model is its within-cluster sum of squares
    inertias = [KMeans(k).fit(matrix_sim).inertia_ for k in candidate_counts]

    plt.plot(candidate_counts, inertias)
    plt.title('The Elbow Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('Within-cluster Sum of Squares')
    plt.show()

## Function for Clustering


This function performs clustering and visualizes the results.

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
import csv

def _cap_cluster_sizes(matrix_sim, labels, centroids, max_elements_per_cluster):
    """Return labels in which no cluster exceeds ``max_elements_per_cluster``.

    Every oversized cluster keeps the points closest to its centroid; the
    remaining points are moved to the nearest cluster that still has room.
    """
    final_labels = np.array(labels)
    cluster_sizes = {}
    overflow = []  # Indices pushed out of an oversized cluster

    for i, centroid in enumerate(centroids):
        cluster_indices = np.where(labels == i)[0]
        if len(cluster_indices) > max_elements_per_cluster:
            # Rank the members by distance to the centroid and keep the closest ones
            distances = np.linalg.norm(matrix_sim[cluster_indices] - centroid, axis=1)
            ranked = cluster_indices[np.argsort(distances)]
            overflow.extend(ranked[max_elements_per_cluster:].tolist())
            cluster_sizes[i] = max_elements_per_cluster
        else:
            cluster_sizes[i] = len(cluster_indices)

    for idx in sorted(overflow):
        # Try the clusters from nearest to farthest centroid, not in index order
        distances = np.linalg.norm(centroids - matrix_sim[idx], axis=1)
        for candidate in np.argsort(distances):
            candidate = int(candidate)
            if cluster_sizes[candidate] < max_elements_per_cluster:
                final_labels[idx] = candidate
                cluster_sizes[candidate] += 1
                break
        # When no cluster has room the point simply keeps its K-Means label

    return final_labels.tolist()


def clustering(matrix_sim, n_clusters, papers_titles, max_elements_per_cluster, folder_clusters):
    """Cluster the papers with K-Means under a per-cluster size limit.

    The result is plotted, scored with the silhouette coefficient and written
    to ``clusters_papers_<n_clusters>_<max_elements_per_cluster>.csv`` inside
    ``folder_clusters``.

    Args:
        matrix_sim: Similarity matrix, one row per paper. Assumed to be a 2-D
            NumPy array (it is indexed with ``[:, 0]`` and with index arrays).
        n_clusters: Requested number of clusters; lowered to
            ``len(papers_titles) - 1`` when it is not smaller than the number
            of papers.
        papers_titles: Paper titles, in the same order as the matrix rows.
        max_elements_per_cluster: Size limit per cluster; raised when it is
            too small to hold all the papers.
        folder_clusters: Existing folder where the CSV file is written.

    Returns:
        A tuple ``(clusters_visual, silhouette)`` with the scatter plot object
        and the silhouette score.

    Raises:
        ValueError: If fewer than one cluster remains after the adjustment.
    """
    num_papers = len(papers_titles)

    if n_clusters >= num_papers:
        n_clusters = num_papers - 1
        print("Warning: Number of clusters is high. Adjusting to maximum possible value:", n_clusters)

    # Fail with a clear message instead of a ZeroDivisionError a few lines below
    if n_clusters < 1:
        raise ValueError(
            f"At least one cluster is required, got {n_clusters} for {num_papers} paper(s)."
        )

    if max_elements_per_cluster < (num_papers / n_clusters):
        max_elements_per_cluster = int(num_papers / n_clusters) + 1
        print("Warning: Maximum elements per cluster is low. Adjusting to minimum possible value:", max_elements_per_cluster)

    # Execute clustering
    kmeans = KMeans(n_clusters=n_clusters).fit(matrix_sim)
    centroids = kmeans.cluster_centers_

    # Enforce the size limit on top of the K-Means assignment
    labels = _cap_cluster_sizes(matrix_sim, kmeans.labels_, centroids, max_elements_per_cluster)

    # Visualize the results (only the first two columns of the matrix are plotted)
    plt.figure(figsize=(5, 5))
    clusters_visual = plt.scatter(matrix_sim[:, 0], matrix_sim[:, 1], c=labels, s=50, cmap='rainbow')
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=200, c=range(len(centroids)), cmap='rainbow')
    plt.colorbar(clusters_visual)
    plt.show()

    # Silhouette score to evaluate clustering
    silhouette = silhouette_score(matrix_sim, labels)
    print('Silhouette score =', silhouette)

    # The file name encodes both settings, so different runs do not overwrite each other
    csv_name = f'clusters_papers_{n_clusters}_{max_elements_per_cluster}.csv'

    # Write through an explicit path so the working directory is not changed
    csv_path = os.path.join(folder_clusters, csv_name)
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Paper Title', 'Cluster'])
        for title, cluster in zip(papers_titles, labels):
            writer.writerow([title, cluster])

    # Return the plot and silhouette score
    return clusters_visual, silhouette

<pre>

## Main Execution


The main execution section prompts the user for input paths, performs text extraction, preprocessing, clustering, and visualization.

In [None]:
# Folders supplied by the user: source PDFs, extracted texts and cluster files
folder_papers = input("Please provide the path to the papers folder: ")
folder_texts = input("Please provide the path to save the extracted texts: ")
folder_clusters = input("Please provide the path to save the clusters of paper titles:")

In [None]:
# input() already returns the text exactly as typed (backslashes included), so no
# raw-string handling is needed. The folders are resolved against the current
# working directory once, so they keep pointing to the same place even if the
# working directory changes later on.
folder_papers = os.path.abspath(folder_papers)
folder_texts = os.path.abspath(folder_texts)
folder_clusters = os.path.abspath(folder_clusters)

# Only PDF files are handed to the extractor; other directory entries are skipped
entries = os.listdir(folder_papers)
papers = [name for name in entries if name.lower().endswith('.pdf')]
skipped = len(entries) - len(papers)
if skipped:
    print(f"Skipping {skipped} entries that are not PDF files.")

# Extract texts from papers and write them to text files
# Notify about problematic PDFs
problems = write_texts(papers, folder_papers, folder_texts)
if problems:
    print("Problematic files:")
    for problem in problems:
        print(problem)

num_pdfs = len(papers)
num_problems = len(problems)
num_texts = num_pdfs - num_problems
print(f"The number of PDFs is {num_pdfs}, the number of extracted texts is {num_texts}, and the number of problematic PDFs is {num_problems}.")

In [None]:
# Titles of the papers, one per extracted text file
papers_titles = papers_names(folder_texts)

# Corpus of preprocessed texts, lemmatized with lemat
corpus = txt_corpus(folder_texts, lemat)

# TF-IDF settings, unpacked into the vectorizer by tfidf()
parameters = dict(
    max_df=0.7,
    min_df=0.2,
    stop_words='english',
    max_features=50,
    sublinear_tf=True,
)
matrix_tfidf = tfidf(corpus, parameters)

# Cosine similarity between the papers (the call also displays the heatmap)
matrix_sim = plot_cosine_similarity(matrix_tfidf)

In [None]:
# Ask for the number of clusters; the elbow plot in the next cell receives this
# value and evaluates every cluster count from 1 up to it
n_clusters = int(input("Please enter the desired number of clusters: "))

In [None]:
# Plot the WCSS obtained with KMeans on the similarity matrix for each candidate
# number of clusters, to help choose a suitable value.
elbow_method(matrix_sim=matrix_sim, n_clusters=n_clusters)

In [None]:
# Create the folder for the cluster files; exist_ok allows this cell to be re-run
os.makedirs(folder_clusters, exist_ok=True)

while True:
    try:
        # Input for the number of clusters
        n_clusters = int(input("Please enter the desired number of clusters: "))

        # Input for the maximum number of elements per cluster
        max_elements_per_cluster = int(input("Please enter the maximum number of elements per cluster: "))
    except ValueError:
        # A non-numeric answer restarts the prompts instead of ending the session
        print("Please enter whole numbers only.")
        continue

    clusters_visual, silhouette = clustering(matrix_sim, n_clusters, papers_titles, max_elements_per_cluster, folder_clusters)

    # Ask the user if they want to try another number of clusters and maximum number of elements.
    another_try = input("Do you want to try another number of clusters and maximum number of elements? (y/n): ").strip().lower()

    if another_try != 'y':
        break