# Functions for GitHub Repository: Automated-Paper-Clustering-for-Conference

This notebook contains functions for generating word clouds, loading tracks, generating combinations of papers from the same and different tracks, calculating similarity scores, and tuning hyperparameters.

In the context of this notebook, a "track" refers to a predefined grouping or category of papers. These tracks are often organized based on common themes, topics, or conference sessions. The functions provided here are designed to analyze and compare papers within these tracks, as well as across different tracks.

## Function to Generate Word Cloud

This function generates a word cloud for the text of a specified paper.

In [None]:
import itertools
import os
import pickle
import statistics

import matplotlib.pyplot as plt
from wordcloud import WordCloud

def word_cloud(folder_texts, i):
    """
    Display a word cloud built from the text of one paper.

    Args:
    - folder_texts (str): Path to the folder containing the text files.
    - i (int): Position of the file in the folder listing returned by os.listdir.

    Returns:
    - None. The word cloud is shown in a matplotlib figure.
    """
    # List all files in the text files directory
    files = os.listdir(folder_texts)

    # Build the full path of the i-th file instead of changing the working
    # directory: this also works for relative paths and leaves the process
    # state untouched
    file_path = os.path.join(folder_texts, files[i])

    # Read the whole content of the selected file
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Create a WordCloud based on the text
    wordcloud = WordCloud().generate(text)

    # Display the WordCloud in a graphical representation
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")  # Turn off the axes for a cleaner presentation
    plt.show()  # Show the graphical representation of the WordCloud

### Track Analysis Functions

These functions are designed to analyze tracks of papers.

These functions perform various operations such as loading tracks from files, generating combinations of papers within the same track, generating combinations of papers from different tracks, calculating similarity scores between papers within and across tracks, and computing metrics based on these similarities.

In [None]:
def load_track(folder_tracks, i, name):
    """
    Load a track from a pickle file.

    Args:
    - folder_tracks (str): Path to the folder containing track files.
    - i (int): Index of the track (not used for loading; kept for the callers).
    - name (str): Name of the track file inside folder_tracks.

    Returns:
    - Loaded track (whatever object was pickled in the file).

    Note:
    - pickle.load can execute arbitrary code; only load track files you trust.
    """
    # Open the file by its full path rather than calling os.chdir: repeated
    # calls with a relative folder keep working and the process working
    # directory is left untouched
    track_path = os.path.join(folder_tracks, name)
    with open(track_path, "rb") as file:
        return pickle.load(file)

def combinations_same(folder_tracks):
    """
    Generate all combinations of pairs of papers within the same track.

    Every file found in folder_tracks is loaded with load_track and all
    unordered pairs of its elements are collected.

    Args:
    - folder_tracks (str): Path to the folder containing track files.

    Returns:
    - List of all combinations of pairs of papers within the same track.
    """
    comb_total = []
    # The folder is listed by path; no os.chdir is needed here (changing the
    # directory first made the listing fail for relative paths)
    names_tracks = os.listdir(folder_tracks)
    for i, name in enumerate(names_tracks):
        track = load_track(folder_tracks, i, name)
        # extend() consumes the iterator directly, no intermediate list needed
        comb_total.extend(itertools.combinations(track, 2))
    return comb_total

# Precompute all same-track paper pairs once; same_track() reads this module-level list.
# NOTE(review): the tracks folder is a hard-coded, machine-specific absolute path.
pairs_tracks = combinations_same(r'C:\Users\ana_s\OneDrive\Escritorio\tfg\archivos\tracks')

def combinations_diff(folder_tracks, papers):
    """
    Generate all combinations of pairs of papers from different tracks.

    For every track, each of its elements is paired with the position of
    every paper of the corpus that does not belong to that track.

    Args:
    - folder_tracks (str): Path to the folder containing track files.
    - papers (list): List of all paper names in the corpus.

    Returns:
    - List of (j, k) tuples, where j is an element of a track and k is the
      position in 'papers' of a paper outside that track.
    """
    comb_total = []
    # Positions are taken directly from the list length: papers.index(paper)
    # was a linear search per paper and returned the first position for
    # duplicated names
    paper_positions = range(len(papers))
    # The folder is listed by path; no os.chdir is needed here
    names_tracks = os.listdir(folder_tracks)
    for i, name in enumerate(names_tracks):
        track = load_track(folder_tracks, i, name)
        # Set membership makes the "is this paper in the track?" test O(1)
        members = set(track)
        outsiders = [k for k in paper_positions if k not in members]
        comb_total.extend((j, k) for j in track for k in outsiders)
    return comb_total

# Assuming 'papers' is defined somewhere before calling combinations_diff
# NOTE(review): '[...]' below is a placeholder (a one-element list holding Ellipsis);
# it must be replaced by the real list of paper names before this cell is useful.
papers = [...]  # List of all paper names in the corpus
# Precompute all cross-track pairs once; diff_track() reads this module-level list.
# NOTE(review): the tracks folder is a hard-coded, machine-specific absolute path.
pairs_diff_track = combinations_diff(r'C:\Users\ana_s\OneDrive\Escritorio\tfg\archivos\tracks', papers)

def same_track(matrix_sim):
    """
    Collect the similarity of every pair of papers sharing a track.

    The pairs are read from the module-level list 'pairs_tracks'.

    Args:
    - matrix_sim: Similarity matrix, indexed as matrix_sim[i][j].

    Returns:
    - List with one similarity value per pair, in the order of 'pairs_tracks'.
    """
    return [matrix_sim[first][second] for first, second in pairs_tracks]

def diff_track(matrix_sim):
    """
    Collect the similarity of every pair of papers from different tracks.

    The pairs are read from the module-level list 'pairs_diff_track'.

    Args:
    - matrix_sim: Similarity matrix, indexed as matrix_sim[i][j].

    Returns:
    - List with one similarity value per pair, in the order of 'pairs_diff_track'.
    """
    return [matrix_sim[first][second] for first, second in pairs_diff_track]

def metric(matrix_sim):
    """
    Score a similarity matrix by how well it separates tracks.

    The score is the median similarity of same-track pairs minus the median
    similarity of different-track pairs.

    Args:
    - matrix_sim: Similarity matrix.

    Returns:
    - Metric score (difference of the two medians).
    """
    # Median similarity inside tracks
    within_median = statistics.median(same_track(matrix_sim))

    # Median similarity across tracks
    across_median = statistics.median(diff_track(matrix_sim))

    return within_median - across_median

### Function for Hyperparameter Tuning

This cell defines the hyperparameter grid, a `tuning` function that scores a single parameter combination, and a loop that evaluates every combination and keeps the best-scoring one.

In [None]:
import numpy as np
from itertools import product
from sklearn.metrics.pairwise import cosine_similarity
import time

# Preprocessing parameters
# NOTE(review): 'stem' and 'lemmatize' are not defined in this section; presumably
# text-normalization helpers defined elsewhere — confirm they are in scope.
root_functions = [stem, lemmatize]

# TfidfVectorizer parameters
max_df_values = np.arange(0.7, 1.01, 0.1)  # Ignore terms that appear in more than 'max_df' of the documents; candidates are approximately 0.7, 0.8, 0.9, 1.0
min_df_values = np.arange(0.0, 0.4, 0.1)  # Ignore terms that appear in less than 'min_df' of the documents; candidates are approximately 0.0, 0.1, 0.2, 0.3
ngram_ranges = [(1, 1), (1, 2)]  # Token: word / word and bi-gram (consecutive word pairs)
max_features_values = [20, 50, 200, 500]  # Take the top 'max_features' terms that are most frequent in the corpus
sublinear_tf_values = [True, False]  # Apply logarithm to the frequency of each token in the corpus

# Hyperparameter combinations (2 x 4 x 4 x 4 x 2 x 2 = 512 tuples).
# Each tuple is ordered as (root_function, max_df, min_df, max_features, ngram_range, sublinear_tf);
# tuning() reads the values by these positions.
combinations = list(product(root_functions, max_df_values, min_df_values, max_features_values, ngram_ranges, 
                            sublinear_tf_values))

def tuning(combination, folder_texts):
    """
    Score one combination of hyperparameters.

    Args:
    - combination: Sequence ordered as (root_function, max_df, min_df,
      max_features, ngram_range, sublinear_tf).
    - folder_texts (str): Path to the folder with the text files, passed to txt_corpus.

    Returns:
    - Tuple (score, parameters): the value returned by metric() for the cosine
      similarity matrix, and a dict with the hyperparameters that were used.
    """
    # Preprocessing: build the corpus with the selected root function
    root_function = combination[0]
    corpus = txt_corpus(folder_texts, root_function)

    # Tfidf settings (note that positions 4 and 5 hold ngram_range and sublinear_tf)
    tfidf_settings = dict(
        max_df=combination[1],
        min_df=combination[2],
        stop_words='english',
        max_features=combination[3],
        sublinear_tf=combination[5],
        ngram_range=combination[4],
    )
    matrix_tfidf = tfidf(corpus, tfidf_settings)

    # Cosine similarity between all documents, then the track-separation score
    similarities = cosine_similarity(matrix_tfidf)
    score = metric(similarities)

    return score, {'root_function': root_function, **tfidf_settings}

# Iterate through all parameter combinations and calculate their score
start_time = time.time()
scores = []  # Score of each combination, in evaluation order
hyperparameters = []  # Parameters of each combination, aligned with 'scores'

# 'i' counts the combinations evaluated so far, so it ends at len(combinations)
for i, combination in enumerate(combinations, start=1):
    score, parameters = tuning(combination, r'C:\Users\ana_s\OneDrive\Escritorio\tfg\archivos\texts_extract')
    hyperparameters.append(parameters)
    scores.append(score)

elapsed_time = time.time() - start_time  # Seconds spent on the whole search
max_score = max(scores)  # Choose the maximum score
# Parameters of the first combination that reaches the maximum score
hyperparameters_max = hyperparameters[scores.index(max_score)]

### Function to Plot Scores

This function plots the scores corresponding to each hyperparameter value.

In [None]:
def scores_plot(scores, combinations, ylim=(-0.051, 0.02)):
    """
    Plot, for every hyperparameter value, the scores of the combinations using it.

    One subplot is drawn per distinct hyperparameter value. Each subplot shows
    the scores as a line, marks the five highest scores in red and draws a
    dashed green line at the mean.

    Args:
    - scores (list): Score of each combination.
    - combinations (list): Hyperparameter combinations, aligned with 'scores'.
    - ylim (tuple or None): (bottom, top) limits applied to the y axis of every
      subplot so that they are comparable; None keeps matplotlib's automatic limits.

    Returns:
    - Tuple (scores_dict, combination_info): a dict mapping each hyperparameter
      value (booleans stored as strings) to the list of its scores, and a list
      of (combination, score) tuples.
    """
    import matplotlib.pyplot as plt

    # Scores grouped by hyperparameter value
    scores_dict = {}

    # (combination, score) for every evaluated combination
    combination_info = []

    for combination, score in zip(combinations, scores):
        combination_info.append((combination, score))
        for hyperparam_value in combination:
            # True/False compare equal to 1/0 (and 1.0/0.0), so as dict keys they
            # would be merged with numeric hyperparameter values; use strings
            if isinstance(hyperparam_value, bool):
                hyperparam_value = str(hyperparam_value)
            scores_dict.setdefault(hyperparam_value, []).append(score)

    # Nothing to draw: plt.subplots cannot create a grid with zero rows
    if not scores_dict:
        return scores_dict, combination_info

    # squeeze=False always returns a 2-D array of axes, so a single subplot is
    # handled the same way as several
    fig, axs_grid = plt.subplots(nrows=len(scores_dict), figsize=(5, 4 * len(scores_dict)),
                                 squeeze=False)
    axs = axs_grid[:, 0]

    for ax, (hyperparam_value, hyperparam_scores) in zip(axs, scores_dict.items()):
        ax.plot(hyperparam_scores, label=f'{hyperparam_value}')

        # Positions of the top 5 scores
        max_indices = sorted(range(len(hyperparam_scores)),
                             key=lambda j: hyperparam_scores[j], reverse=True)[:5]

        # Highlight the top 5 scores
        ax.scatter(max_indices, [hyperparam_scores[idx] for idx in max_indices],
                   color='red', marker='o', label='Maximums')

        # Draw a horizontal line at the mean
        mean_score = sum(hyperparam_scores) / len(hyperparam_scores)
        ax.axhline(y=mean_score, color='green', linestyle='--', label='Mean')

        # A single legend call after all artists have been added
        ax.legend()

    # Adjust layout
    plt.tight_layout()

    # Set manual limits to ensure consistency in the y axes
    if ylim is not None:
        for ax in axs:
            ax.set_ylim(*ylim)

    plt.show()

    return scores_dict, combination_info

## Function to Calculate Similarity Scores
This function calculates similarity scores between predicted groups and true groups.

In [None]:
def calculate_similarity_scores(predicted_groups, true_groups):
    """
    Score how well each predicted group matches the true groups.

    For every element of a predicted group, the score is the fraction of the
    other members of its true group that are also in the predicted group.
    The score of a predicted group is the mean of its elements' scores.
    The scores and their average are printed.

    Args:
    - predicted_groups: Iterable of groups (each a sized collection of elements).
    - true_groups: Iterable of the reference groups; they are expected to be
      disjoint (for an element present in several true groups, matches are
      accumulated over all of them and normalized by the size of the last one).

    Returns:
    - List with the final score of each predicted group, in input order.
    """
    # List to store final similarity scores.
    final_scores = []

    for predicted_group in predicted_groups:
        # Computed before the element loop so that it is defined for empty groups.
        len_predicted = len(predicted_group)
        total_score = 0

        for predicted_element in predicted_group:
            matches = 0
            # Reset per element: an element found in no true group scores 0
            # instead of raising NameError or reusing a previous group's size.
            len_true = 0

            for true_group in true_groups:
                if predicted_element in true_group:
                    len_true = len(true_group)
                    # Count the other members of the true group that were
                    # placed in the same predicted group.
                    matches += sum(
                        1
                        for true_element in true_group
                        if true_element in predicted_group and true_element != predicted_element
                    )

            # A true group of size 1 has no other members to match.
            element_score = matches / (len_true - 1) if len_true > 1 else 0
            total_score += element_score

        final_score = total_score / len_predicted if len_predicted > 0 else 0
        final_scores.append(final_score)

    # statistics.mean raises on an empty list; report 0 when there are no groups.
    average_score = statistics.mean(final_scores) if final_scores else 0

    # Print final scores and average score.
    print('Final Scores:', final_scores)
    print('Average Score:', average_score)

    return final_scores
    
# NOTE(review): calculate_similarity_scores iterates over the elements of each
# predicted group, so it expects a collection of groups. If `kmeans.labels_` is a
# flat array of per-paper cluster labels (as with scikit-learn's KMeans), the labels
# must first be grouped into lists of paper indices — TODO confirm.
# `kmeans` and `tracks` are not defined in this section.
calculate_similarity_scores(kmeans.labels_, tracks)