# Exploratory Data Analysis

## 0. Environment setup

In [None]:
# Switch to home project directory
%cd ../..

In [None]:
import json
import os

import pandas as pd
import numpy as np

import spacy

import seaborn as sns
import matplotlib.pyplot as plt

from collections import defaultdict

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr, spearmanr, kendalltau

from bert_score import BERTScorer
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import logging
logging.set_verbosity_error()

from tqdm.notebook import tqdm

from textstat import flesch_reading_ease

from src.unite_talking_points.utils.config.config_loader import ConfigLoader

In [None]:
# Load the project configuration used by the rest of the notebook.
# NOTE(review): current_directory_is_root=True presumably relies on the
# working directory having been switched to the project root by the `%cd`
# cell above - confirm against ConfigLoader.
config = ConfigLoader().load_config(current_directory_is_root=True)

## 1. Data loading

In [None]:
def load_json_data(data_dir):
    """
    Load every ``*.json`` file of a directory into a pandas DataFrame.

    Files are read in sorted file-name order so that the row positions of the
    resulting DataFrame are reproducible across machines and runs
    (``os.listdir`` alone returns entries in arbitrary order).

    Args:
    - data_dir (str): Path to the directory containing JSON files.

    Returns:
    - df (pd.DataFrame): One row per JSON file with the columns
      ``file_name``, ``label``, ``document_name``, ``meeting_name``,
      ``meeting_date``, ``content`` and ``prompt``. The columns are present
      even when the directory holds no JSON file.

    Raises:
    - KeyError: If a JSON file lacks one of the expected keys.
    """
    # Keys copied verbatim from each JSON document into a column of the same name
    fields = ('label', 'document_name', 'meeting_name', 'meeting_date', 'content', 'prompt')
    records = []

    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith('.json'):
            continue
        # JSON is UTF-8 by specification; do not depend on the platform locale
        with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as file:
            data = json.load(file)
        record = {'file_name': filename}
        record.update({field: data[field] for field in fields})
        records.append(record)

    # Passing `columns` keeps the schema stable for an empty directory
    return pd.DataFrame(records, columns=['file_name', *fields])

In [None]:
# Build the working DataFrame from the raw-data directory named in the config
df = load_json_data(config['Directories']['raw_data_path'])

In [None]:
# Display the loaded DataFrame as the cell output
df

## 2. Basic statistics

In [None]:
# Corpus size plus the number of documents per label and per meeting name
num_documents = df.shape[0]
label_counts = df['label'].value_counts()
meeting_counts = df['meeting_name'].value_counts()

# Report the figures: one heading followed by its value-count table
print("Basic Statistics:")
print("Number of documents:", num_documents)
for heading, counts in (
    ("\nLabel Distribution:", label_counts),
    ("\nMeeting Name Distribution:", meeting_counts),
):
    print(heading)
    print(counts)

## 3. Data Preprocessing

In [None]:
# Load the English spaCy pipeline 'en_core_web_trf'; the resulting `nlp`
# object is used below for preprocessing and for named entity recognition.
nlp = spacy.load('en_core_web_trf')

In [None]:
# Normalise a raw document into a space-separated string of lemmas
def preprocess_text(text):
    """
    Return the lower-cased lemmas of `text`, skipping stop words and punctuation.

    Args:
    - text (str): Raw text to process with the module-level spaCy pipeline `nlp`.

    Returns:
    - str: The kept lemmas joined by single spaces.
    """
    kept_lemmas = []
    for token in nlp(text):
        # Drop tokens spaCy flags as stop words or punctuation
        if token.is_stop or token.is_punct:
            continue
        kept_lemmas.append(token.lemma_.lower())

    return ' '.join(kept_lemmas)

In [None]:
# Store the normalised version of every document next to the raw text
df['preprocessed_content'] = df['content'].map(preprocess_text)

# Show the first few preprocessed documents under a heading
print("Preprocessed Content:", df['preprocessed_content'].head(), sep="\n")

In [None]:
# Parse the meeting dates, given as day-month-year strings ('%d-%m-%Y'),
# into pandas datetimes so the date analysis below can use the `.dt` accessor.
df['meeting_date'] = pd.to_datetime(df['meeting_date'], format='%d-%m-%Y')

## 4. Word Frequency Analysis

In [None]:
# Count how often each whitespace-separated word occurs in a text
def calculate_word_frequencies(text):
    """
    Return the frequency of every word in `text`.

    Args:
    - text (str): Text whose words are separated by whitespace.

    Returns:
    - pd.Series: Occurrence counts indexed by word, most frequent first.
    """
    return pd.Series(text.split()).value_counts()

In [None]:
# Treat the whole corpus as one text and count its words
all_concatenated_text = ' '.join(df['preprocessed_content'])
all_word_frequencies = calculate_word_frequencies(all_concatenated_text)

# Horizontal bar chart of the 20 most frequent words
top_words = all_word_frequencies.head(20)
fig, axes = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_words.values, y=top_words.index, ax=axes)
axes.set(
    title='Análisis frecuencial de palabras global',
    xlabel='Frecuencia',
    ylabel='Palabra',
)

# Save before showing the figure
fig.tight_layout()
fig.savefig('img/frecuencia_global.png')
plt.show()

In [None]:
# Word frequencies per label: the preprocessed documents sharing a label are
# joined into one text, which is then counted
word_frequencies_by_label = {
    label: calculate_word_frequencies(
        ' '.join(df.loc[df['label'] == label, 'preprocessed_content'])
    )
    for label in df['label'].unique()
}

# One subplot per label on a fixed 2x3 grid
# NOTE(review): the grid assumes at most six distinct labels - confirm
fig, axes = plt.subplots(2, 3, figsize=(20, 12))

# Flatten the axes array to facilitate iteration
axes = axes.flatten()

# Plot the n most frequent words of each label using Seaborn
n = 10
for i, (label, word_freq) in enumerate(word_frequencies_by_label.items()):
    top_words = word_freq.head(n)
    sns.barplot(x=top_words.values, y=top_words.index, ax=axes[i])
    axes[i].set_title(f'{label}')
    # The bars are horizontal: frequencies run along x and words along y,
    # matching the labels of the global frequency plot
    axes[i].set_xlabel('Frecuencia')
    axes[i].set_ylabel('Palabra')
    axes[i].tick_params(axis='x', rotation=45)

# Hide empty subplots
for j in range(len(word_frequencies_by_label), len(axes)):
    fig.delaxes(axes[j])

plt.suptitle('Análisis frecuencial de palabras por etiquetas temáticas', fontsize=24)
plt.tight_layout()
plt.savefig('img/frecuencia_local.png')
plt.show()

## 5. N-gram Analysis

In [None]:
# Count the n-grams of a single text
def calculate_ngram_frequencies(text, n):
    """
    Return the frequency of every n-gram of exactly `n` words in `text`.

    N-grams are extracted with a CountVectorizer configured with
    ``stop_words='english'``.

    Args:
    - text (str): Text to analyse; it is vectorised as a one-document corpus.
    - n (int): Number of words per n-gram.

    Returns:
    - pd.Series: Counts indexed by n-gram, sorted from most to least frequent.
    """
    vectorizer = CountVectorizer(ngram_range=(n, n), stop_words='english')

    # The corpus has one document, so the count matrix has a single row
    counts_row = vectorizer.fit_transform([text]).toarray()[0]

    frequencies = pd.Series(counts_row, index=vectorizer.get_feature_names_out())
    return frequencies.sort_values(ascending=False)

# Function to plot n-gram analysis
def plot_ngram_analysis(ngram_freq, n, label='', save_path='img/bigramas_global.png'):
    """
    Plot the 20 most frequent n-grams as a horizontal bar chart.

    Args:
    - ngram_freq (pd.Series): Counts indexed by n-gram, most frequent first.
    - n (int): N-gram size; selects the wording of the title and y label.
    - label (str): Optional title. When empty, a global-analysis title is used.
    - save_path (str | None): Where to save the figure. The default keeps the
      original output file; pass another path so that different calls do not
      overwrite each other, or None to skip saving.
    """
    # Spanish singular/plural names for the common sizes, generic wording otherwise
    singular, plural = {
        1: ('Unigrama', 'unigramas'),
        2: ('Bigrama', 'bigramas'),
        3: ('Trigrama', 'trigramas'),
    }.get(n, (f'{n}-grama', f'{n}-gramas'))

    plt.figure(figsize=(10, 6))
    ngram_freq.head(20).plot(kind='barh')
    if label:
        plt.title(f'{label}')
    else:
        plt.title(f'Análisis frecuencial global de {plural}')
    plt.xlabel('Frecuencia')
    plt.ylabel(singular)
    plt.xticks(rotation=45)
    plt.tight_layout()
    # Save before plt.show(): the canvas may no longer be available afterwards
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()

In [None]:
# Bigram frequencies over the whole preprocessed corpus
concatenated_text = ' '.join(df['preprocessed_content'])
ngram_freq = calculate_ngram_frequencies(concatenated_text, n=2)

# Plot N-gram analysis. plot_ngram_analysis saves the figure under img/ before
# showing it. The extra plt.savefig('bigramas_global.png') that followed this
# call was removed: it ran after plt.show(), so it did not capture the chart,
# and it wrote to a stray path outside img/.
plot_ngram_analysis(ngram_freq, n=2)

In [None]:
# Perform N-gram analysis for each label, one subplot per label
# NOTE(review): the 2x3 grid assumes at most six distinct labels - confirm
fig, axes = plt.subplots(2, 3, figsize=(20, 12))

# Flatten the axes array to facilitate iteration
axes = axes.flatten()

for idx, label in enumerate(df['label'].unique()):
    # Filter the DataFrame by label
    subset_df = df[df['label'] == label]

    # Concatenate preprocessed content of all documents for the label
    concatenated_text = ' '.join(subset_df['preprocessed_content'])

    # Calculate N-gram frequencies for bi-grams (n=2)
    ngram_freq = calculate_ngram_frequencies(concatenated_text, n=2)

    # head(10) is already sorted from most to least frequent. barh draws the
    # first row at the bottom, so one inversion of the y-axis puts the most
    # frequent bigram on top. Re-sorting ascending first would cancel the
    # inversion and leave the highest bar at the bottom.
    ax = axes[idx]
    ngram_freq.head(10).plot(kind='barh', ax=ax)
    ax.set_title(f'{label}')
    ax.set_xlabel('Frecuencia')
    ax.set_ylabel('Bigrama')
    ax.invert_yaxis()  # Highest frequency at the top

# Hide empty subplots
for j in range(len(df['label'].unique()), len(axes)):
    fig.delaxes(axes[j])

plt.suptitle('Análisis frecuencial de bigramas por etiquetas temáticas', fontsize=24)
plt.tight_layout()
plt.savefig('img/bigrama_local.png')
plt.show()


## 6. Named Entity Recognition Analysis

In [None]:
def perform_ner_with_entities(text):
    """
    Run the module-level spaCy pipeline `nlp` on `text` and summarise its entities.

    Args:
    - text (str): Text to analyse.

    Returns:
    - entity_counts (defaultdict[str, int]): Number of entity mentions per
      entity label.
    - unique_entities (set[tuple[str, str]]): Distinct (entity text, entity
      label) pairs found in the text.
    """
    entities = nlp(text).ents

    # Tally mentions per entity label
    entity_counts = defaultdict(int)
    for ent in entities:
        entity_counts[ent.label_] += 1

    # A set collapses repeated mentions of the same (text, label) pair
    unique_entities = {(ent.text, ent.label_) for ent in entities}

    return entity_counts, unique_entities

In [None]:
# Perform NER and extract unique entities for each label
ner_results_by_label = {}
unique_entities_by_label = {}
for label in df['label'].unique():
    # Filter the DataFrame by label
    subset_df = df[df['label'] == label]

    # Concatenate the RAW content of all documents for the label. The
    # preprocessed text is lemmatised, lower-cased and stripped of stop words
    # and punctuation, which removes the casing and sentence context the NER
    # component works from, so entities are recognised on the original text.
    concatenated_text = ' '.join(subset_df['content'])

    # Perform NER and extract unique entities
    ner_results, unique_entities = perform_ner_with_entities(concatenated_text)
    ner_results_by_label[label] = ner_results
    unique_entities_by_label[label] = unique_entities

# Print the distinct entities found for each label. The per-type counts are
# not printed here; they remain available in ner_results_by_label.
for label, label_entities in unique_entities_by_label.items():
    print(f"Unique Entities for Label: {label}")
    for entity, ent_label in label_entities:
        print(f"{entity} ({ent_label})")
    print()

## 7. Document Length Distribution

In [None]:
# Document length = number of whitespace-separated words in the raw content
df['document_length'] = df['content'].map(lambda text: len(text.split()))

# Density curve on the left axis, limited to the observed length range
fig, ax1 = plt.subplots()
sns.kdeplot(data=df, x="document_length", ax=ax1)
shortest, longest = df["document_length"].min(), df["document_length"].max()
ax1.set_xlim((shortest, longest))
ax1.set(xlabel='Número de palabras', ylabel='Densidad')

# Histogram of document counts on a twin axis sharing the same x range.
# The y label is set before drawing, as in the original statement order.
ax2 = ax1.twinx()
ax2.set_ylabel('Número de documentos')
sns.histplot(data=df, x="document_length", bins=15, ax=ax2)

plt.title('Distribución de la longitud de documentos')
plt.savefig('img/distribucion_longitud.png')
plt.show()

In [None]:
# Box plot of document length (in words) for each label
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='label', y='document_length', ax=ax)
ax.set_title('Distribución de la longitud por etiqueta temática')
ax.set_xlabel('Etiqueta temática')
ax.set_ylabel('Número de palabras')
# Tilt the label names on the x-axis
plt.xticks(rotation=45)
plt.savefig('img/distribucion_longitud_etiquetas.png')
plt.show()

## 8. Text Similarity Analysis

In [None]:
# Fit a TF-IDF model on the preprocessed documents and vectorise them
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['preprocessed_content'])

# Document-by-document cosine similarity matrix
tfidf_cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Annotated heatmap of the similarity matrix
plt.figure(figsize=(20, 8))
sns.heatmap(
    tfidf_cosine_similarities,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
)
plt.title('Similaridad coseno entre documentos', fontsize=18)
plt.xlabel('ID del documento')
plt.ylabel('ID del documento')
plt.savefig('img/similaridad_coseno.png')
plt.show()

In [None]:
# Boolean mask selecting every cell except the main diagonal (each document
# compared with itself)
mask = ~np.eye(*tfidf_cosine_similarities.shape, dtype=bool)

# Mean cosine similarity over the off-diagonal cells only
mean_cosine_similarity = np.mean(tfidf_cosine_similarities[mask])

print("La media de las similaridades de coseno (excluyendo diagonal) es:", mean_cosine_similarity)

In [None]:
# Get unique labels
unique_labels = df['label'].unique()

# One heatmap per label on a fixed 2x3 grid
# NOTE(review): the grid assumes at most six distinct labels - confirm
fig, axes = plt.subplots(2, 3, figsize=(20, 10))

# Flatten the axes array to facilitate iteration
axes = axes.flatten()

for idx, label in enumerate(unique_labels):
    # Filter the DataFrame by label
    subset_df = df[df['label'] == label]

    # Cosine similarities within the label subset, using the TF-IDF model
    # fitted on the whole corpus
    tfidf_matrix_label = tfidf_vectorizer.transform(subset_df['preprocessed_content'])
    tfidf_cosine_similarities_label = cosine_similarity(tfidf_matrix_label, tfidf_matrix_label)

    # Plot heatmap
    ax = axes[idx]
    sns.heatmap(tfidf_cosine_similarities_label, ax=ax, cmap='coolwarm', annot=True, fmt=".2f")
    ax.set_title(f'{label}')
    ax.set_xlabel('ID del documento')
    ax.set_ylabel('ID del documento')

# Hide empty subplots, as the other per-label figures of this notebook do
for unused_ax in axes[len(unique_labels):]:
    fig.delaxes(unused_ax)

# Leave room at the top of the figure for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.suptitle('Similaridad coseno entre documentos por etiqueta temática', fontsize=24, y=0.98)
plt.savefig('img/similaridad_coseno_label.png')
plt.show()

In [None]:
# Mean off-diagonal cosine similarity for each label, in the order the labels
# first appear in the DataFrame
labels_in_order = df['label'].unique()
mean_cosine_similarities_per_label = []

for label in labels_in_order:
    # Filter the DataFrame by label
    subset_df = df[df['label'] == label]

    # Cosine similarities within the label subset
    tfidf_matrix_label = tfidf_vectorizer.transform(subset_df['preprocessed_content'])
    tfidf_cosine_similarities_label = cosine_similarity(tfidf_matrix_label, tfidf_matrix_label)

    # Mask that excludes the main diagonal
    mask = np.ones(tfidf_cosine_similarities_label.shape, dtype=bool)
    np.fill_diagonal(mask, 0)
    off_diagonal = tfidf_cosine_similarities_label[mask]

    # A label with a single document has no document pairs: record NaN
    # explicitly instead of averaging an empty array (which emits a warning)
    mean_cosine_similarity = np.mean(off_diagonal) if off_diagonal.size else np.nan
    mean_cosine_similarities_per_label.append(mean_cosine_similarity)

# Mean of the per-label means, ignoring labels without any document pair so a
# single-document label cannot turn the overall figure into NaN
defined_means = [value for value in mean_cosine_similarities_per_label if not np.isnan(value)]
overall_mean_cosine_similarity = np.mean(defined_means) if defined_means else np.nan

# Print the per-label means and the overall mean
for label, label_mean in zip(labels_in_order, mean_cosine_similarities_per_label):
    print(f"Media de similaridad de coseno para la etiqueta '{label}': {label_mean}")
print(f"Media general de las medias de similaridades de coseno: {overall_mean_cosine_similarity}")

In [None]:
# Pairwise BERTScore (F1) between the raw documents
scorer = BERTScorer(model_type='bert-base-uncased')

# Work on a plain list so that documents are addressed by position; the
# label-based df['content'][i] lookup only works while the DataFrame keeps its
# default 0..n-1 index.
contents = df['content'].tolist()
num_docs = len(contents)
bert_scores = np.zeros((num_docs, num_docs))

for i in tqdm(range(num_docs)):
    # Only j >= i is scored; the value is mirrored into (j, i).
    # NOTE(review): this relies on the F1 score being symmetric in its two
    # arguments - confirm for the scorer configuration in use.
    for j in tqdm(range(i, num_docs), leave=False):
        _, _, f1 = scorer.score([contents[i]], [contents[j]], verbose=False)
        bert_scores[i, j] = bert_scores[j, i] = f1.item()

# Plot heatmap of BERTScore similarities
plt.figure(figsize=(20, 8))
sns.heatmap(bert_scores, cmap='coolwarm', annot=True, fmt=".2f")
plt.title('BERTScore entre documentos', fontsize=18)
plt.xlabel('ID del documento')
plt.ylabel('ID del documento')
plt.savefig('img/bert_score.png')
plt.show()

In [None]:
# Mask that excludes the main diagonal (each document compared with itself)
mask = np.ones(bert_scores.shape, dtype=bool)
np.fill_diagonal(mask, 0)

# Mean BERTScore over the off-diagonal cells only
mean_bertscore_similarity = np.mean(bert_scores[mask])

# The message now names the metric actually averaged (it said "coseno")
print("La media de las similaridades BERTScore (excluyendo diagonal) es:", mean_bertscore_similarity)

In [None]:
# Extract the strictly upper-triangular part of a matrix (diagonal excluded)
def get_upper_triangle(matrix):
    """
    Return the entries of `matrix` that lie above its main diagonal.

    Args:
    - matrix (np.ndarray): Two-dimensional array.

    Returns:
    - np.ndarray: One-dimensional array of the above-diagonal entries in
      row-major order.
    """
    rows, cols = np.triu_indices_from(matrix, k=1)
    return matrix[rows, cols]

# Flatten both similarity matrices to their above-diagonal entries so each
# document pair contributes exactly once
flat_bert_scores = get_upper_triangle(bert_scores)
# NOTE: despite the name, these are cosine similarities, not distances
flat_cosine_distances = get_upper_triangle(tfidf_cosine_similarities)

# Correlation between the two measures; only the coefficient (first element
# of each result) is kept, the p-value is discarded
pearson_corr = pearsonr(flat_bert_scores, flat_cosine_distances)[0]
spearman_corr = spearmanr(flat_bert_scores, flat_cosine_distances)[0]
kendall_corr = kendalltau(flat_bert_scores, flat_cosine_distances)[0]

print("Correlación de Pearson:", pearson_corr)
print("Correlación de Spearman:", spearman_corr)
print("Correlación de Kendall:", kendall_corr)

## 9. Prompt Similarity Analysis

In [None]:
# TF-IDF cosine similarity between each document and its prompt.
# NOTE: the list keeps the name `jaccards` because the plotting cell below
# reads it, but the values are cosine similarities, not Jaccard indices.
jaccards = []
for doc, prompt in zip(df['preprocessed_content'], df['prompt']):
    doc_vector = tfidf_vectorizer.transform([doc])
    # The vectorizer vocabulary was fitted on preprocessed (lemmatised,
    # stop-word-free) text, so the prompt goes through the same preprocessing
    # as the document before it is vectorised. Otherwise inflected prompt
    # words cannot match the lemmas in the vocabulary.
    prompt_vector = tfidf_vectorizer.transform([preprocess_text(prompt)])
    jaccards.append(cosine_similarity(doc_vector, prompt_vector)[0][0])

In [None]:
# Distribution of the content-prompt similarity: density curve on the left
# axis, histogram on a twin axis sharing the same x range
fig, ax1 = plt.subplots()
sns.kdeplot(jaccards, ax=ax1)
ax1.set_xlim((min(jaccards), max(jaccards)))
# The x label goes on ax1: the twin axes created by twinx() has its x-axis
# hidden, so a label set through plt.xlabel() after twinx() is never drawn.
# The wording names the metric actually stored in `jaccards` (TF-IDF cosine
# similarity, not Jaccard).
ax1.set_xlabel('Cosine similarity')
ax2 = ax1.twinx()
sns.histplot(jaccards, bins=15, ax=ax2)
plt.title('Content-Prompt TF-IDF cosine similarity distribution')
plt.show()

In [None]:
# BERTScore F1 (third element of the scorer output) between each raw document
# and its prompt.
# NOTE: this rebinds `bert_scores`, which earlier cells use for the pairwise
# document matrix; re-running those cells after this one needs the matrix to
# be recomputed first.
bert_scores = [
    scorer.score([doc], [prompt], verbose=False)[2].data.item()
    for doc, prompt in tqdm(zip(df['content'], df['prompt']), total=len(df))
]

In [None]:
# Mean of the values currently held in `bert_scores`, shown as the cell output
np.mean(bert_scores)

In [None]:
# Distribution of the content-prompt BERTScore: density curve on the left
# axis, histogram of document counts on a twin axis
fig, ax1 = plt.subplots()
sns.kdeplot(bert_scores, ax=ax1)
ax1.set_xlim((min(bert_scores), max(bert_scores)))
# The x-axis shows BERTScore values; the previous 'Número de palabras' label
# was left over from the document-length plot
ax1.set_xlabel('BERTScore (F1)')
ax1.set_ylabel('Densidad')

ax2 = ax1.twinx()
ax2.set_ylabel('Número de documentos')
sns.histplot(bert_scores, bins=15, ax=ax2)
# Title typo fixed ('Disitribución' -> 'Distribución')
plt.title('Distribución de la similaridad entre contenido y prompt.')
plt.savefig('img/bertscores_prompt_content.png')
plt.show()

## 10. Fluency Analysis

In [None]:
# Load the pre-trained 'gpt2' language model and its tokenizer; both are used
# by calculate_perplexity below as module-level objects.
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Function to calculate perplexity
def calculate_perplexity(text):
    """
    Return the perplexity of `text` under the module-level GPT-2 `model`.

    The text is tokenised with the module-level `tokenizer` and truncated to
    the model's maximum context length (``model.config.n_positions``): GPT-2
    cannot embed positions beyond that limit, so a longer document would make
    the forward pass fail. For such documents the perplexity therefore
    describes only the leading part of the text.

    Args:
    - text (str): Text to score.

    Returns:
    - float: exp(loss), where the loss is the first element of the model
      output when the input ids are also passed as labels.
    """
    encode = tokenizer.encode(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=model.config.n_positions,
    )
    # Inference only: no gradients are needed
    with torch.no_grad():
        loss = model(encode, labels=encode)[0]
    return torch.exp(loss).item()

In [None]:
# GPT-2 perplexity of every raw document
perplexities = df['content'].apply(calculate_perplexity)

In [None]:
# Mean perplexity over all documents, shown as the cell output
np.mean(perplexities)

In [None]:
# Distribution of the document perplexities: density curve on the left axis,
# histogram of document counts on a twin axis
fig, ax1 = plt.subplots()
sns.kdeplot(perplexities, ax=ax1)
lowest, highest = min(perplexities), max(perplexities)
ax1.set_xlim((lowest, highest))
ax1.set_xlabel('Perplejidad de GPT-2')
ax1.set_ylabel('Densidad')

# The twin's y label is set before the histogram is drawn, as in the
# original statement order
ax2 = ax1.twinx()
ax2.set_ylabel('Número de documentos')
sns.histplot(perplexities, bins=15, ax=ax2)

plt.title('Distribución de la perplejidad de los documentos')
plt.savefig('img/perplexity.png')
plt.show()

## 11. Readability Analysis

In [None]:
# Flesch Reading Ease score of every raw document
fleshs = df['content'].apply(flesch_reading_ease)

In [None]:
# Mean Flesch Reading Ease score over all documents, shown as the cell output
np.mean(fleshs)

In [None]:
# Distribution of the Flesch Reading Ease scores: density curve on the left
# axis, histogram of document counts on a twin axis
fig, ax1 = plt.subplots()
sns.kdeplot(fleshs, ax=ax1)
ax1.set_xlim((min(fleshs), max(fleshs)))
# ax1 carries the x label. The former plt.xlabel('Flesh score') call was
# removed: it targeted the twin axes, whose x-axis is hidden by twinx(), so it
# was never drawn (and misspelt "Flesch").
ax1.set_xlabel('Puntuación')
ax1.set_ylabel('Densidad')
ax2 = ax1.twinx()
ax2.set_ylabel('Número de documentos')
sns.histplot(fleshs, bins=15, ax=ax2)
plt.title('Flesch Reading Ease de los documentos')
plt.savefig('img/flesch.png')
plt.show()

## 12. Date Analysis

In [None]:
# Summary statistics of the meeting dates, shown as the cell output
df['meeting_date'].describe()

In [None]:
# Histogram of the meeting dates using 50 bins
fig, ax = plt.subplots(figsize=(10, 6))
df['meeting_date'].hist(bins=50, alpha=0.7, ax=ax)
ax.set_title('Distribución de los documentos a través del tiempo')
ax.set_xlabel('Fecha')
ax.set_ylabel('Frecuencia')
fig.savefig('img/fechas.png')
plt.show()

In [None]:
# Extracting month and year
df['year'] = df['meeting_date'].dt.year
df['month'] = df['meeting_date'].dt.month

# Grouping by year and month: one row per year, one column per month
monthly_counts = df.groupby(['year', 'month']).size().unstack(fill_value=0)

# Plotting. DataFrame.plot opens its own figure unless it is given an axes, so
# a preceding plt.figure(figsize=...) only produced an empty extra figure and
# the requested size was ignored. Create the axes explicitly and draw on it.
fig, ax = plt.subplots(figsize=(12, 8))
monthly_counts.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Frequency of Documents by Month and Year')
ax.set_xlabel('Year')
ax.set_ylabel('Frequency')
ax.legend(title='Month')
plt.show()

In [None]:
# Grouping by month (as a period) and label: one row per month, one column
# per label
label_distribution = df.groupby([df['meeting_date'].dt.to_period('M'), 'label']).size().unstack(fill_value=0)

# Plotting. DataFrame.plot opens its own figure unless it is given an axes, so
# a preceding plt.figure(figsize=...) only produced an empty extra figure and
# the requested size was ignored. Create the axes explicitly and draw on it.
fig, ax = plt.subplots(figsize=(12, 8))
label_distribution.plot(kind='line', stacked=False, ax=ax)
ax.set_title('Distribution of Labels Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Frequency')
ax.legend(title='Label')
plt.show()

In [None]:
# Inspect the row at position 19.
# NOTE(review): which document sits at a given position depends on the order
# in which the files were loaded - confirm the intended document.
df.iloc[19]

In [None]:
# Print the prompt of the row at position 25
print(df.iloc[25].prompt)

In [None]:
# Print the content of the row at position 25 (same row as the prompt above)
print(df.iloc[25].content)

In [None]:
# Display the DataFrame, now including the columns added during the analysis
df

In [None]:
# Print the content of the row at position 22
print(df.iloc[22].content)