# Compute token count distribution

---

* Load data
* Load relevant packages

---

In [None]:
import pandas as pd

# Load the extracted questions from a CSV file (path is relative to the working directory)
file_name = "extracted_questions.csv"
data = pd.read_csv(file_name)

# Progress check-in: report the row count and the column names right after loading
print("\n=== Initial Data Summary ===")
print(f"Dataset: {len(data)} rows, columns: {data.columns.tolist()}")

# Check the first few rows to confirm structure
print(data.head())

In [None]:
# Install spaCy (the `!` prefix runs the line as a shell command from the notebook)
!pip install spacy

# Download the German language model `de_core_news_sm`, which is loaded later via spacy.load
!python -m spacy download de_core_news_sm

---

* Define helper functions for progress check-ins
* Save a sample of the data for review after each step

---

In [None]:
# Logging functions for progress check-ins
def log_data_summary(data, step_name):
    """Print a short progress report about the DataFrame after a processing step.

    The report contains the row count, the number of rows whose
    'question_text' duplicates an earlier row, the number of missing
    'question_text' values, and the first five 'question_text' entries.

    Args:
        data: DataFrame with a 'question_text' column.
        step_name: Label of the processing step, shown in the report header.
    """
    print(f"\n=== Summary After Step: {step_name} ===")

    row_total = len(data)
    print(f"Number of rows: {row_total}")

    # duplicated() flags every repeat after the first occurrence
    duplicate_total = data.duplicated(subset='question_text').sum()
    print(f"Number of duplicate rows (based on 'question_text'): {duplicate_total}")

    questions = data['question_text']
    missing_total = questions.isnull().sum()
    print(f"Number of empty rows in 'question_text': {missing_total}")

    first_rows = questions.head(5)
    print(f"Sample of 'question_text':\n{first_rows}")

def save_sample(data, step_name, sample_size=50):
    """Save a reproducible random sample of the data to a CSV file for manual review.

    The sample is written to 'sample_after_{step_name}.csv' (relative to the
    working directory), UTF-8 encoded and without the index column.

    Args:
        data: DataFrame to sample from.
        step_name: Step label inserted into the output file name.
        sample_size: Maximum number of rows to sample. If the DataFrame has
            fewer rows, all of them are saved.
    """
    # DataFrame.sample (without replacement) raises ValueError when asked for
    # more rows than exist, so cap the request at the available row count.
    rows_to_draw = min(sample_size, len(data))
    # Fixed seed so the same rows are selected on every run
    sample = data.sample(rows_to_draw, random_state=42)
    sample.to_csv(f'sample_after_{step_name}.csv', index=False, encoding='utf-8')
    print(f"Sample saved for step: {step_name}")

# Check-in after loading: print the summary and save a review sample
# (written to 'sample_after_initial_load.csv')
log_data_summary(data, "Initial Load")
save_sample(data, "initial_load")

---
* Compute the token count of every question in the data set

---

In [None]:
import pandas as pd
import spacy
from tqdm import tqdm

# Load the German spaCy model downloaded above; calculate_token_count uses it to tokenize each question
nlp = spacy.load("de_core_news_sm")

# Function to calculate token count
def calculate_token_count(question):
    """Tokenize a question with the spaCy model and count its word and number tokens.

    Args:
        question: Question text. Missing or non-string values (for example the
            NaN pandas uses for empty CSV cells) are treated as having no tokens.

    Returns:
        A tuple (token_count, tokens), where tokens is the list of token texts
        that are alphabetic or consist of digits.
    """
    # The spaCy pipeline only accepts strings; without this guard a single
    # empty cell would abort the whole apply run.
    if not isinstance(question, str):
        return 0, []

    doc = nlp(question)  # Process the question with spaCy
    # Keep words and numbers; tokens that are neither (e.g. punctuation) are skipped
    tokens = [token.text for token in doc if token.is_alpha or token.is_digit]
    return len(tokens), tokens

# Register tqdm's progress_apply on pandas objects so the tokenization shows a progress bar
tqdm.pandas(desc="Processing Token Count")

# Tokenize every question once. Each call returns (token_count, tokens);
# zip(*...) splits these pairs into two tuples that follow the row order of `data`.
data['token_count'], tokens = zip(*data['question_text'].progress_apply(calculate_token_count))

# Save token count only for the main dataset
data_for_processing = data[['question_id', 'question_id_individual', 'question_text', 'token_count']]
data_for_processing.to_csv("questions_for_processing.csv", index=False)
print("Main dataset saved to 'questions_for_processing.csv'.")

# Save a sample with tokens for quality management.
# Cap the sample at the dataset size: sample() raises ValueError when n exceeds the row count.
sample_with_tokens = data.sample(n=min(1000, len(data)), random_state=42)
# NOTE(review): the index labels are used as positions into `tokens`, which assumes
# `data` still carries the default 0..n-1 index from read_csv — confirm if rows are ever filtered.
sample_with_tokens['tokens'] = sample_with_tokens.index.map(lambda idx: tokens[idx])
sample_with_tokens.to_csv("sample_with_tokens.csv", index=False)
print("Sample with tokens saved to 'sample_with_tokens.csv'.")

# Check-in: this step gets its own name so the log header is accurate and the
# review sample of the initial load ('sample_after_initial_load.csv') is not overwritten
log_data_summary(data_for_processing, "Token Count")
save_sample(data_for_processing, "token_count")

---
* Analyze token count distribution
* Generate summary statistics for token counts and visualise the distribution

---

In [None]:
import seaborn as sns
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import pandas as pd

# Read the token counts saved by the tokenization step
data = pd.read_csv("questions_for_processing.csv")
token_counts = data['token_count']

# Register the font file with matplotlib so it can be selected by family name
font_path = 'lmroman10-regular.otf'
fm.fontManager.addfont(font_path)

# Set the font family globally using the font's name
plt.rcParams['font.family'] = 'Latin Modern Roman'

# Choose a specific color from the "colorblind" palette
my_color = sns.color_palette("colorblind")[8]

# Dynamic x-ticks: Determine x tick positions based on token_counts
# NOTE(review): the step is derived from the number of distinct token counts, not from
# the value range, so more than max_ticks ticks can result when the values are sparse.
max_ticks = 12  # Target number of ticks
x_values = token_counts.value_counts().sort_index().index  # Sorted token count values
step = max(1, len(x_values) // max_ticks)
x_ticks = range(0, x_values.max() + 1, step)

# Create the boxplot exactly once, in the chosen color, BEFORE customizing the axes.
# Plotting a second time would stack duplicate artists on the axes and may
# reset labels/grid settings applied in between.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x=token_counts, color=my_color, ax=ax)

# Set x-ticks
ax.set_xticks(x_ticks)
ax.set_xticklabels([str(i) for i in x_ticks], rotation=45, ha="right", fontsize=16)

# Ensure y-ticks are visible and set their font size (for a horizontal boxplot, there may be only one category)
ax.tick_params(axis='y', labelsize=16)

# Remove lines on top and on the right
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

# Set axis labels
# NOTE(review): a horizontal boxplot has no frequency axis — confirm the y label is intended.
ax.set_xlabel("Token Count", fontsize=20, labelpad=20)
ax.set_ylabel("Frequency", fontsize=20, labelpad=20)

# Add vertical lines at each x-tick
for xtick in x_ticks:
    ax.axvline(x=xtick, color="gray", linestyle="--", linewidth=0.5, alpha=0.7)

# Add horizontal grid lines
ax.grid(axis="y", linestyle="--", linewidth=0.5, alpha=0.7)

# Add a horizontal line at y = 0
ax.axhline(y=0, color="gray", linestyle="--", linewidth=0.5)

plt.tight_layout()

# Save the figure before showing it: depending on the backend, plt.show() can
# release the figure, which would leave an empty file.
fig.savefig("TokenCountDistribution.pdf")

plt.show()

---
* Define which token count range to include in analysis based on mean and standard deviation

---

In [None]:
# Summary statistics of the token counts (count, mean, std, min, quartiles, max).
# describe() provides the 'mean' and 'std' entries used below; `data` is the
# frame read from 'questions_for_processing.csv' in the previous cell.
token_count_stats = data['token_count'].describe()
print("\n=== Token Count Summary Statistics ===")
print(token_count_stats)

# Define token count range (mean ± standard deviation)
mean = token_count_stats['mean']
std_dev = token_count_stats['std']

# Truncate to whole tokens; the lower bound never drops below one token
lower_bound = max(1, int(mean - std_dev))
upper_bound = int(mean + std_dev)

print(f"\nRecommended Token Count Range: {lower_bound} to {upper_bound}")

---


* Additional token count analysis

---

In [None]:
import pandas as pd
from google.colab import data_table

# Read the per-question token counts
file_path = "questions_for_processing.csv"  # Replace with your actual file path
data = pd.read_csv(file_path)

# Stop early when the column this analysis depends on is missing
if "token_count" not in data.columns:
    raise ValueError("The dataset does not contain the 'token_count' column.")

# Occurrences of each distinct token count, ordered by token count
token_count_distribution = data['token_count'].value_counts().sort_index()

# Denominator for the percentages: every row of the dataset
total_rows = len(data)

# One row per distinct token count. All three columns are plain arrays of the
# same length, so the frame gets a fresh 0..n-1 index directly.
percentage_per_count = (token_count_distribution / total_rows) * 100
token_count_overview = pd.DataFrame(
    {
        "Token Count": token_count_distribution.index,
        "Occurrences": token_count_distribution.values,
        "Percentage": percentage_per_count.values,
    }
)

# Write the overview to disk without the index column
output_file = "token_count_distribution_overview.csv"
token_count_overview.to_csv(output_file, index=False)
print(f"\nToken count distribution overview saved to '{output_file}'.")

# Show the overview as an interactive table (must stay the last expression of the cell)
print("\n=== Token Count Distribution Overview ===")
data_table.DataTable(token_count_overview)

---

* Choose size of dataset for labelling (token counts 5-20)

---

In [None]:
# Reload the dataset with token counts ('questions_for_processing.csv', written by the tokenization step)
file_name = "questions_for_processing.csv"
data = pd.read_csv(file_name)

# Function to sample questions by token count
def group_and_sample(data, sample_size_per_group, min_token_count, max_token_count):
    """Draw a reproducible random sample of questions for each token count in a range.

    Args:
        data: DataFrame with a 'token_count' column.
        sample_size_per_group: Maximum number of rows drawn per distinct token
            count; smaller groups are taken in full.
        min_token_count: Lowest token count to include (inclusive).
        max_token_count: Highest token count to include (inclusive).

    Returns:
        A DataFrame with the sampled rows, ordered by token count and with a
        fresh 0..n-1 index. If no row falls into the range, an empty DataFrame
        with the same columns as `data` is returned.
    """
    # Filter the data for token counts between min_token_count and max_token_count
    in_range = (data['token_count'] >= min_token_count) & (data['token_count'] <= max_token_count)
    filtered_data = data[in_range]

    # One sample per distinct token count; the fixed seed keeps the selection reproducible
    sampled_questions = [
        group.sample(n=min(sample_size_per_group, len(group)), random_state=42)
        for _, group in filtered_data.groupby('token_count')
    ]

    # pd.concat raises ValueError on an empty list, so handle the
    # "nothing in range" case explicitly.
    if not sampled_questions:
        return filtered_data.reset_index(drop=True)

    return pd.concat(sampled_questions).reset_index(drop=True)

# Define parameters
sample_size_per_group = 625  # Rows drawn per distinct token count (fewer if a group is smaller)
min_token_count = 5
max_token_count = 20

# Sample from the full dataset loaded above. With 16 token counts (5-20) and
# up to 625 rows each, the result holds at most 10,000 questions.
representative_sample = group_and_sample(data, sample_size_per_group, min_token_count, max_token_count)

print(f"{len(representative_sample)} questions sampled between {min_token_count} and {max_token_count} tokens.")

---
* Save to file

---


In [None]:
# Save the representative sample to a CSV file (without the index column)
output_file = "representative_sample_10000(5-20).csv"
representative_sample.to_csv(output_file, index=False)
print(f"\nRepresentative sample saved to '{output_file}'.")