# Extract questions for manual data annotation

---
* Load data

---

In [None]:
import pandas as pd

# Read the cleaned question dataset from the working directory
file_name = "cleaned_dataset.csv"
data = pd.read_csv(file_name)

# Progress check-in: report the row count and the column names
print("\n=== Initial Data Summary ===")
print(f"Dataset: {data.shape[0]} rows, columns: {data.columns.tolist()}")

# Show the first five rows so the structure can be verified by eye
print(data.head(5))

---
* Create progress check-in
* Create extraction of a sample set for review after each step

---

In [None]:
# Logging functions for progress check-ins
def log_data_summary(data, step_name):
    """Print a short health report of ``data`` after a pipeline step.

    Reports the row count, the number of rows whose 'question_text' duplicates
    an earlier row, the number of missing 'question_text' values, and the
    first five 'question_text' entries.

    Args:
        data: DataFrame with a 'question_text' column.
        step_name: Label of the step, shown in the report header.
    """
    print(f"\n=== Summary After Step: {step_name} ===")

    row_count = len(data)
    print(f"Number of rows: {row_count}")

    duplicate_count = data.duplicated(subset='question_text').sum()
    print(f"Number of duplicate rows (based on 'question_text'): {duplicate_count}")

    question_column = data['question_text']
    missing_count = question_column.isnull().sum()
    print(f"Number of empty rows in 'question_text': {missing_count}")

    preview = question_column.head(5)
    print(f"Sample of 'question_text':\n{preview}")

def save_sample(data, step_name, sample_size=50):
    """Write a reproducible random sample of ``data`` to a CSV file for review.

    The file is named ``sample_after_<step_name>.csv`` and is written to the
    current working directory without the index, encoded as UTF-8.

    Args:
        data: DataFrame to sample from.
        step_name: Step label used in the output file name.
        sample_size: Maximum number of rows to sample. If ``data`` has fewer
            rows, all of them are written.
    """
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so cap the request at the frame size.
    effective_size = min(sample_size, len(data))
    sample = data.sample(effective_size, random_state=42)
    sample.to_csv(f'sample_after_{step_name}.csv', index=False, encoding='utf-8')
    print(f"Sample saved for step: {step_name}")

# Check-in: summarize the freshly loaded data and store a sample for review
log_data_summary(data, step_name="Initial Load")
save_sample(data, step_name="initial_load")

---


* Initialize spaCy for word class detection

---

In [None]:
# IPython shell command: download the German spaCy model "de_core_news_sm",
# which is loaded with spacy.load(...) in the next cell.
!python -m spacy download de_core_news_sm

In [None]:
from functools import lru_cache

import spacy

# Load SpaCy German language model once; every is_noun call reuses it
nlp = spacy.load("de_core_news_sm")


# The same words are checked many times across the dataset, and each check
# runs the spaCy pipeline; memoizing per word avoids the repeated work.
# The cache is bounded so it cannot grow without limit.
@lru_cache(maxsize=65536)
def is_noun(word):
    """Return True if spaCy tags at least one token of ``word`` as a noun.

    Args:
        word: Text to analyse (a str; it must be hashable because results
            are cached per distinct input).

    Returns:
        True if any token produced by ``nlp(word)`` has the coarse
        part-of-speech tag "NOUN", otherwise False.
    """
    doc = nlp(word)
    return any(token.pos_ == "NOUN" for token in doc)


---
Step 4

* Protect patterns in the text
* Apply rules for handling periods that should not be interpreted as sentence-ending.

---

In [None]:
import re
from tqdm import tqdm

# Initialize tqdm for pandas (provides the `progress_apply` method used when
# the protection rules are applied to the dataset)
tqdm.pandas()

# Abbreviations, written without their trailing period. A period that follows
# one of these is a candidate for protection from sentence splitting.
# This is a set, so repeated entries (e.g. "Fr") collapse into one.
abbreviations = {
    "Abb", "Abk", "Abs", "Abschn", "Adj", "Adv", "afk", "afr", "AJ", "Akk", "allg", "alph", "Alt",
    "Anh", "Anl", "Anm", "Apr", "arab", "Art", "Aufl", "Aug", "Ausg", "Az", "Bd", "BL", "bes", "Bez",
    "BHF", "Bsp", "bspw", "bzgl", "bzw", "ca", "ccm", "cm", "CPU", "Dat", "dazw", "ders", "desgl",
    "Det", "Dez", "dgl", "dh", "Di", "Do", "Dr", "dt", "Dt", "EG", "ehem", "eigntl", "eigtl", "Eings",
    "einschl", "entsp", "erg", "etc", "ETF", "etw", "ev", "evtl", "f", "Feb", "FC", "Fem", "ff", "Fr",
    "Fr", "Frl", "FS", "Fut", "g", "geb", "gegr", "gem", "Gen", "gesch", "gf", "ggf", "ggs", "ggü", "GK",
    "GPU", "grds", "HBF", "HJ", "Hr", "HS", "iA", "idR", "idS", "Imp", "inkl", "insb", "inzw", "iÜ",
    "Jan", "Jh", "Jhdt", "Jhs", "jmd", "jmdm", "jmdn", "jmds", "Jul", "Jun", "jährl", "KA", "Kal",
    "Kap", "kath", "kg", "KH", "km", "kompl", "Konj", "kw", "led", "LAN", "LH", "LJ", "LK", "m",
    "Mai", "Mask", "maW", "max", "mE", "med", "Mi", "min", "Mio", "mM", "mm", "mMn", "Mo", "Mr", "Mrd",
    "Mrs", "Ms", "mtl", "mW", "Mwst", "Mär", "Neutr", "Nom", "Nov", "Nr", "NS", "oa", "og", "Okt",
    "oä", "OG", "OP", "P", "PC", "Part", "PDF", "Pers", "PF", "PK", "Pkt", "Pl", "Pos", "Poss", "Pron",
    "Präp", "RAM", "Rel", "resp", "RGB", "Sa", "Sachgeb", "sek", "Sep", "Sg", "Sing", "So", "SS", "SSD",
    "SST", "SSW", "St", "std", "Tab", "taus", "Tel", "trad", "Tsd", "tägl", "ua", "UG", "ugs",
    "unverh", "urspr", "usw", "uU", "uvm", "uä", "va", "verh", "verw", "vgl", "vh", "vllt", "vlt",
    "vs", "Wdh", "wg", "WK", "zb", "Ziff", "zit", "zT", "zz", "zzgl", "zzzt"
}

# URL suffixes (top-level domains); a period directly before one of these is
# treated as part of a web address rather than as a sentence end.
url_suffixes = {"com", "net", "org", "info", "biz", "de", "co", "uk", "us", "fr", "es", "it", "ru", "cn",
                "au", "ca", "jp", "in", "br", "za", "kr", "mx", "tech", "io", "xyz", "online", "store",
                "site", "me", "ai", "dev"}

# Regex source string: any abbreviation at a word boundary, optional
# whitespace, then a literal period.
# NOTE(review): nothing in the visible notebook references this name;
# protect_patterns assembles its own abbreviation pattern. Confirm whether
# it is still needed.
abbreviation_regex = r'\b(?:' + '|'.join(re.escape(abbrev) for abbrev in abbreviations) + r')\s*\.'

# Patterns that depend on the abbreviation / URL-suffix sets are compiled once
# here instead of being re-assembled for every row. The alternatives are
# sorted (longest first) so the pattern text is deterministic; plain set
# iteration order is not.
_ABBREVIATION_ALTERNATION = '|'.join(
    re.escape(abbrev)
    for abbrev in sorted(abbreviations, key=lambda item: (-len(item), item))
)
# Group 1: digits of an optional preceding ordinal ("3" in "3 . Abs . 2"),
# group 2: the abbreviation, group 3: the word following the abbreviation.
_NUMBERED_ABBREVIATION_PATTERN = re.compile(
    r'(?:(\d+)\s\.\s)?(\b(?:' + _ABBREVIATION_ALTERNATION + r'))\s\.\s(\w+)',
    # Make abbreviation matching case-insensitive
    flags=re.IGNORECASE,
)
_URL_SUFFIX_PATTERN = re.compile(
    r'\.\s*('
    + '|'.join(
        re.escape(suffix)
        for suffix in sorted(url_suffixes, key=lambda item: (-len(item), item))
    )
    + r')\b',
    # Make matching case-insensitive
    flags=re.IGNORECASE,
)


# Function to protect patterns in text
def protect_patterns(text):
    """Replace periods that do not end a sentence with protection markers.

    The input is expected to be tokenized so that periods are surrounded by
    whitespace (e.g. "2 . Staffel"). Periods judged not to end a sentence are
    replaced by ``[PROTECT]``; a period after an abbreviation that is judged
    to end the sentence is replaced by ``[PROTECT_END]``. The rules run in
    order, and later rules see the output of earlier ones.

    Args:
        text: The question text. Non-string values (e.g. NaN from an empty
            CSV cell) are returned unchanged.

    Returns:
        The text with protection markers inserted.
    """
    # Empty cells arrive as float NaN; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return text

    # Rule 1 (disabled): Special case for "gutefrage . net" (case-insensitive)
    #text = re.sub(r'\bgutefrage\s\.\snet\b', r'gutefrage[PROTECT]net', text, flags=re.IGNORECASE)

    # Rule 2: Protect numbers in enumerations (1 . zuerst das 2 . dann das 3 . dann das)
    # The final number is captured inside group 1 so that it is kept in the
    # text; only the period that follows it is replaced.
    # NOTE(review): only the last marker of a run is protected here; earlier
    # markers are left to the rules below. Confirm this is intended.
    text = re.sub(r'((?:\d+\s\.\s[^\d.]+)+\s\d+)\s\.\s', r'\1[PROTECT] ', text)

    # Rule 3: Protect ordinal numbers followed by nouns (2 . Staffel, 10. Hochzeitstag)
    def ordinal_handler(match):
        number, following_word = match.group(1), match.group(2)
        if is_noun(following_word):
            return f"{number}[PROTECT]{following_word}"
        return match.group(0)

    text = re.sub(r'(\d+)\s\.\s(\w+)', ordinal_handler, text)

    # Rule 4: Protect numbers followed by closed brackets (1 . ) = enumerations)
    text = re.sub(r'(\d+)\s\.\s*\)', r'\1[PROTECT])', text)

    # Rule 5: Protect ordinal numbers followed by acronyms (3 . OG)
    text = re.sub(r'(\d+)\s\.\s([A-ZÄÖÜ]{2,})', r'\1[PROTECT]\2', text)

    # Rule 6: Protect abbreviations
    def abbreviation_handler(match):
        # Digits of the number preceding the abbreviation, or None
        preceding_number = match.group(1)
        # Abbreviation (e.g., etc)
        abbreviation = match.group(2)
        # Word following the abbreviation
        next_word = match.group(3)

        # If preceded by a number, treat as non-sentence-ending (3 . Abs .).
        # Both the period after the number and the one after the abbreviation
        # are consumed by the match, so each is replaced by exactly one marker.
        if preceding_number is not None:
            return f"{preceding_number}[PROTECT]{abbreviation}[PROTECT]{next_word}"

        # If the next word is lowercase or a noun, treat as non-sentence-ending
        # (Filtern die wirklich Bakterien etc . raus oder ist das Bauernfängerrei ?)
        if next_word.islower() or is_noun(next_word):
            return f"{abbreviation}[PROTECT]{next_word}"

        # If the next word is uppercase and not a noun, treat as sentence-ending
        return f"{abbreviation}[PROTECT_END]{next_word}"

    text = _NUMBERED_ABBREVIATION_PATTERN.sub(abbreviation_handler, text)

    # Rule 7: Protect standalone characters followed by periods (abbreviations that are already split up)
    text = re.sub(r'\b([a-zA-Z])\s\.', r'\1[PROTECT]', text)

    # Rule 8: Protect dynamic date formats (20 . 12 . 2020 or 20 . 12 .)
    text = re.sub(r'(\d{1,2})\s*\.\s*(\d{1,2})\s*\.\s*(\d{2,4})', r'\1[PROTECT]\2[PROTECT]\3', text)
    text = re.sub(r'(\d{1,2})\s*\.\s*(\d{4})', r'\1[PROTECT]\2', text)  # Handles "3 . 2021"

    # Rule 9: Protect ellipses (three periods after one another)
    text = re.sub(r'\.\s*\.\s*\.', lambda m: m.group(0).replace('.', '[PROTECT]'), text)

    # Rule 10: Protect periods before URL suffixes
    # NOTE(review): because matching is case-insensitive and allows whitespace
    # after the period, this also matches a period followed by an ordinary
    # word that equals a suffix (". Es", ". In", ". it"). Verify against the
    # review samples whether such sentence boundaries get lost.
    text = _URL_SUFFIX_PATTERN.sub(r'[PROTECT]\1', text)

    return text

# Run the protection rules over every question, showing a tqdm progress bar
data['question_text'] = data['question_text'].progress_apply(protect_patterns)

# Check-in: summarize the protected data and store a sample for review
log_data_summary(data, step_name="Pattern Protection")
save_sample(data, step_name="pattern_protection")


---
* Define a helper that removes the protection labels and puts the periods back, so the text matches the original punctuation

---

In [None]:
# Define function
def restore_protection(text):
    """Turn protection markers back into periods and normalize whitespace.

    Both ``[PROTECT]`` and ``[PROTECT_END]`` become a period padded with one
    space on each side; afterwards every run of two or more whitespace
    characters is collapsed into a single space.

    Args:
        text: A string that may contain protection markers.

    Returns:
        The text with the markers replaced and whitespace runs collapsed.
    """
    for marker in ('[PROTECT]', '[PROTECT_END]'):
        text = text.replace(marker, ' . ')

    # The padding above can create double spaces; squeeze them again
    return re.sub(r'\s{2,}', ' ', text)

---
* Split sentences and filter for questions to then extract questions

---

In [None]:
# Define function
def extract_questions(text, language="german"):
    """Split ``text`` into sentences and return those containing a '?'.

    Protection markers are turned back into periods after the split, so the
    returned questions carry their original punctuation.

    Args:
        text: Question text, typically already processed by protect_patterns.
            Non-string values (e.g. NaN from an empty cell) yield an empty
            list.
        language: Name of the NLTK Punkt model used for sentence splitting.
            Defaults to "german" because the dataset is German;
            sent_tokenize would otherwise fall back to its English model.

    Returns:
        A list of stripped sentences that contain a question mark.
    """
    # Split sentences
    from nltk.tokenize import sent_tokenize

    if not isinstance(text, str):
        return []

    sentences = sent_tokenize(text, language=language)

    # Restore protected periods
    restored_sentences = (restore_protection(sentence) for sentence in sentences)

    # Filter for questions
    return [sentence.strip() for sentence in restored_sentences if '?' in sentence]

---
* Assign individual identifier to each extracted question

---

In [None]:
# Define function
def _letter_suffix(index):
    """Return the spreadsheet-style label for a zero-based index (0 -> A, 25 -> Z, 26 -> AA)."""
    label = ""
    remaining = index + 1
    while remaining > 0:
        remaining, offset = divmod(remaining - 1, 26)
        label = chr(65 + offset) + label
    return label


def assign_individual_ids(row):
    """Extract the questions of one dataset row and give each its own id.

    Args:
        row: Mapping-like row with the keys 'question_text' and
            'question_id'.

    Returns:
        A list with one dict per extracted question, holding the original
        'question_id', a 'question_id_individual' of the form
        ``<question_id>__<LETTERS>``, and the extracted 'question_text'.
        The list is empty if the row contains no question.
    """
    questions = extract_questions(row['question_text'])
    question_id = row['question_id']

    return [
        {
            'question_id': question_id,
            # add A, B, C ... according to number of available questions;
            # after Z the labels continue with AA, AB, ... instead of running
            # into the non-letter characters that follow 'Z' in ASCII
            'question_id_individual': f"{question_id}__{_letter_suffix(i)}",
            'question_text': question
        }
        for i, question in enumerate(questions)
    ]

---

* Process the entire dataset
---

In [None]:
import nltk
# Fetch the Punkt sentence tokenizer data used for sentence splitting
nltk.download('punkt_tab')
from tqdm import tqdm

# Initialize tqdm for progress bar
tqdm.pandas(desc="Processing Rows")

# Extract questions with progress tracking
# (thin wrapper around assign_individual_ids; the loop below calls
# assign_individual_ids directly)
def process_row_with_progress(row):
    return assign_individual_ids(row)

# Process rows and extend extracted questions
extracted_questions = []
for _, row in tqdm(data.iterrows(), total=len(data), desc="Extracting Questions"):
    extracted_questions.extend(assign_individual_ids(row))

# Convert to DataFrame. The columns are named explicitly so that the frame
# still has a 'question_text' column when no question was extracted at all;
# otherwise the check-in below fails with a KeyError on an empty frame.
extracted_df = pd.DataFrame(
    extracted_questions,
    columns=['question_id', 'question_id_individual', 'question_text'],
)

# Check-in
log_data_summary(extracted_df, "Question Extraction")
save_sample(extracted_df, "question_extraction")

---

* Remove duplicates one more time

---

In [None]:
# Collect every row whose 'question_text' occurs more than once
# (keep=False marks all occurrences, including the one that is kept below)
duplicates = extracted_df.loc[
    extracted_df.duplicated(subset=['question_text'], keep=False)
]

# Store these rows in a separate file so the removal can be reviewed
duplicates.to_csv('duplicates_removed.csv', encoding='utf-8', index=False)
print(f"Duplicates saved to 'duplicates_removed.csv': {len(duplicates)} rows")

# Keep only the first occurrence of each question text and renumber the index
extracted_df.drop_duplicates(subset=['question_text'], inplace=True)
extracted_df.reset_index(inplace=True, drop=True)

# Store a review sample and print the summary for the deduplicated data
save_sample(extracted_df, step_name="after_second_deduplication")
log_data_summary(extracted_df, step_name="Second Deduplication")

---
* Save to file after final check

---

In [None]:
# Write the deduplicated questions to a UTF-8 CSV file (index column omitted)
extracted_df.to_csv(
    'extracted_questions.csv',
    encoding='utf-8',
    index=False,
)
print("Extracted questions saved to 'extracted_questions.csv'")

---
(after manual annotation)
* Match annotated questions with their original context for subsequent factor analysis

---

In [None]:
import pandas as pd

# File paths
file_1_path = "cleaned_dataset_new.csv"
file_2_path = "representative_sample_10000(5-20).csv"
output_file_path = "representative_sample_with_context.csv"

# Load the datasets (file 1 is comma-separated, file 2 semicolon-separated)
file_1 = pd.read_csv(file_1_path, delimiter=",")
file_2 = pd.read_csv(file_2_path, delimiter=";")

# Rename the context column BEFORE merging. If file 2 already has its own
# "question_text" column, merging first would produce "question_text_x" /
# "question_text_y", and a rename of "question_text" afterwards would
# silently do nothing.
context = file_1[['question_id', 'question_text']].rename(
    columns={'question_text': 'question_text_context'}
)

# Merge on "question_id"; the left join keeps every row of file 2 and adds
# the original text as "question_text_context".
# NOTE(review): assumes "question_id" is unique in file 1; duplicate ids
# there would multiply rows of file 2. Verify against the data.
merged = file_2.merge(
    context,
    on="question_id",
    how="left"
)

# Save the updated file
merged.to_csv(output_file_path, index=False, sep=";")
print(f"Updated file saved to '{output_file_path}'.")