In [None]:
import pandas as pd
import numpy as np
import pdfplumber
import re
import glob
import unicodedata
import os
from tqdm import tqdm

In [None]:
# Show every column and every row when a DataFrame is displayed
# (None disables the truncation limit).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
def slugify(text: str) -> str:
    """Turn *text* into a lowercase identifier made of ``a-z``, ``0-9`` and ``_``.

    The text is NFKD-decomposed and its combining marks are dropped (which
    strips accents), it is lowercased, every run of characters outside
    ``a-z0-9`` is replaced by a single underscore, and leading/trailing
    underscores are removed.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # Keep only base characters; combining() is non-zero for accent marks.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    lowered = ''.join(base_chars).lower()
    underscored = re.sub(r'[^a-z0-9]+', '_', lowered)
    collapsed = re.sub(r'_+', '_', underscored)
    return collapsed.strip('_')

def normalize_question(q: str) -> str:
    """Map question variants onto one canonical wording.

    Some questions embed variable content (for example a country name in
    parentheses). Any question that starts with the citizenship wording and
    ends with ``?`` (case-insensitive) is replaced by the generic form; every
    other question is returned unchanged.
    """
    canonical_citizenship = "Is this applicant a citizen of the selected country of passport?"
    # Citizenship question whose tail varies with the selected country.
    is_citizenship = re.match(
        r"^Is this applicant a citizen of the selected country of passport.*\?$",
        q,
        flags=re.IGNORECASE,
    )
    if is_citizenship is not None:
        return canonical_citizenship
    # Add further normalization patterns here if needed.
    return q

def extract_questions_answers(pdf_path: str) -> list:
    """Extract ``(normalized_question, answer)`` pairs from a PDF.

    All non-blank text lines of the document are read. A line ending in ``?``
    closes a question; the lines immediately before it are joined onto it as
    earlier fragments of the same question until a boundary is hit. The
    answer is the first following line that does not end in ``?``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``(question, answer)`` tuples in document order. The
        question has been passed through ``normalize_question``; the answer
        is ``""`` when the document ends before an answer line is found.
    """
    section_re = re.compile(r'^[A-Z][A-Za-z ]+$')
    inline_re = re.compile(r'^(.+?):')

    # 1) Read every line; a page without extractable text contributes nothing.
    with pdfplumber.open(pdf_path) as pdf:
        lines = []
        for p in pdf.pages:
            lines.extend((p.extract_text() or '').split('\n'))
    lines = [l.strip() for l in lines if l.strip()]

    qa = []
    total = len(lines)
    i = 0
    # Index of the first line not yet consumed by an earlier question/answer
    # pair. The backward scan must not cross it, otherwise the previous
    # answer (e.g. a date or free text that matches neither regex) would be
    # glued onto the front of the next question.
    first_free = 0
    while i < total:
        # 2) A line ending in '?' marks the end of a question.
        if lines[i].endswith('?'):
            # 3) Walk backwards to collect earlier fragments of this question.
            fragments = [lines[i]]
            j = i - 1
            while j >= first_free:
                prev = lines[j]
                # Stop at an inline "field: value" line, a section heading,
                # or another line that ends in '?'.
                if inline_re.match(prev) or section_re.match(prev) or prev.endswith('?'):
                    break
                fragments.append(prev)
                j -= 1
            # Fragments were collected last-to-first; restore reading order.
            fragments.reverse()
            full_q = " ".join(fragments)

            # 4) The answer is the first line after the '?' that is not
            #    itself a question line.
            k = i + 1
            while k < total and lines[k].endswith('?'):
                k += 1
            answer = lines[k] if k < total else ""

            # 5) Normalize and store.
            norm_q = normalize_question(full_q)
            qa.append((norm_q, answer))

            # 6) Resume after the answer and remember that everything up to
            #    and including it has been consumed.
            i = k + 1
            first_free = i
            continue

        i += 1

    return qa

def extract_fields_with_context(pdf_path: str) -> dict:
    """Collect inline ``field: value`` pairs from a PDF, keyed by section.

    A line made only of letters and spaces that starts with an uppercase
    letter is treated as a section heading. Each later ``field: value`` line
    is stored under ``"<section_slug>_<field_slug>"``, or just
    ``"<field_slug>"`` while no heading has been seen yet.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict mapping slugified keys to the stripped value text. When the
        same key occurs more than once, the last occurrence wins.
    """
    data = {}
    current_section = None
    section_pattern = re.compile(r'^[A-Z][A-Za-z ]+$')
    inline_pattern = re.compile(r'^(.+?):\s*(.+)$')

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Guard against a falsy extract_text() result (page without
            # text), the same way extract_questions_answers does; calling
            # .split on None would raise AttributeError.
            page_text = page.extract_text() or ''
            for raw_line in page_text.split('\n'):
                line = raw_line.strip()
                if not line:
                    continue
                # The pattern only admits letters and spaces, so a match can
                # never contain ':' or end in '?'; no extra checks needed.
                if section_pattern.match(line):
                    current_section = slugify(line)
                    continue
                m = inline_pattern.match(line)
                if m is None:
                    continue
                field, val = m.groups()
                key = slugify(field)
                if current_section:
                    key = f"{current_section}_{key}"
                data[key] = val.strip()
    return data

In [None]:
def process_application_pdfs(root_folder: str, question_map=None):
    """Build one DataFrame row per sample subfolder of *root_folder*.

    For every subdirectory, the first (alphabetically) file whose lowercase
    name contains ``application`` and ends in ``.pdf`` is parsed with
    ``extract_fields_with_context`` and ``extract_questions_answers``.

    Side effects:
        * Every file inside each subfolder is renamed to its lowercase name
          on disk.
        * ``question_map`` is updated in place with any question not seen
          before; the returned map is the same dict object that was passed.

    Args:
        root_folder: Directory containing the sample subfolders.
        question_map: Existing ``{index: question}`` dict. A new empty dict
            is used when ``None``.

    Returns:
        ``(DataFrame, question_map)``. Each row holds the inline fields,
        ``sample_id``, ``source_file`` and one ``question_<index>`` column per
        question known when the row was built (``None`` if unanswered).
    """
    if question_map is None:
        question_map = {}
    rows = []

    def _folder_order(name):
        # Numeric names first, in numeric order, then the rest alphabetically.
        # Tuples keep ints and strs from being compared directly, which
        # raises TypeError on Python 3 when both kinds are present.
        if name.isdigit():
            return (0, int(name), '')
        return (1, 0, name)

    items_list = [d for d in os.listdir(root_folder) if d != '.DS_Store']

    # Fast membership test and running index instead of rescanning the map
    # for every question.
    known_questions = set(question_map.values())
    next_idx = max(question_map, default=0) + 1

    for sample_dir in tqdm(sorted(items_list, key=_folder_order)):
        folder = os.path.join(root_folder, sample_dir)
        if not os.path.isdir(folder):
            continue

        # Rename files to lowercase so the name checks below are
        # case-insensitive.
        for fname in os.listdir(folder):
            old = os.path.join(folder, fname)
            new = os.path.join(folder, fname.lower())
            if old != new:
                os.rename(old, new)

        # Locate the application PDF; sorting makes the choice deterministic
        # when several files qualify (os.listdir order is arbitrary).
        cands = sorted(
            f for f in os.listdir(folder)
            if 'application' in f and f.endswith('.pdf')
        )
        if not cands:
            continue
        pdf_path = os.path.join(folder, cands[0])

        inline_data = extract_fields_with_context(pdf_path)
        qa_list = extract_questions_answers(pdf_path)

        # First answer given for each question, in order of first appearance.
        first_answer = {}
        for q, a in qa_list:
            first_answer.setdefault(q, a)

        # Register questions that are new to the mapping.
        for q in first_answer:
            if q not in known_questions:
                question_map[next_idx] = q
                known_questions.add(q)
                next_idx += 1

        # Build the row: inline fields, identifiers, then one column per
        # known question (None when this PDF did not contain it).
        row = dict(inline_data)
        row['sample_id'] = sample_dir
        row['source_file'] = os.path.basename(pdf_path)
        for idx, q_text in question_map.items():
            row[f'question_{idx}'] = first_answer.get(q_text)
        rows.append(row)

    df = pd.DataFrame(rows)
    return df, question_map


In [None]:
# Root paths that contain the numbered sample folders.

# Parse the granted applications; the question map starts empty here.
df_all_granted, question_map_g = process_application_pdfs('data/granted/')
df_all_granted['final_visa_status'] = 'granted'

# Number of distinct questions found in the granted set. This must run before
# the next call, which adds to the same dict.
print(len(question_map_g))

# Parse the refused applications, reusing the granted question map so that
# question indices stay aligned across both frames. process_application_pdfs
# updates the dict it receives and returns that same object, so after this
# call question_map_r and question_map_g refer to the same dict.
df_all_refused, question_map_r = process_application_pdfs('data/refused/', question_map = question_map_g)
df_all_refused['final_visa_status'] = 'refused'

In [None]:
# Number of distinct questions after processing both the granted and the
# refused folders.
len(question_map_r)

In [None]:
# Show the question text stored under index 69.
question_map_r[69]

In [None]:
# (rows, columns) of the refused frame; the trailing comment is a leftover
# column selection used for ad-hoc inspection.
df_all_refused.shape#[['question_69','question_70','question_80','question_81']]

In [None]:
# Stack granted and refused rows into a single frame.
# NOTE(review): ignore_index is not set, so each frame keeps its own index
# labels and they will repeat in df_all — confirm that is intended.
df_all = pd.concat([df_all_granted,df_all_refused])

In [None]:
# Display the combined frame.
df_all