In [33]:
from pdfminer.high_level import extract_text
from pathlib import Path
import re

In [34]:
# Location of the source PDF, relative to the notebook's working directory.
PDF_FILE = Path("..") / "data" / "amplitud.pdf"

In [35]:
# Extract the raw text of the PDF and reduce it to a list of relevant lines.
text = extract_text(PDF_FILE)

# Remove blank lines and lines with only a number, in a single pass.
# ("".isnumeric() is False, so one combined filter is equivalent to two passes.)
text = "\n".join(
    line
    for line in text.split("\n")
    if line.strip() and not line.strip().isnumeric()
)

# Skip the first two remaining lines (presumably the document header -- TODO
# confirm) and drop everything from the footnotes onwards: stop at the first
# line that mentions "notas" (case-insensitive) or contains "(*) ".
lines = []
for line in text.split("\n")[2:]:
    stripped = line.strip()
    if "notas" in stripped.lower() or "(*) " in stripped:
        break
    lines.append(line)

# Remove "<number> cr" (EOL) lines
credits_pattern = re.compile(r"^\d+ cr$")
lines = [line for line in lines if not credits_pattern.match(line.strip())]

# If a line starts with a course code such as "ICS1323" (three capital letters
# followed by three or more digits), keep only the code and strip the rest of
# the line. The code must be followed by whitespace or the end of the line; the
# previous pattern was anchored with "$", so it only matched lines that were
# already a bare code and never stripped any trailing text.
course_pattern = re.compile(r"^[A-Z]{3}\d{3,}(?=\s|$)")
lines = [
    code.group(0) if (code := course_pattern.match(line.strip())) else line
    for line in lines
]

In [36]:
# Sanity check: number of lines kept after the cleaning steps.
len(lines)

521

In [37]:
# Show the cleaned lines and persist them to a text file.
cleaned_text = "\n".join(lines)
print(cleaned_text)
# Write to file. The text contains accented characters, so the encoding is
# pinned to UTF-8 instead of relying on the platform's locale default.
with open("../data/amplitud.txt", "w", encoding="utf-8") as f:
    f.write(cleaned_text)

Departamento de Ingeniería Hidráulica y Ambiental
(N338) Ingeniería Hidráulica y Ambiental Vs.01 (Desde admisión 2020)
19.1 Mínimos
ICH1104
ICH2204
ICH2114
ICH2304
ICH2314
Mecánica de Fluidos
Hidrología
Ingeniería Hidráulica
Ingeniería Ambiental
Calidad del Agua
19.3 Optativos Complementarios (b)
ICM2223
IIQ2363
ICH2604
ICH2613
ICE2623
GEO202
Transferencia de Calor
Residuos Sólidos y Peligrosos
Principios de tratamiento de aguas
CMD Cambio Climático: un Enfoque Multidisciplinario 
Introducción a la Geología Física
Sistemas de Información Geográfica 
Departamento de Ciencias de la Computación
(N776) Programación Vs.01
19.1 Mínimos
IIC1253
IIC2133
IIC2143
IIC2233
IIC2413
Matemáticas Discretas
Estructuras de Datos y Algoritmos
Ingeniería de Software
Programación Avanzada
Bases de Datos
19.3 Optativos Complementarios (b)
IIC2343
IIC2713
IIC2513
IIC2613
IIC2333
IIC2283
Arquitectura de Computadores
Sistemas de Información
Tecnologías y Aplicaciones Web
Inteligencia Artificial
Sistemas Operat

In [38]:
import json


minors: dict[str, dict[str, list[str]]] = {}
# Minor -> Req. Category -> (opt.) Choose Between -> Courses

# Minor titles are: "(<letter><numbers>) <title>"
minor_pattern = re.compile(r"^\([A-Z]\d+\) .+$")
# Requirement categories are: "<numbers>.<numbers> <title>"
category_pattern = re.compile(r"^\d+\.\d+ .+$")
# Courses are: three capital letters, a digit, then two or more digits/capital letters
course_pattern = re.compile(r"^[A-Z]{3}\d(\d|[A-Z]){2,}$")
# Choose between are: "Elegir entre*"
choose_between_pattern = re.compile(r"^Elegir .+$")

# Build the minors dictionary
current_minor = None
current_category = None
reading_courses = False
for i, line in enumerate(lines):
    line = line.strip()

    if minor_pattern.match(line):
        # A new minor starts: reset the per-minor state.
        reading_courses = False
        current_minor = line
        current_category = None
        minors[current_minor] = {}
        continue

    if current_minor is None:
        # Lines seen before the first minor title belong to no minor. Skipping
        # them avoids indexing minors[None] or testing `"Innova" in None`.
        continue

    if category_pattern.match(line):
        current_category = line
        reading_courses = True
        # Check for a choose between on the following line. The look-ahead is
        # bounds-checked (the category may be the very last line) and stripped
        # like every other line, so surrounding whitespace neither defeats the
        # match nor leaks into the dictionary key.
        next_line = lines[i + 1].strip() if i + 1 < len(lines) else ""
        if choose_between_pattern.match(next_line):
            current_category += " " + next_line
        minors[current_minor][current_category] = []
    elif choose_between_pattern.match(line):
        # Already merged into the category key above (when it directly follows
        # a category header); nothing to store for the line itself.
        reading_courses = True
    elif course_pattern.match(line) or (reading_courses and "Innova" in current_minor):
        # Either a course code, or -- only for minors whose title contains
        # "Innova" -- a free-text line that is stored as-is (presumably these
        # minors list courses without codes; verify against the PDF).
        reading_courses = True
        if current_category is None:
            # Entry found before any category header of this minor.
            current_category = "Sin categoría"
            minors[current_minor][current_category] = []
        minors[current_minor][current_category].append(line)

# ensure_ascii=False keeps accented characters verbatim, so the file encoding
# must be explicit rather than the platform's locale default.
Path("../data/minors.json").write_text(
    json.dumps(minors, indent=4, ensure_ascii=False), encoding="utf-8"
)

minors

{'(N338) Ingeniería Hidráulica y Ambiental Vs.01 (Desde admisión 2020)': {'19.1 Mínimos': ['ICH1104',
   'ICH2204',
   'ICH2114',
   'ICH2304',
   'ICH2314'],
  '19.3 Optativos Complementarios (b)': ['ICM2223',
   'IIQ2363',
   'ICH2604',
   'ICH2613',
   'ICE2623',
   'GEO202']},
 '(N776) Programación Vs.01': {'19.1 Mínimos': ['IIC1253',
   'IIC2133',
   'IIC2143',
   'IIC2233',
   'IIC2413'],
  '19.3 Optativos Complementarios (b)': ['IIC2343',
   'IIC2713',
   'IIC2513',
   'IIC2613',
   'IIC2333',
   'IIC2283']},
 '(N207) Tecnologías de la Información Vs.01': {'19.1 Mínimo': ['IIC2233',
   'IIC2413',
   'IIC2733',
   'IIC2764',
   'IIC2713'],
  '19.3 Optativos Complementarios (b)': ['IIC2513',
   'IIC2433',
   'IIC2613',
   'IIC2985',
   'IIC2986',
   'IIC2987']},
 '(N346) Ingeniería Estructural Vs.02 (Desde admisión 2020)': {'19.1 Mínimos': ['ICE2114',
   'ICE2313',
   'ICE2006',
   'ICC2105',
   'ICE2413',
   'ICE2533',
   'ICC2454'],
  '19.3 Optativos Complementarios (b)': ['ICE2