In [16]:
# generate_sops.ipynb
"""
Fixed SOP generator:
 - robustly scrapes RePEc/IDEAS for top econ depts (extracts university names)
 - writes universities.xlsx with 30 university NAMES (10 from top30, 10 from 31-60, 10 from 61-90)
 - writes research_areas.xlsx (3 areas), journals.xlsx (3 per area), skills.xlsx
 - creates a docx template suitable for docxtpl and generates 90 .docx files
"""
import re
import os
from pathlib import Path
import requests
from bs4 import BeautifulSoup
import pandas as pd
from docx import Document as DocxDocument
from docxtpl import DocxTemplate
from docx.shared import Pt

# All generated artifacts (Excel lists, template, SOP documents, README) are
# written under this folder inside the current user's home directory.
HOME = Path.home()
OUT_DIR = HOME / "HW_School_Application"
# Created as soon as this module/cell runs; exist_ok=True makes re-runs harmless.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Request headers passed to requests.get() by fetch().
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; SOP-Generator/1.0)"}
# Ranking page downloaded and parsed by scrape_repec_universities().
REPEC_URL = "https://ideas.repec.org/top/top.econdept.html"

# ---------- Utilities ----------
def fetch(url, timeout=20):
    """Download ``url`` and return the response body as text.

    The request is sent with the module-level ``HEADERS``. ``timeout`` is
    forwarded to ``requests.get``. ``raise_for_status()`` is called on the
    response before the text is returned, so an HTTP error status raises
    instead of returning an error page.
    """
    response = requests.get(url, timeout=timeout, headers=HEADERS)
    response.raise_for_status()
    body = response.text
    return body

def clean_univ_text(full_text: str) -> str:
    """
    Extract the university name from an institution label.

    From "Department of Economics, Harvard University" -> "Harvard University"
    If no comma, return the original (e.g., "Paris School of Economics")
    Parenthesised notes such as "(USA)" or "(MIT)" are removed.

    Returns the last non-empty comma-separated chunk with whitespace
    normalized, or "" when nothing usable remains.
    """
    text = full_text.strip()
    if not text:
        return text
    # Remove parenthesised notes BEFORE splitting so that a comma inside them
    # (e.g. "(Cambridge, MA)") is not mistaken for the department/university
    # separator. Replace with a space so the neighbouring words do not fuse.
    text = re.sub(r"\(.*?\)", " ", text)
    # Normalize whitespace inside every comma-separated chunk.
    chunks = [" ".join(part.split()) for part in text.split(",")]
    # Ignore empty chunks so a trailing comma does not yield an empty name.
    non_empty = [chunk for chunk in chunks if chunk]
    return non_empty[-1] if non_empty else ""

# ---------- Scrape RePEc / IDEAS ----------
def scrape_repec_universities(limit=90):
    """Return up to ``limit`` distinct institution labels from the IDEAS ranking page.

    The page at ``REPEC_URL`` is downloaded with ``fetch`` and every anchor
    whose href contains 'edirc.repec.org' is considered. Labels matching the
    department/school keyword pattern are taken first, in page order and
    without duplicates. If that yields fewer than ``limit`` labels, the
    remaining anchor texts are appended (again in page order, skipping
    duplicates and empty texts) until ``limit`` is reached.
    """
    page = BeautifulSoup(fetch(REPEC_URL), "html.parser")
    # Institution links point at edirc.repec.org (per the original note they
    # appear in ranking order on the page).
    links = page.find_all('a', href=lambda h: h and 'edirc.repec.org' in h)
    labels = [link.get_text(separator=" ", strip=True) for link in links]

    # Keyword filter that reduces unrelated links.
    keyword = re.compile(
        r'\b(Econom|Department|School|Faculty|Institute|Center|Centre|Dipartimento|Facult)',
        re.I,
    )

    unique = []

    def absorb(candidates):
        # Append unseen, non-empty labels in order; stop once the limit is hit.
        for label in candidates:
            if label and label not in unique:
                unique.append(label)
            if len(unique) >= limit:
                return

    absorb([label for label in labels if label and keyword.search(label)])
    # Not enough filtered entries: fall back to the plain (less filtered) anchor texts.
    if len(unique) < limit:
        absorb(labels)
    return unique[:limit]

# ---------- Create Excel lists ----------
def make_university_list_from_repec():
    """Build the list of 30 university names and write it to universities.xlsx.

    Scrapes up to 90 institution labels, reduces each to a university name
    with ``clean_univ_text`` and then picks 10 names from each ranking tier
    (positions 1-30, 31-60, 61-90).

    Names are de-duplicated *while* selecting (not afterwards), and empty
    names are skipped, so a university that appears twice (e.g. two
    departments of the same school) no longer shrinks the result below 30.
    If a tier cannot supply 10 unused names, the gap is filled with the
    best-ranked names that have not been picked yet.

    Returns:
        The selected university names in selection order. The list is shorter
        than 30 only when fewer than 30 distinct names were scraped; a warning
        is printed in that case.
    """
    raw_top90 = scrape_repec_universities(90)
    if len(raw_top90) < 90:
        print("Warning: less than 90 items scraped from IDEAS; got", len(raw_top90))
    # extract university names
    cleaned_univs = [clean_univ_text(x) for x in raw_top90]

    # Names already selected by any tier; shared so tiers never repeat a name.
    chosen = set()

    def pick_section(lst, start, end, count):
        picked = []
        # Prefer the tier's own slice; fall back to the whole list in rank order.
        for name in lst[start:end] + lst:
            if len(picked) >= count:
                break
            if name and name not in chosen:
                chosen.add(name)
                picked.append(name)
        return picked

    # 10 from top 30 (indices 0..29), 10 from 31..60 (30..59), 10 from 61..90 (60..89)
    top30_10 = pick_section(cleaned_univs, 0, 30, 10)
    top60_10 = pick_section(cleaned_univs, 30, 60, 10)
    top90_10 = pick_section(cleaned_univs, 60, 90, 10)
    final = top30_10 + top60_10 + top90_10
    if len(final) < 30:
        print("Warning: only", len(final), "distinct university names available (expected 30)")

    # write to excel (column 'university')
    df_uni = pd.DataFrame({"university": final})
    df_uni.to_excel(OUT_DIR / "universities.xlsx", index=False)
    print("Wrote universities.xlsx with", len(final), "rows. Sample:")
    print(df_uni.head(15).to_string(index=False))
    return final

# ---------- Research areas, journals, skills (step4-6) ----------
# Per assignment pick 3 research areas from SCMOR categories: use these 3 by default
# Research areas used for the generated statements (one document per
# university/area pair).
RESEARCH_AREAS = [
    "Economics",
    "Finance",
    "Information Management",
]

# Three journals per research area, keyed by the area name used above.
TOP_JOURNALS = {
    "Economics": [
        "American Economic Review",
        "Econometrica",
        "Quarterly Journal of Economics",
    ],
    "Finance": [
        "Journal of Finance",
        "Review of Financial Studies",
        "Journal of Financial Economics",
    ],
    "Information Management": [
        "MIS Quarterly",
        "Information Systems Research",
        "Journal of Management Information Systems",
    ],
}

# Skills listed in every statement and written to skills.xlsx.
SKILLS = [
    "C/C++",
    "Python",
    "R",
    "SQL",
    "LaTeX",
    "Machine Learning",
    "Mathematics",
    "PowerBI",
    "Tableau",
]

def write_aux_excels(research_areas, journals_map, skills_list):
    """Write research_areas.xlsx, journals.xlsx and skills.xlsx into ``OUT_DIR``.

    journals.xlsx gets one row per research area with the columns
    ``research_area``, ``journal1``, ``journal2``, ``journal3``; at most the
    first three journals of ``journals_map[area]`` are used and missing ones
    are written as empty strings.
    """
    areas_frame = pd.DataFrame({"research_area": research_areas})
    areas_frame.to_excel(OUT_DIR / "research_areas.xlsx", index=False)

    journal_columns = ("journal1", "journal2", "journal3")
    journal_rows = []
    for area in research_areas:
        top_three = list(journals_map.get(area, [])[:3])
        # Pad with blanks so every row has all three journal columns.
        top_three += [""] * (len(journal_columns) - len(top_three))
        record = {"research_area": area}
        record.update(zip(journal_columns, top_three))
        journal_rows.append(record)
    pd.DataFrame(journal_rows).to_excel(OUT_DIR / "journals.xlsx", index=False)

    skills_frame = pd.DataFrame({"skill": skills_list})
    skills_frame.to_excel(OUT_DIR / "skills.xlsx", index=False)
    print("Wrote research_areas.xlsx, journals.xlsx, skills.xlsx")

# ---------- Create docx template (docxtpl-friendly) ----------
# Location of the docx template: default save target of create_docx_template()
# and the file generate_documents() loads for every rendered document.
TEMPLATE_DOCX = OUT_DIR / "sop_template.docx"

def create_docx_template(path=TEMPLATE_DOCX):
    """Create the statement-of-purpose template and save it to ``path``.

    The document uses 12 pt Times New Roman for the 'Normal' style and
    consists of the text paragraphs below, each separated by one empty
    paragraph. Placeholders ({{ name }}, {{ program }}, {{ university }},
    {{ research_area }}, {{ journals }}, {{ skills }}) are left in the text
    for docxtpl to fill in.
    """
    # Keep each paragraph as a single string containing any placeholders
    body_paragraphs = [
        "Dear Admission Committee,",
        "My name is {{ name }}, and I am pleased to apply for the {{ program }} at {{ university }}.",
        (
            "In my free time, I enjoy reading top-tier academic research to stay updated with the latest advancements in {{ research_area }}. "
            "I occasionally study articles from leading journals such as {{ journals|join(', ') }}. "
            "This habit deepens my understanding of theoretical and empirical approaches and sharpens my critical analysis skills."
        ),
        (
            "My objective is to pursue research in {{ research_area }}. "
            "To achieve this goal, I have developed practical skills including: {{ skills|join(', ') }}."
        ),
        (
            "I am particularly drawn to {{ university }} because of its strong academic environment and research-oriented approach in {{ research_area }}. "
            "I believe the program will allow me to grow as a researcher and contribute to the department."
        ),
        (
            "Thank you for considering my application. "
            "I am eager to contribute to and benefit from the rigorous academic culture at {{ university }}."
        ),
        "Sincerely,",
        "{{ name }}",
    ]

    document = DocxDocument()
    normal_style = document.styles['Normal']
    normal_style.font.name = 'Times New Roman'
    normal_style.font.size = Pt(12)

    for position, text in enumerate(body_paragraphs):
        # An empty paragraph goes between consecutive text paragraphs.
        if position:
            document.add_paragraph("")
        document.add_paragraph(text)

    document.save(path)
    print("Saved template:", path)

# ---------- Generate documents using docxtpl ----------
def generate_documents(universities, research_areas, journals_map, skills_list, applicant_info):
    """Render one .docx per (university, research area) pair into ``OUT_DIR``.

    Args:
        universities: University names; each is inserted into the text and,
            in sanitised form, into the file name.
        research_areas: Research area names; one document per area and university.
        journals_map: Mapping of research area -> list of journals; at most
            the first three are passed to the template.
        skills_list: Skills passed to the template as ``skills``.
        applicant_info: Mapping that may provide ``name`` and ``program``;
            defaults are used for missing keys.

    Returns:
        List of the paths of the generated documents.
    """
    def _safe(text):
        # Keep only characters that are harmless in a file name.
        return "".join(c for c in text if c.isalnum() or c in " _-").strip()

    # Applicant values do not change between documents; look them up once.
    applicant_name = applicant_info.get("name", "Xinyu Liu")
    program = applicant_info.get("program", "Master program")

    out = []
    for uni in universities:
        uni_part = _safe(uni)[:50]
        for ra in research_areas:
            # load a fresh template every iteration to avoid side-effects
            tpl = DocxTemplate(TEMPLATE_DOCX)
            ctx = {
                "name": applicant_name,
                "program": program,
                "university": uni,
                "research_area": ra,
                "journals": journals_map.get(ra, [])[:3],
                "skills": skills_list,
            }
            # Sanitise the research area too: a '/' or similar character
            # would otherwise end up in the output path.
            filename = f"SOP_{uni_part}_{_safe(ra).replace(' ', '_')}.docx"
            out_path = OUT_DIR / filename
            # autoescape=True: values are inserted into the document XML, so
            # characters such as '&', '<' or '>' (e.g. "Texas A&M University")
            # must be escaped or the resulting .docx is invalid.
            tpl.render(ctx, autoescape=True)
            tpl.save(out_path)
            out.append(out_path)
    print("Generated", len(out), "docx files in", OUT_DIR)
    return out

# ---------- Main ----------
def main():
    """Run the whole pipeline: lists, template, documents and README.

    Steps, in order: build universities.xlsx, write the research-area /
    journal / skill workbooks, (re)create the docx template, render one
    document per university and research area, and finally write README.txt
    with the number of generated documents.
    """
    # University names come from the scraped ranking.
    universities_30 = make_university_list_from_repec()

    # Auxiliary workbooks are written from the module-level defaults.
    write_aux_excels(RESEARCH_AREAS, TOP_JOURNALS, SKILLS)

    # The template is always overwritten so it matches the current code.
    create_docx_template(TEMPLATE_DOCX)

    # One document per university and research area.
    applicant = dict(name="Xinyu Liu", program="Master of Finance")
    all_docs = generate_documents(
        universities_30,
        RESEARCH_AREAS,
        TOP_JOURNALS,
        SKILLS,
        applicant,
    )

    # README summarising the folder contents.
    readme_template = (
        "This folder was generated by generate_sops.ipynb\n"
        "Contains:\n - universities.xlsx (30 universities)\n - research_areas.xlsx, journals.xlsx, skills.xlsx\n - sop_template.docx\n - {} generated .docx files\n"
    )
    readme_path = OUT_DIR / "README.txt"
    readme_path.write_text(readme_template.format(len(all_docs)))
    print("Done. Check", OUT_DIR)

# Run the pipeline only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


Wrote universities.xlsx with 30 rows. Sample:
                           university
                   Harvard University
Massachusetts Institute of Technology
    University of California-Berkeley
                University of Chicago
            Paris School of Economics
                 Princeton University
                  Stanford University
                      Yale University
         Toulouse School of Economics
                    Oxford University
              University of Cambridge
                       Boston College
                University of Toronto
                University of Warwick
                   Cornell University
Wrote research_areas.xlsx, journals.xlsx, skills.xlsx
Saved template: /Users/rhae/HW_School_Application/sop_template.docx
Generated 90 docx files in /Users/rhae/HW_School_Application
Done. Check /Users/rhae/HW_School_Application
