In [None]:
# Imports
import os
import os.path as op
import re
import pandas as pd

# Constants
RAW_DATA_DIR = "raw_data"
OUTPUT_DATA_DIR = "processed_data"

# If the last part of the path of the current working directory is "data_process_notebooks", change to the parent directory (the main one for all data-related)
current_dir = os.getcwd()
if op.basename(current_dir) == "data_processing_notebooks":
    os.chdir("..")
print("Current working directory:", os.getcwd())

Current working directory: c:\Users\sande\OneDrive\Bureau\UofT\CSC2701_Communication4CS\internship-ai-assisstant\server\data


## Part 1 : PDFs

Steps:
- Extract all info from the PDFs
- Clean the text
- Merging individual pages and chunk intelligently

In [3]:
# Imports for this part
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from collections import defaultdict

  from pydantic.v1.fields import FieldInfo as FieldInfoV1
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Get a list of all pdf document names in the ProjData folder
pdf_filenames = [f for f in os.listdir(RAW_DATA_DIR) if f.endswith(".pdf")]
# For storing loaded documents
docs = []
# Extraction
for pdf_filename in pdf_filenames:
    pdf_path = op.join("raw_data", pdf_filename)
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    docs.extend(documents)

Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)
Ignoring wrong pointing object 34 0 (offset 0)
Ignoring wrong pointing object 36 0 (offset 0)
Ignoring wrong pointing object 38 0 (offset 0)
Ignoring wrong pointing object 45 0 (offset 0)
Ignoring wrong pointing object 116 0 (offset 0)
Ignoring wrong pointing object 130 0 (offset 0)
Ignoring wrong pointing object 179 0 (offset 0)
Ignoring wrong pointing object 181 0 (offset 0)
Ignoring wrong pointing object 183 0 (offset 0)
Ignoring wrong pointing object 186 0 (offset 0)
Ignoring wrong pointing object 188 0 (offset 0)
Ignoring wrong pointing object 190 0 (offset 0)
Ignoring wrong pointing object 193 0 (offset 0)
Ignoring wrong pointing object 195 0 (offset 0)
Ignoring wrong pointing object 197 0 (offset 0)
Ignoring wrong pointing object 478 0 (offset 0)
Ignoring wrong pointing object 479 0 (offset 0)


In [5]:
# Helper for text cleanup
def clean_text(text):
    # Remove multiple newlines
    text = re.sub(r'\n+', ' ', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove hyphenation at line breaks (e.g. "inter-\nview" → "interview")
    text = re.sub(r'(\w)-\s+(\w)', r'\1\2', text)
    # Strip weird bullet points or artifacts
    text = re.sub(r'•|▪|●|–|­', '', text)
    # Normalize whitespace and punctuation even more
    text = re.sub(r'\s([?.!,:;])', r'\1', text)
    # Trim spaces
    return text.strip()

# Clean up the text in each document
for doc in docs:
    doc.page_content = clean_text(doc.page_content)

In [6]:
# Merge all text per PDF
pdf_texts = defaultdict(str)
for doc in docs:
    pdf_texts[doc.metadata["source"]] += " " + doc.page_content

# Split into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
clean_pdf_docs = []
for pdf_name, text in pdf_texts.items():
    for chunk in splitter.create_documents([text]):
        chunk.metadata["source"] = pdf_name
        clean_pdf_docs.append(chunk)

In [7]:
# Inspection
for d in clean_pdf_docs[:3]:
    print(d.metadata)
    print(d.page_content)
    print("—" * 50)

{'source': 'raw_data\\Luki_Lec1.pdf'}
2701: Section 5301 Elevate your Interviewing and networking Benchmark and Level set 1 Benchmark and Level set 2 6:00 pm Introductions, Foundations for interview and networking + Benchmark 7:00 pm BREAK 7:15 pm LinkedIn +Networking 8:00 pm BREAK 8:15 pm Interview preparation 9:00 pm Thank you! Objectives: After today’s session you should  Understand how to land an internship  Have strong foundations for interviews and networking  Have a better understanding of your interview and networking
——————————————————————————————————————————————————
{'source': 'raw_data\\Luki_Lec1.pdf'}
for interviews and networking  Have a better understanding of your interview and networking levels and areas to improve Luki Danukarjanto 3  Career Coach and Educator  Former Computer Scientist turned Management Consultant (+ campus recruiting lead) turned Educator/Entrepreneur  Author and Podcast/YouTube host “SIWIKE: Stuff I Wish I Knew Earlier” This is a safe space! Plus a 

## Part 2 : Powerpoint presentations

In [8]:
# Imports
from pptx import Presentation

In [9]:
# Get a list of all ppt document names in the ProjData folder
ppt_filenames = [f for f in os.listdir(RAW_DATA_DIR) if f.endswith((".ppt", ".pptx"))]
# Helper for extraction
def extract_ppt_text(path):
    prs = Presentation(path)
    slides_text = []
    
    for i, slide in enumerate(prs.slides, start=1):
        slide_text = []
        for shape in slide.shapes:
            if hasattr(shape, "text") and shape.text.strip():
                slide_text.append(shape.text.strip())
        text = " ".join(slide_text)
        slides_text.append({
            "id": f"{os.path.basename(path)}_slide_{i}",
            "text": text,
            "metadata": {
                "source": os.path.basename(path),
                "slide_number": i,
                "type": "ppt"
            }
        })
    return slides_text
# Extraction
ppt_docs = []
for ppt_filename in ppt_filenames:
    ppt_path = op.join(RAW_DATA_DIR, ppt_filename)
    slides = extract_ppt_text(ppt_path)
    for slide in slides:
        ppt_docs.append(slide)


In [10]:
# Clean the text in each ppt document
for doc in ppt_docs:
    doc["text"] = clean_text(doc["text"])

In [11]:
# Rechunking
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
clean_ppt_docs = []
for d in ppt_docs:
    chunks = splitter.split_text(d["text"])
    for i, chunk in enumerate(chunks):
        clean_ppt_docs.append({
            "id": f'{d["id"]}_chunk_{i}',
            "text": chunk,
            "metadata": d["metadata"]
        })

In [12]:
# Inspection
for doc in clean_ppt_docs[:3]:
    print(doc["metadata"])
    print(doc["text"][:200])
    print("-"*50)

{'source': 'MScAC_JobFindingStrategies.pptx', 'slide_number': 1, 'type': 'ppt'}
Job Finding Strategies Presented by: Murtuza Rajkotwala Research and Business Development Officer September 25th, 2025 1
--------------------------------------------------
{'source': 'MScAC_JobFindingStrategies.pptx', 'slide_number': 2, 'type': 'ppt'}
Introduction Welcome to MScAC! 2
--------------------------------------------------
{'source': 'MScAC_JobFindingStrategies.pptx', 'slide_number': 3, 'type': 'ppt'}
Agenda Reminder Requirements for Internships Finding jobs through LinkedIn External Job Boards ATS and External Job Boards Web Scraping Project Tips and Tricks Workshop
--------------------------------------------------


## Part 3 : Word documents

In [13]:
# Imports
from docx import Document

In [14]:
# Get a list of all word document names in the ProjData folder
docx_filenames = [f for f in os.listdir(RAW_DATA_DIR) if f.endswith((".docx"))]
# Extraction helper
def extract_docx_text(path):
    doc = Document(path)
    full_text = []
    for para in doc.paragraphs:
        if para.text.strip():
            full_text.append(para.text.strip())
    return " ".join(full_text)
# Extract
docx_docs = []
for docx_filename in docx_filenames:
    text = extract_docx_text(op.join(RAW_DATA_DIR, docx_filename))
    docx_docs.append({
        "id": docx_filename,
        "text": text,
        "metadata": {
            "source": docx_filename,
            "type": "word"
        }
    })

In [15]:
# Clean up the text in each document
for doc in docx_docs:
    doc["text"] = clean_text(doc["text"])

In [16]:
# Rechunk
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

clean_docx_docs = []
for doc in docx_docs:
    chunks = splitter.split_text(doc["text"])
    for i, chunk in enumerate(chunks):
        clean_docx_docs.append({
            "id": f"{doc['id']}_chunk_{i}",
            "text": chunk,
            "metadata": doc["metadata"]
        })

In [17]:
# Inspection
for d in clean_docx_docs[:3]:
    print(d["metadata"])
    print(d["text"][:200])
    print("-"*50)

{'source': 'MScAC_InternshipProposalForm.docx', 'type': 'word'}
Master of Science in Applied Computing (MScAC) Program: MScAC Internship Project Proposal Form 1. Proposing Organization Include the name of the organization, size of the organization, location of the
--------------------------------------------------
{'source': 'MScAC_InternshipProposalForm.docx', 'type': 'word'}
of the main business of the organization. Enter your text here. 4. Problem Statement (about 100 words) State the main challenge the internship is designed to address. Explain why this problem is of re
--------------------------------------------------
{'source': 'MScAC_InternshipProposalForm.docx', 'type': 'word'}
hoping for during the internship. What does success look like? If appropriate highlight how this success will impact your organization. Enter your text here. 6. Methodology (about 200 words) Explain t
--------------------------------------------------


## Part 4 : MScAC calendar

In [18]:
# Imports
from ics import Calendar

In [19]:
# Get all ICS filenames
ics_filenames = [f for f in os.listdir(RAW_DATA_DIR) if f.endswith((".ics"))]
# Little check to make sure there is only one calendar, if not warn user there must only be one
if len(ics_filenames) > 1:
    print("Warning: More than one .ics file found. Please ensure there is only one calendar file. Here we only consider the 1st one.")
# Load the calendar
with open(op.join(RAW_DATA_DIR, ics_filenames[0]), "r") as f:
    calendar = Calendar(f.read())

In [20]:
# Get events
events = []
for i, event in enumerate(calendar.events):
    events.append({
        "id": f"event_{i}",
        "title": event.name,
        "start": event.begin.format('YYYY-MM-DD HH:mm'),
        "end": event.end.format('YYYY-MM-DD HH:mm'),
        "location": event.location if event.location else "",
        "description": event.description if event.description else ""
    })
# Convert to text chunks for RAG
for event in events:
    text = f"Event: {event['title']}\n"
    text += f"Date & Time: {event['start']} to {event['end']}\n"
    if event['location']:
        text += f"Location: {event['location']}\n"
    if event['description']:
        text += f"Description: {event['description']}\n"
    event["text"] = text

In [21]:
# Group events per week to create chunks
weekly_chunks = defaultdict(list)

for event in events:
    week = pd.to_datetime(event['start']).strftime("%Y-W%U")
    weekly_chunks[week].append(event["text"])

calendar_info = []
for week, texts in weekly_chunks.items():
    calendar_info.append({
        "id": f"events_{week}",
        "text": "\n---\n".join(texts),
        "metadata": {"week": week, "type": "calendar"}
    })

In [22]:
# Inspection
for d in calendar_info[:3]:
    print(d["metadata"])
    print(d["text"][:500])
    print("-"*50)

{'week': '2025-W41', 'type': 'calendar'}
Event: Movie Night Fridays [CSC2703H Y LEC5101]
Date & Time: 2025-10-17 21:00 to 2025-10-18 00:00

---
Event:  Eva and Allen Lau Commercialization Catalyst Prize for Computing & Engineering Innovation [CSC2703H Y LEC5101]
Date & Time: 2025-10-15 22:00 to 2025-10-16 00:00
Location: TBC
Description: Registration link: https://my.alumni.utoronto.ca/s/731/form-blank/index.aspx?sid=731&gid=1&pgid=25838&content_id=25265

---
Event: Industry Partner Afternoons: AMD [CSC2703H Y LEC5101]
Date & Time: 202
--------------------------------------------------
{'week': '2025-W45', 'type': 'calendar'}
Event: Movie Night Fridays [CSC2703H Y LEC5101]
Date & Time: 2025-11-14 22:00 to 2025-11-15 01:00

---
Event: Industry Partner Afternoon: ModiFace [CSC2703H Y LEC5101]
Date & Time: 2025-11-14 19:30 to 2025-11-14 22:00
Location: 9014/9016

---
Event: MScACtalks: Yonatahn Kahn [CSC2703H Y LEC5101]
Date & Time: 2025-11-11 16:00 to 2025-11-11 17:00

---
Event: ARIA [C

## Part 5 : Webpages

In [23]:
# Imports
import requests
from bs4 import BeautifulSoup

In [24]:
# Get data from URLs
URLs = [
    "https://www.cs.toronto.edu/dcs/documents/mscac/partner-guidelines/"
]
webpage_info = []
for url in URLs:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Remove scripts and styles
    for tag in soup(["script", "style", "header", "footer", "nav"]):
        tag.decompose()

    # Get the main text
    text = " ".join(soup.stripped_strings)
    # Clean the text
    text = clean_text(text)
    # Split into chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    chunks = splitter.split_text(text)
    for i, chunk in enumerate(chunks):
        webpage_info.append({
            "id": f"{re.sub(r'\\W+', '_', url)}_chunk_{i}",
            "text": chunk,
            "metadata": {
                "source": url,
                "type": "webpage"
            }
        })

In [25]:
# Inspection
for d in webpage_info[:3]:
    print(d["metadata"])
    print(d["text"][:500])
    print("-"*50)

{'source': 'https://www.cs.toronto.edu/dcs/documents/mscac/partner-guidelines/', 'type': 'webpage'}
Recruiting Guidelines and Information for Partners | MScAC | University of Toronto Recruiting Guidelines and Information for Partners Welcome to MScAC Each graduate student in the Master of Science in Applied Computing program (MScAC) takes part in an 8-month internship (May to December) that must include a substantial applied research component. The MScAC graduate program imposes a higher standard of creative or intellectual exploration than would normally be encountered in a co-op study
--------------------------------------------------
{'source': 'https://www.cs.toronto.edu/dcs/documents/mscac/partner-guidelines/', 'type': 'webpage'}
of creative or intellectual exploration than would normally be encountered in a co-op study program or a paid work experience program. Internship opportunities are usually paid and must be formally vetted by the MScAC program prior to being promoted to MS

## Part 6 : Additional MScAC-recommended web pages and resources 

In [26]:
# Build the metadata dictionary ourselves
external_resources = [
    {
        "id": "mscac_job_boards",
        "type": "reference",
        "text": (
            "The MScAC team recommends checking out the following external job posting websites (c.f. the \"Internship and Academic Supervisor Information\" page of the CSC2703 page on Quercus for more details): "
            "U of T Entrepreneurship job board, Vector Digital Talent Hub, Machine Learning Techniques Jobs, "
            "MaRS Discovery District Job Board, Communitech Work In Tech, Toronto AI Job Postings, You're Next Career Network."
        ),
        "metadata": {"category": "job_postings"}
    },
    {
        "id": "mscac_interview_training",
        "type": "reference",
        "text": (
            "For interview training, the MScAC team recommends checking out (c.f. the \"Internship and Academic Supervisor Information\" page of the CSC2703 page on Quercus for more details): "
            "Top Interview Tips video, Career Prep Series – TalentBoard, 9 Tips For How To Make Interview Small Talk - Zippia."
        ),
        "metadata": {"category": "interview_training"}
    },
    {
        "id": "mscac_technical_interviews",
        "type": "reference",
        "text": (
            "For technical interview preparation, the MScAC team recommends checking out (c.f. the \"Internship and Academic Supervisor Information\" page of the CSC2703 page on Quercus for more details): "
        ),
        "metadata": {"category": "technical_interview"}
    },
    {
        "id": "mscac_online_presence",
        "type": "reference",
        "text": (
            "For building your online presence, the MScAC team recommends checking out (c.f. the \"Internship and Academic Supervisor Information\" page of the CSC2703 page on Quercus for more details): "
            "Accelerating Your Career Via Your Online Presence video."
        ),
        "metadata": {"category": "career_development"}
    }
]

# Add the URLs for the resources
## Job boards
links0 = ["https://jobs.entrepreneurs.utoronto.ca/jobs",
          "https://talenthub.vectorinstitute.ai/",
          "https://mltechniques.com/jobs/",
          "https://techjobs.marsdd.com/jobs",
          "https://www1.communitech.ca/companies",
          "https://torontoai.org/job-postings/",
          "https://www.yourenext.ca/"
          ]
external_resources[0]["text"] += " Links: " + ", ".join(links0)
## Interview training
links1 = [
    "https://www.youtube.com/watch?v=HG68Ymazo18", 
    "https://talentboard.io/prep/",
    "https://www.zippia.com/advice/9-tips-for-interview-small-talk/"
]
external_resources[1]["text"] += " Links: " + ", ".join(links1)
## Technical interviews
page_names = ["David Stutz blog", "LeetCode", "InterviewBit", "Acing Data & AI Interviews video", "Programming Interviews Exposed book", 
            "Cornell interviews page", "HackerRank", "Firecode", "Gayle Laakmann McDowell resources", "Python Tutor", "Pramp", 
            "Vault", "Simplilearn", "Springboard", "Top Machine Learning Interview Questions - InterviewBit", "MLQuestions GitHub repository"]
links2 = [
    "https://davidstutz.de/how-i-prepared-for-deepmind-and-google-ai-research-internship-interviews-in-2019/",
    "https://leetcode.com/",
    "https://www.interviewbit.com/big-data-interview-questions/",
    "https://www.youtube.com/watch?v=Zlefd0fl7iQ",
    "https://ebookcentral-proquest-com.myaccess.library.utoronto.ca/lib/utorontocc-ebooks/detail.action?docID=5333089",
    "https://www.cs.cornell.edu/~xanda/interviews.html",
    "https://www.hackerrank.com/",
    "https://firecode.io/",
    "https://www.gayle.com/consulting",
    "https://pythontutor.com/",
    "https://www.pramp.com/#/",
    "https://www.thebalancemoney.com/top-technical-interview-questions-2061227#toc-top-50-technical-interview-questions",
    "https://vault.com/blogs/interviewing/29-technical-interview-questions-top-tech-firms-ask-internship-candidates",
    "https://www.simplilearn.com/tutorials/data-science-tutorial/data-science-interview-questions",
    "https://www.springboard.com/blog/data-science/machine-learning-interview-questions/",
    "https://www.interviewbit.com/machine-learning-interview-questions/",
    "https://github.com/andrewekhalel/MLQuestions",
]
for i, (page_name, link) in enumerate(zip(page_names, links2)):
    if i % 5 == 0 and i != 0:
        # Every 5 resources, start a new section
        external_resources[2]["text"] += f"Technical interview external resources #{i//5}: "
    external_resources[2]["text"] += f" {page_name}: {link},"
## Online presence
links3 = [
    "https://www.youtube.com/watch?v=MOn2Rsd1Oro"
]
external_resources[3]["text"] += " Link: " + ", ".join(links3)

## Part 7 : Putting everything together

The resources we have are the following: 
- PDFs : in clean_pdf_docs
- PPTs : in clean_ppt_docs
- docxs : in clean_docx_docs
- calendar : in calendar_info
- webpages : in webpage_info
- additional MScAC resources : in external_resources

#### 7.1 Putting everything into a single list

In [27]:
rag_docs = []

def add_to_rag(docs, dtype):
    for i, d in enumerate(docs):
        metadata = {}
        if isinstance(d, dict):
            metadata.update(d.get("metadata", {}))
            metadata["source"] = d.get("source", None)
        metadata["type"] = dtype

        text = d["text"] if isinstance(d, dict) and "text" in d else str(d)

        rag_docs.append({
            "id": f"{dtype}_{i}",
            "text": text,
            "metadata": metadata
        })

add_to_rag(clean_pdf_docs, "pdf")
add_to_rag(clean_ppt_docs, "ppt")
add_to_rag(clean_docx_docs, "docx")
add_to_rag(calendar_info, "calendar")
add_to_rag(webpage_info, "webpage")
add_to_rag(external_resources, "external_reference")

#### 7.2 Rechunking to make sure chunk sizes are normalized

In [28]:
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

final_rag_docs = []
for doc in rag_docs:
    for i, chunk in enumerate(splitter.split_text(doc["text"])):
        final_rag_docs.append({
            "id": f"{doc['id']}_chunk{i}",
            "text": chunk,
            "metadata": doc["metadata"]
        })

#### 7.3 Saving to disk

In [29]:
import json
import datetime

# Save to JSON, each document as a line
# Add today's date to the filename
today_str = datetime.date.today().strftime("%Y%m%d")
save_path = op.join(OUTPUT_DATA_DIR, f"rag_ready_docs_{today_str}.jsonl")
with open(save_path, "w", encoding="utf-8") as f:
    for doc in final_rag_docs:
        f.write(json.dumps(doc, ensure_ascii=False) + "\n")