In [1]:
import ast
import html
import json
import re
import time

import pandas as pd
import requests
from tqdm import tqdm

In [2]:
# Location of the step-1 course collection CSV (absolute path on the author's machine).
path = (
    "/Users/alessandromolinarroet/Desktop/programming_database_web_applications_2"
    "/database/data/tum_course_data/tum_courses_step1_collection.csv"
)
# Read the collected course list into the working DataFrame used by the next cells.
df = pd.read_csv(filepath_or_buffer=path)

In [14]:
# HTTP headers sent with every request to the course REST endpoint.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'application/json'
}
# One entry per row of `df`, in row order; becomes the 'Description' column.
descriptions = []


def _as_dict(value):
    """Return ``value`` when it is a dict, otherwise an empty dict.

    ``dict.get(key, {})`` only falls back when the key is missing; a key that
    is present with a JSON ``null`` yields ``None``. Normalising here keeps
    the chained lookups below from raising on such payloads.
    """
    return value if isinstance(value, dict) else {}


def _english_or_default(field):
    """Pick the text of a description field, preferring its English translation.

    Args:
        field: the JSON object of a description field (may be ``None``).

    Returns:
        The first non-empty translation whose ``lang`` is ``'en'``; otherwise
        the field's own ``value`` (``''`` when the key is absent).
    """
    field = _as_dict(field)
    text = field.get('value', '')
    translations = _as_dict(field.get('translations')).get('translation')
    if not isinstance(translations, list):
        translations = []
    for trans in translations:
        if not isinstance(trans, dict):
            continue
        if trans.get('lang') == 'en' and trans.get('value'):
            return trans.get('value')
    return text


def _fetch_description(session, course_url):
    """Fetch one course and build its description text.

    Args:
        session: ``requests.Session`` used to issue the GET request.
        course_url: the course URL from the 'URL' column, as a string.

    Returns:
        The combined "CONTENT"/"OBJECTIVES" text, or one of the placeholder
        strings ("ID not found in URL", "Error: Status <code>",
        "No data available", "No information available").

    Raises:
        Any exception from ``requests`` or JSON decoding; the caller records it.
    """
    course_id_match = re.search(r'courses/(\d+)', course_url)
    if not course_id_match:
        return "ID not found in URL"
    course_id = course_id_match.group(1)
    api_url = f"https://campus.tum.de/tumonline/ee/rest/slc.tm.cp/student/courses/{course_id}"

    # Fetch data from API
    response = session.get(api_url, timeout=10)
    if response.status_code != 200:
        return f"Error: Status {response.status_code}"
    data = response.json()

    resources = _as_dict(data).get('resource') or []
    if not resources:
        return "No data available"

    # Walk down to the description object, tolerating missing or null levels.
    node = resources[0]
    for key in ('content', 'cpCourseDetailDto', 'cpCourseDescriptionDto'):
        node = _as_dict(node).get(key)
    description_data = _as_dict(node)

    content = _english_or_default(description_data.get('courseContent'))
    objectives = _english_or_default(description_data.get('courseObjective'))

    parts = []
    # Texts of length <= 1 are treated as empty.
    if content and len(content) > 1:
        parts.append(f"CONTENT:\n{content}")
    if objectives and len(objectives) > 1:
        parts.append(f"OBJECTIVES:\n{objectives}")

    if parts:
        return "\n\n".join(parts)
    return "No information available"


# A single Session reuses the underlying connection across all requests.
with requests.Session() as session:
    session.headers.update(headers)

    for raw_url in tqdm(df['URL'], total=len(df), desc="Scraping TUM Descriptions"):
        course_url = str(raw_url)

        # Exactly one entry is appended per row so the list stays aligned with `df`.
        try:
            descriptions.append(_fetch_description(session, course_url))
        except Exception as e:
            print(f"Error processing {course_url}: {e}")
            descriptions.append(f"Error: {str(e)}")


df['Description'] = descriptions
df.to_csv("courses.csv", index=False)


Scraping TUM Descriptions: 100%|██████████| 3768/3768 [11:23<00:00,  5.51it/s]


In [2]:
# Reload the saved progress file and inspect how many descriptions are missing.
path = "./courses_progress.csv"
df = pd.read_csv(path)
# Only the last expression of a cell is rendered automatically, so the counts
# are printed explicitly; otherwise this result is computed and then discarded.
missing_description = df["Description"] == "No information available"
print(missing_description.value_counts())
df.head()

Unnamed: 0,Title,Semester,Description,Skills,URL
0,Dodo Alive! - Resurrecting the Dodo with Robot...,2025 W,CONTENT:\nImagine that you are at the natural ...,"['MATLAB', 'Simulink', 'AutoCAD', 'SolidWorks'...",https://campus.tum.de/tumonline/ee/ui/ca2/app/...
1,Biochemie reaktiver Sauerstoffspezies und frei...,2025 W,CONTENT:\nThe main focus of the course is on b...,"['Analytics', 'Bioconductor', 'Python', 'R', '...",https://campus.tum.de/tumonline/ee/ui/ca2/app/...
2,Instationäre hygrothermische Berechnungsverfahren,2025 W,No information available,"['ANSYS', 'AutoCAD', 'AutoCAD Civil 3D', 'ETAP...",https://campus.tum.de/tumonline/ee/ui/ca2/app/...
3,A Different Kind of Game Jam! Reflecting Scien...,2025 W,CONTENT:\nThis course offers an interdisciplin...,"['Analytics', 'Business software applications'...",https://campus.tum.de/tumonline/ee/ui/ca2/app/...
4,"A Google-Earth Perspective on Nutrition, Healt...",2025 W,No information available,"['Google', 'Earth Pro', 'GIS software', 'ArcGI...",https://campus.tum.de/tumonline/ee/ui/ca2/app/...


In [3]:
import google.generativeai as genai
from dotenv import load_dotenv
import os

# Populate the process environment from a .env file, then read the Gemini key.
load_dotenv()
API_KEY = os.environ.get("GEMINI_API_KEY")

# Stop early when the key is missing or empty.
if API_KEY is None or API_KEY == "":
    raise ValueError("GEMINI_API_KEY environment variable not set")


All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  import google.generativeai as genai


In [8]:
# Overwrite the Skills cell of every row (labels 0 .. len(df)-1) with an empty list.
# NOTE: this discards whatever the column held before.
row_count = len(df)
index = 0
while index < row_count:
    df.at[index, "Skills"] = []
    index += 1
index = row_count - 1 if row_count else index
df.head()

Unnamed: 0,Title,Semester,Description,Skills,URL
0,Dodo Alive! - Resurrecting the Dodo with Robot...,2025 W,CONTENT:\nImagine that you are at the natural ...,[],https://campus.tum.de/tumonline/ee/ui/ca2/app/...
1,Biochemie reaktiver Sauerstoffspezies und frei...,2025 W,CONTENT:\nThe main focus of the course is on b...,[],https://campus.tum.de/tumonline/ee/ui/ca2/app/...
2,Instationäre hygrothermische Berechnungsverfahren,2025 W,No information available,[],https://campus.tum.de/tumonline/ee/ui/ca2/app/...
3,A Different Kind of Game Jam! Reflecting Scien...,2025 W,CONTENT:\nThis course offers an interdisciplin...,[],https://campus.tum.de/tumonline/ee/ui/ca2/app/...
4,"A Google-Earth Perspective on Nutrition, Healt...",2025 W,No information available,[],https://campus.tum.de/tumonline/ee/ui/ca2/app/...


In [11]:
load_dotenv()
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
model = genai.GenerativeModel("gemini-2.5-flash-lite")

BATCH_SIZE = 50    # courses sent to the model in one request
DAILY_LIMIT = 100  # maximum number of requests issued by one run of this cell

# Skills Bank: one entry per non-empty line of the text file.
with open("../extraction/lists/skill_areas.txt", 'r', encoding='utf-8') as f:
    skill_list = [line.strip() for line in f if line.strip()]


def _parse_skills(value):
    """Return a Skills cell as a list.

    A DataFrame reloaded from CSV holds the column as text such as
    "['Python', 'R']" or "[]"; comparing that text to ``[]`` never matches, so
    a resumed run would find nothing left to process. Text is parsed back with
    ``ast.literal_eval``. Anything that is not a list, or cannot be parsed to
    one (including NaN), is treated as "no skills yet".
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return []
        return parsed if isinstance(parsed, list) else []
    return []


# Normalise the column so "unprocessed" means an empty list regardless of
# whether `df` came from memory or from a saved CSV.
df["Skills"] = df["Skills"].apply(_parse_skills)
unprocessed_indices = df.index[df["Skills"].map(len).eq(0)].tolist()

for i in tqdm(range(0, len(unprocessed_indices), BATCH_SIZE)):

    # Number of requests already issued in this run.
    current_request_count = i // BATCH_SIZE
    if current_request_count >= DAILY_LIMIT:
        print(f"\nDaily quota of {DAILY_LIMIT} requests reached. Save and resume tomorrow!")
        break

    batch_indices = unprocessed_indices[i : i + BATCH_SIZE]
    batch_id_set = set(batch_indices)
    batch_rows = df.loc[batch_indices]

    # One line per course: row label, title and the first 350 characters of
    # the description with newlines flattened.
    course_lines = []
    for idx, row in batch_rows.iterrows():
        desc = str(row['Description'])[:350].replace("\n", " ")
        course_lines.append(f"ID: {idx} | Title: {row['Title']} | Desc: {desc}\n---\n")
    courses_text = "".join(course_lines)

    prompt = f"""
    You are analyzing university courses.

    Task:
    For EACH course, select the skills that are MOST LIKELY TAUGHT in the course.

    Rules:
    - Use ONLY skills from the Skills Bank.
    - Select between 1 and 10 skills per course.
    - It is OK to select skills from multiple areas.
    - Do NOT invent new skills.
    - Include only skills that are reasonably certain to be taught in the course.
    - Do NOT include a skill just because it could possibly be taught.
    - It is fine to include skills mentioned casually if they are actually part of the course content.
    - Prefer fewer skills with high confidence over many skills with low confidence.

    Skills Bank:
    {", ".join(skill_list)}

    Courses:
    {courses_text}

    Output format (STRICT):
    Return a JSON object ONLY.
    - Keys: course IDs (as strings)
    - Values: list of selected skills

    Example:
    {{
    "123": ["Python", "RESTful API", "JSON"],
    "124": ["C++", "Data Structures"]
    }}
    """

    try:
        response = model.generate_content(
            prompt,
            generation_config={"response_mime_type": "application/json"}
        )
        batch_results = json.loads(response.text)
        if not isinstance(batch_results, dict):
            raise ValueError("model response is not a JSON object")

        for course_id_str, skills in batch_results.items():
            # A single malformed key must not discard the rest of the batch.
            try:
                course_id = int(course_id_str)
            except (TypeError, ValueError):
                continue
            # Only accept IDs that were part of this request, so a wrong ID in
            # the response cannot overwrite another course's skills.
            if course_id in batch_id_set:
                df.at[course_id, "Skills"] = skills if isinstance(skills, list) else []

    except Exception as e:
        # The failed batch keeps its empty lists and is picked up by a later run.
        print(f"\nBatch starting at index {batch_indices[0]} failed: {e}")
        time.sleep(10)

    # Pause between requests.
    time.sleep(5)

    # Checkpoint after every second request
    if (current_request_count + 1) % 2 == 0:
        df.to_csv("courses_progress.csv", index=False)


df.to_csv("courses_progress.csv", index=False)
print("Processing complete for today.")

100%|██████████| 73/73 [12:26<00:00, 10.22s/it]

Processing complete for today.





In [14]:
# Lift the row cap so the complete frequency table is printed.
pd.set_option('display.max_rows', None)
# Flatten the per-course skill lists to one skill per row, then count occurrences.
flattened_skills = df["Skills"].explode()
skills_counts = flattened_skills.value_counts()
print(skills_counts)

Skills
Analytical thinking and structured problem solving    292
Research-oriented mindset                             231
Python                                                225
Data analysis                                         209
Simulation software                                   163
Thermodynamics                                        125
MATLAB                                                118
Critical thinking and decision-making                 101
Statistical software                                   91
Clear technical and non-technical communication        82
Fluid mechanics                                        77
Analytics                                              71
Urban planning                                         69
R                                                      67
Project planning and prioritization                    62
Attention to detail and quality                        61
Structural analysis                                    60
Project

In [18]:
# Skill to look up in each course's Skills cell.
skill_to_find = "Machine learning"


def _has_skill(skills):
    """Return whether `skill_to_find` is contained in one Skills cell."""
    return skill_to_find in skills


skill_mask = df["Skills"].apply(_has_skill)
courses_with_skill = df[skill_mask]
columns_to_show = ["Title", "Description", "Skills"]
print(courses_with_skill[columns_to_show])

                                                  Title  \
71          Advanced Robot Learning and Decision-Making   
109                               AI in and for Society   
115                  AI4EO Platforms and Best Practices   
321                                 Autonomous Vehicles   
482                     Brain-inspired Computing for AI   
566                                          Coding Lab   
570                      Colloquium Recommender Systems   
588                      Computational Materials Design   
655                          Cyber-Physical Systems Lab   
669                 Data Mining und Knowledge Discovery   
675                 Data Science in der Agrarinformatik   
687                         Deep Reinforcement Learning   
700                                 Design Expedition 2   
931   Einführung in Machinelles Lernen in den Materi...   
932   Einführung in neuronale Netzwerke für Studiere...   
933               Einführung in Probabilistic Reasoning 

In [17]:
# Replace the working DataFrame with the contents of the final CSV and preview it.
path = "./final_courses.csv"
df = pd.read_csv(filepath_or_buffer=path)
df.head(5)

Unnamed: 0,Title,Semester,Description,Skills,URL
0,Dodo Alive! - Resurrecting the Dodo with Robot...,2025 W,CONTENT:\nImagine that you are at the natural ...,"['Robotics', 'AI', 'Python', 'C++', 'Firmware'...",https://campus.tum.de/tumonline/ee/ui/ca2/app/...
1,Biochemie reaktiver Sauerstoffspezies und frei...,2025 W,CONTENT:\nThe main focus of the course is on b...,"['Biochemistry', 'Antioxidants', 'Biochemistry...",https://campus.tum.de/tumonline/ee/ui/ca2/app/...
2,Instationäre hygrothermische Berechnungsverfahren,2025 W,No information available,"['Simulation software', 'Thermodynamics']",https://campus.tum.de/tumonline/ee/ui/ca2/app/...
3,A Different Kind of Game Jam! Reflecting Scien...,2025 W,CONTENT:\nThis course offers an interdisciplin...,"['Game Design', 'Game Development', 'Social Sc...",https://campus.tum.de/tumonline/ee/ui/ca2/app/...
4,"A Google-Earth Perspective on Nutrition, Healt...",2025 W,No information available,"['GIS software', 'Nutrition', 'Health', 'Susta...",https://campus.tum.de/tumonline/ee/ui/ca2/app/...
