In [2]:
import pandas as pd
import os
import re

In [3]:
# Read the raw Coursera course export into a DataFrame.
# The location is relative to the kernel's working directory.
raw_dir = "../../data/raw"
file = "coursera_course_dataset_v3.csv"
file_path = os.path.join(raw_dir, file)

df = pd.read_csv(file_path)


In [4]:
def clean_text(text):
    """
    Normalize free text into lowercase ASCII alphanumeric tokens.

    The input is lowercased, then HTML-like tags, URLs starting with
    "http", and every character outside a-z / 0-9 / whitespace are each
    replaced by a space. Runs of whitespace are finally collapsed to a
    single space and the ends are trimmed.

    Args:
        text: Value to clean. Anything that is not a ``str`` (None, NaN,
            numbers, ...) is treated as missing.

    Returns:
        The cleaned string, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Order matters: tags and URLs must be removed before punctuation is
    # stripped, otherwise "<", ">", ":" and "/" would already be gone.
    substitutions = (
        (r"<[^>]+>", " "),        # HTML-like tags
        (r"http\S+", " "),        # URLs
        (r"[^a-z0-9\s]", " "),    # punctuation / non-alphanumeric characters
        (r"\s+", " "),            # collapse whitespace runs
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [5]:
# Replace every missing value with an empty string so that later string
# concatenation does not propagate NaN.
df = df.fillna("")

In [6]:
# Add lowercase alias columns for the title and description.
# The source columns are copied, not renamed, so both names stay available.
for alias, source_column in (
    ("title", "Title"),
    ("description", "course_description"),
):
    df[alias] = df[source_column]


In [7]:
# Remove columns that are not part of the course metadata.
# errors="ignore" skips any listed column that is absent from the frame.
columns_to_drop = ["course_url", "Ratings", "Review Count", "course_students_enrolled"]
df.drop(columns=columns_to_drop, errors="ignore", inplace=True)

In [8]:
# Combine the descriptive fields into one cleaned text column for embedding.
# Each field is coerced to text first, so a missing value or a non-string
# column cannot blank out (NaN) or crash (TypeError) the concatenation.
embedding_columns = [
    "title",
    "Organization",
    "course_description",
    "Skills",
    "Difficulty",
    "Type",
    "Duration",
]


def _as_text(value):
    """Return ``value`` as a string; missing values (None/NaN) become ""."""
    if isinstance(value, str):
        return value
    return "" if pd.isna(value) else str(value)


# Fields are joined with single spaces, in the order listed above.
combined_text = df[embedding_columns[0]].map(_as_text)
for column in embedding_columns[1:]:
    combined_text = combined_text + " " + df[column].map(_as_text)

df["text_for_embedding"] = combined_text.apply(clean_text)

In [9]:
# Write the processed metadata to CSV, creating the target folder if needed.
# NOTE(review): this path is relative to the kernel's working directory,
# whereas the input is read from "../../data/raw" — confirm the output
# location is the intended one.
output_path = "data/interim/courses_metadata.csv"
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_path, index=False)

print(f"Cleaned metadata saved to: {output_path}")

Cleaned metadata saved to: data/interim/courses_metadata.csv
