In [41]:
import os
import pandas as pd
import re
from IPython.display import display

# ---------------------------
# Utility: Check if string looks like a link
# ---------------------------


def is_link(value: str) -> bool:
    """Return True when *value* starts with an ``http://`` or ``https://`` scheme.

    Anything ``pd.isna`` reports as missing (``None``, ``NaN``) is never a
    link. Every other value is converted with ``str`` and stripped of
    surrounding whitespace before the scheme is checked. The check is
    case-sensitive and only looks at the prefix, not the rest of the URL.
    """
    if not pd.isna(value):
        candidate = str(value).strip()
        return re.match(r'^https?://', candidate) is not None
    return False


# ---------------------------
# Validate output folders (basic checks: file count, empty)
# ---------------------------
def validate_output(output_base: str):
    """Check that every sub-folder of *output_base* holds exactly one non-empty CSV.

    Each immediate sub-directory of *output_base* produces one report row.
    A folder is "GOOD" only when it contains exactly one file whose name ends
    in ``.csv`` (case-insensitive) and that file is not zero bytes long;
    otherwise it is "BAD" and the ``Notes`` column lists every reason,
    separated by " | ". Non-directory entries in *output_base* are ignored.

    The report is printed/displayed and also returned.

    Args:
        output_base: Directory whose immediate sub-directories are inspected.

    Returns:
        A DataFrame with the columns ``Folder``, ``CSV_Files``, ``CSV_Count``,
        ``Status`` and ``Notes``. The columns are present even when no
        sub-folder was found, so callers can index them unconditionally.

    Raises:
        OSError: Propagated from ``os.listdir`` / ``os.path.getsize``, e.g.
            when *output_base* does not exist.
    """
    # Fixed schema: without it, an empty report would be a DataFrame with no
    # columns at all and any column lookup by the caller would fail.
    report_columns = ["Folder", "CSV_Files", "CSV_Count", "Status", "Notes"]
    report_data = []

    for folder_name in os.listdir(output_base):
        folder_path = os.path.join(output_base, folder_name)
        if not os.path.isdir(folder_path):
            continue

        # Collect CSVs in this folder
        csv_files = [
            f for f in os.listdir(folder_path) if f.lower().endswith(".csv")
        ]
        csv_count = len(csv_files)

        # Exactly one CSV is the only acceptable layout.
        status = "GOOD" if csv_count == 1 else "BAD"
        notes = []
        if csv_count == 0:
            notes.append("No CSV found")
        elif csv_count > 1:
            notes.append(f"Multiple CSVs found ({csv_count})")

        # A zero-byte CSV makes the folder BAD even when the count is right.
        for csv_file in csv_files:
            if os.path.getsize(os.path.join(folder_path, csv_file)) == 0:
                status = "BAD"
                notes.append(f"Empty file: {csv_file}")

        report_data.append({
            "Folder": folder_name,
            "CSV_Files": ", ".join(csv_files) if csv_files else "None",
            "CSV_Count": csv_count,
            "Status": status,
            "Notes": " | ".join(notes),
        })

    df_report = pd.DataFrame(report_data, columns=report_columns)
    print("📑 Folder Validation Report")
    display(df_report)
    return df_report


# ---------------------------
# Validate CSV contents (links + empty check in first column)
# ---------------------------
def validate_csv_contents(output_base: str):
    """Inspect the first column of every CSV found one level below *output_base*.

    For each immediate sub-directory, every file whose name ends in ``.csv``
    (case-insensitive) is loaded with pandas, first as UTF-8 and, if that
    fails to decode, as Latin-1. For each file that loads:

    * its header list is recorded;
    * missing values in the first column are counted and reported;
    * non-missing first-column values that ``is_link`` rejects are counted
      and reported.

    A file that cannot be read at all is reported as an issue instead of
    aborting the run. Both result tables are printed/displayed and returned.

    Args:
        output_base: Directory whose immediate sub-directories are scanned.

    Returns:
        A tuple ``(df_issues, df_headers)``. ``df_issues`` has the columns
        ``Folder``, ``CSV_File`` and ``Issue``; ``df_headers`` has ``Folder``,
        ``CSV_File`` and ``Headers``. Both keep their columns when they have
        no rows, so a clean run still returns a well-formed ``df_issues``.

    Raises:
        OSError: Propagated from ``os.listdir``, e.g. when *output_base*
            does not exist.
    """
    # Fixed schemas: "no issues" is the normal outcome, and a DataFrame built
    # from an empty list would otherwise have no columns at all.
    issue_columns = ["Folder", "CSV_File", "Issue"]
    header_columns = ["Folder", "CSV_File", "Headers"]

    content_issues = []
    headers_collection = []

    for folder_name in os.listdir(output_base):
        folder_path = os.path.join(output_base, folder_name)
        if not os.path.isdir(folder_path):
            continue

        csv_files = [
            f for f in os.listdir(folder_path) if f.lower().endswith(".csv")
        ]
        for csv_file in csv_files:
            csv_path = os.path.join(folder_path, csv_file)

            try:
                try:
                    df = pd.read_csv(csv_path, encoding="utf-8")
                except UnicodeDecodeError:
                    # Latin-1 maps every byte to a character, so this retry
                    # cannot fail on decoding.
                    df = pd.read_csv(csv_path, encoding="latin1")

                # Collect headers
                headers_collection.append({
                    "Folder": folder_name,
                    "CSV_File": csv_file,
                    "Headers": list(df.columns)
                })

                if df.shape[1] > 0:
                    first_col = df.iloc[:, 0]

                    # Check for empty values (the header row is not part of
                    # the data, so it is never counted)
                    empty_count = first_col.isna().sum()
                    if empty_count > 0:
                        content_issues.append({
                            "Folder": folder_name,
                            "CSV_File": csv_file,
                            "Issue": f"{empty_count} empty values in first column"
                        })

                    # Check for invalid (non-link) values; missing values were
                    # already reported above, so they are skipped here.
                    invalid_count = sum(
                        1 for v in first_col.dropna() if not is_link(v)
                    )
                    if invalid_count:
                        content_issues.append({
                            "Folder": folder_name,
                            "CSV_File": csv_file,
                            "Issue": f"{invalid_count} non-link values in first column"
                        })

            except Exception as e:
                # Deliberately broad: one unreadable file (empty, malformed,
                # permission denied, ...) is recorded and the scan continues.
                content_issues.append({
                    "Folder": folder_name,
                    "CSV_File": csv_file,
                    "Issue": f"Failed to read CSV - {str(e)}"
                })

    # Build DataFrames
    df_issues = pd.DataFrame(content_issues, columns=issue_columns)
    df_headers = pd.DataFrame(headers_collection, columns=header_columns)

    print("⚠️ Content Validation Issues")
    display(df_issues if not df_issues.empty else pd.DataFrame(
        [{"Status": "No issues found ✅"}]))

    print("\n📊 CSV Headers Overview")
    display(df_headers)

    return df_issues, df_headers


# ---------------------------
# Run full validation
# ---------------------------
# Root folder to validate; each of its immediate sub-folders is checked.
# The path is relative, so it is resolved against the current working
# directory.
OUTPUT_DIR = "Downloaded_Universities"

# Step 1: Validate folder/file structure (one non-empty CSV per sub-folder).
# The report is displayed by the call and kept in df_report.
df_report = validate_output(OUTPUT_DIR)

# Step 2: Validate CSV contents (first column must be non-empty links).
# df_issues lists the problems found; df_headers lists each CSV's headers.
df_issues, df_headers = validate_csv_contents(OUTPUT_DIR)

📑 Folder Validation Report


Unnamed: 0,Folder,CSV_Files,CSV_Count,Status,Notes
0,1-aZqxTN1RpLwmfwxeGCMvmDiZo5mpKsS,Okanagan College Courses.csv,1,GOOD,
1,1-pYDlcnnI_ZsHLcBnRMNjrCZZIW3HAk4,University of Rhode Island Courses.csv,1,GOOD,
2,1-xfPxe6bhebWSWOJX22RVyPqNqveQZDq,Craleton University Courses.csv,1,GOOD,
3,1-Y5aXdseCkhY_AiqhIFmQcSl4xGbWOGV,University of Bologna Courses.csv,1,GOOD,
4,10IZEzyvflhrSY3BU-6cGpvz7QWdMVR9O,Free University of Berlin Courses.csv,1,GOOD,
...,...,...,...,...,...
347,University of Galway,University of Galway.csv,1,GOOD,
348,University of Lethbridge,University of Lethbridge.csv,1,GOOD,
349,University of Limerick,University of Limerick.csv,1,GOOD,
350,University of New Burnswick,University of New Burnswick.csv,1,GOOD,


⚠️ Content Validation Issues


Unnamed: 0,Status
0,No issues found ✅



📊 CSV Headers Overview


Unnamed: 0,Folder,CSV_File,Headers
0,1-aZqxTN1RpLwmfwxeGCMvmDiZo5mpKsS,Okanagan College Courses.csv,"[Course Link, Course Title, Required IELTS Sco..."
1,1-pYDlcnnI_ZsHLcBnRMNjrCZZIW3HAk4,University of Rhode Island Courses.csv,"[Course Link, Course Title, Required IELTS Sco..."
2,1-xfPxe6bhebWSWOJX22RVyPqNqveQZDq,Craleton University Courses.csv,"[Course Link, Course Title, Required IELTS Sco..."
3,1-Y5aXdseCkhY_AiqhIFmQcSl4xGbWOGV,University of Bologna Courses.csv,"[Course Link, Course Title, Required IELTS Sco..."
4,10IZEzyvflhrSY3BU-6cGpvz7QWdMVR9O,Free University of Berlin Courses.csv,"[Course Link, Course Title, Required IELTS Sco..."
...,...,...,...
347,University of Galway,University of Galway.csv,"[Course Link, Course Title, Required IELTS Sco..."
348,University of Lethbridge,University of Lethbridge.csv,"[Course Link, Course Title, Required IELTS Sco..."
349,University of Limerick,University of Limerick.csv,"[Course Link, Course Title, Required IELTS Sco..."
350,University of New Burnswick,University of New Burnswick.csv,"[Course Link, Course Title, Required IELTS Sco..."
