In [2]:
import pandas as pd
import random
import os

# ========================
# 1. Save College Master List (60 top Karnataka colleges)
# ========================
colleges_info = [
    ("RVCE", "R V College of Engineering", "Bangalore"),
    ("BMSCE", "B M S College of Engineering", "Bangalore"),
    ("MSRIT", "M S Ramaiah Institute of Technology", "Bangalore"),
    ("DSCE", "Dayananda Sagar College of Engineering", "Bangalore"),
    ("PESU", "PES University", "Bangalore"),
    ("UVCE", "University Visvesvaraya College of Engineering", "Bangalore"),
    ("BIT", "Bangalore Institute of Technology", "Bangalore"),
    ("KLEIT", "K L E Institute of Technology", "Hubli"),
    ("MVJCE", "M V J College of Engineering", "Bangalore"),
    ("SIT", "Siddaganga Institute of Technology", "Tumkur"),
    ("JSSSTU", "JSS Science & Technology University", "Mysore"),
    ("NIE", "National Institute of Engineering", "Mysore"),
    ("NHCE", "New Horizon College of Engineering", "Bangalore"),
    ("NMIT", "Nitte Meenakshi Institute of Technology", "Bangalore"),
    ("BNMIT", "B N M Institute of Technology", "Bangalore"),
    ("BMSIT", "B M S Institute of Technology & Management", "Bangalore"),
    ("REVA", "REVA University", "Bangalore"),
    ("MSRUAS", "MS Ramaiah University of Applied Sciences", "Bangalore"),
    ("PESCE", "P E S College of Engineering", "Mandya"),
    ("PRES", "Presidency University", "Bangalore"),
    ("DSATM", "Dayananda Sagar Academy of Technology & Management", "Bangalore"),
    ("RVUNIV", "R V University", "Bangalore"),
    ("SDMCET", "Sri Dharmasthala Manjunatheshwara College of Engineering & Technology", "Dharwad"),
    ("VVCE", "Vidyavardhaka College of Engineering", "Mysore"),
    ("CMRIT", "CMR Institute of Technology", "Bangalore"),
    ("SJBIT", "S J B Institute of Technology", "Bangalore"),
    ("AIT", "Dr Ambedkar Institute of Technology", "Bangalore"),
    ("GIT", "Global Institute of Technology", "Ramohalli"),
    ("RNSIT", "R V Narsimha Institute of Technology", "Bangalore"),
    ("AEC", "Acharya Engineering College", "Bangalore"),
    ("NCET", "Nitte Composite Engineering College", "Mangalore"),
    ("SJBCE", "St. Joseph Engineering College", "Mangalore"),
    ("AECMandya", "Acharya Engineering College, Mandya", "Mandya"),
    ("CITGubbi", "CIT Gubbi", "Tumkur"),
    ("GMIT", "Govt. ML Khalsa Institute of Technology", "Hubli"),
    ("BMIT", "Ballari Institute of Technology & Management", "Ballari"),
    ("KSIT", "K S Institute of Technology", "Bangalore"),
    ("YKCE", "Yenepoya College of Engineering", "Moodbidri"),
    ("RIT", "Rajarajeswari Institute of Technology", "Bangalore"),
    ("SCEM", "St Joseph’s College of Engineering", "Chikmagalur"),
    ("JNNCE", "JNN College of Engineering", "Shimoga"),
    ("SITM", "Sahyadri Institute of Technology & Management", "Mangalore"),
    ("ACU", "Acharya College of Engineering", "Bangalore"),
    ("VVIT", "Visvesvaraya Vidyalaya College of Engineering", "Bangalore"),
    ("BITM", "Basava Institute of Technology & Management", "Bagalkot"),
    ("JIT", "Jain Institute of Technology", "Davangere"),
    ("DBIT", "Dudhsagar Business Institute of Technology", "Bangalore"),
    ("VemanaIT", "Vemana Institute of Technology", "Bangalore"),
    ("AITM", "Adichunchanagiri Institute of Technology", "Mandya"),
    ("VIT", "Vijayanagara Institute of Technology", "Mysore"),
    ("HKBK", "H K Bharadwaj College of Engineering", "Bangalore"),
    ("DrAIT", "Dr Ambedkar Institute of Technology", "Bangalore"),
    ("RAIT", "Rajarshi College of Engineering", "Bangalore"),
    ("EastPoint", "East Point College of Engineering & Technology", "Bangalore"),
    ("VSM", "Vidya Vikas Institute of Technology", "Hubli"),
    ("SDMIT", "SDM Institute of Technology", "Dharmasthala"),
    ("KLSGIT", "KLS Gogte Institute of Technology", "Belgaum")
]

df_colleges = pd.DataFrame(colleges_info, columns=["Code", "Name", "City"])
df_colleges.to_csv("college_list.csv", index=False)
print("✅ college_list.csv created with", len(df_colleges), "colleges")

# ========================
# 2. Generate synthetic cutoff_20XX.csv files (2020–2025)
# ========================

# Extract only codes
colleges = df_colleges["Code"].tolist()

# Full branch names used directly
branches = [
    "Computer Science and Engineering",
    "Information Science and Engineering",
    "Electronics and Communication Engineering",
    "Electrical and Electronics Engineering",
    "Mechanical Engineering",
    "Civil Engineering",
    "Artificial Intelligence and Data Science",
    "Biotechnology",
    "Data Science",
    "Robotics and Automation Engineering"
]

categories = ["GM", "OBC", "SC", "ST", "1G", "2A", "2B", "3A", "3B", "EWS", "GMK"]
rounds = ["Mock", "Round 1", "Round 2", "Extended"]
years = list(range(2020, 2026))

# Output dir
os.makedirs("cutoff_data", exist_ok=True)

# Rank generator
def get_random_rank(branch, category, year):
    base = {
        "Computer Science and Engineering": 2500,
        "Information Science and Engineering": 4500,
        "Electronics and Communication Engineering": 6500,
        "Electrical and Electronics Engineering": 9500,
        "Mechanical Engineering": 15000,
        "Civil Engineering": 18000,
        "Artificial Intelligence and Data Science": 7000,
        "Biotechnology": 20000,
        "Data Science": 7200,
        "Robotics and Automation Engineering": 8000
    }[branch]
    cat_offset = {
        "GM": 0, "OBC": 2500, "SC": 8000, "ST": 10000, "EWS": 1500, "GMK": 1700,
        "1G": 3000, "2A": 2000, "2B": 2500, "3A": 1500, "3B": 1800
    }[category]
    year_drift = (2025 - year) * 200
    return max(100, min(base + cat_offset + year_drift + random.randint(-800, 1200), 100000))

# Generate per year
for year in years:
    rows = []
    for clg in colleges:
        for branch in branches:
            for cat in categories:
                for rnd in rounds:
                    cutoff = get_random_rank(branch, cat, year)
                    rows.append([year, rnd, clg, branch, cat, cutoff])
    df = pd.DataFrame(rows, columns=["Year", "Round", "College", "Branch", "Category", "Cutoff_Rank"])
    df.to_csv(f"cutoff_data/cutoff_{year}.csv", index=False)
    print(f"✅ cutoff_{year}.csv with {len(df)} rows saved")

print("\n🎯 All KCET synthetic data (2020–2025) with full branch names generated!")


✅ college_list.csv created with 57 colleges
✅ cutoff_2020.csv with 25080 rows saved
✅ cutoff_2021.csv with 25080 rows saved
✅ cutoff_2022.csv with 25080 rows saved
✅ cutoff_2023.csv with 25080 rows saved
✅ cutoff_2024.csv with 25080 rows saved
✅ cutoff_2025.csv with 25080 rows saved

🎯 All KCET synthetic data (2020–2025) with full branch names generated!


In [3]:
from google.colab import files
files.download("college_list.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
files.download("cutoff_data/cutoff_2020.csv")
files.download("cutoff_data/cutoff_2021.csv")
files.download("cutoff_data/cutoff_2022.csv")
files.download("cutoff_data/cutoff_2023.csv")
files.download("cutoff_data/cutoff_2024.csv")
files.download("cutoff_data/cutoff_2025.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
import pandas as pd
import random
import os

# ==========================================
# CONFIG: Load existing 60 colleges
# ==========================================
df_colleges = pd.read_csv("college_list.csv")  # Should exist in your dir
colleges = df_colleges["Code"].tolist()

# ==========================================
# CONFIG: COMEDK-specific setup
# ==========================================
branches = [
    "Computer Science and Engineering",
    "Information Science and Engineering",
    "Electronics and Communication Engineering",
    "Electrical and Electronics Engineering",
    "Mechanical Engineering",
    "Civil Engineering",
    "Artificial Intelligence and Data Science",
    "Biotechnology",
    "Data Science",
    "Robotics and Automation Engineering"
]

categories = ["GM", "HKR", "Tulu", "Christian", "Muslim", "Others"]
rounds = ["Mock", "Round 1", "Round 2", "Extended"]
years = list(range(2020, 2026))  # 2020 to 2025

# ==========================================
# RANK SIMULATOR FOR COMEDK
# ==========================================
def get_random_comedk_rank(branch, category, year):
    base = {
        "Computer Science and Engineering": 3000,
        "Information Science and Engineering": 5000,
        "Electronics and Communication Engineering": 6500,
        "Electrical and Electronics Engineering": 8000,
        "Mechanical Engineering": 13000,
        "Civil Engineering": 15000,
        "Artificial Intelligence and Data Science": 6000,
        "Biotechnology": 14000,
        "Data Science": 6200,
        "Robotics and Automation Engineering": 8500
    }[branch]

    cat_offset = {
        "GM": 0,
        "HKR": 1200,
        "Tulu": 2000,
        "Christian": 2500,
        "Muslim": 2700,
        "Others": 3000
    }[category]

    year_drift = (2025 - year) * 100
    return max(100, min(base + cat_offset + year_drift + random.randint(-800, 1000), 100000))

# ==========================================
# GENERATE AND SAVE COMEDK DATA
# ==========================================
os.makedirs("data/cutoffs", exist_ok=True)

for year in years:
    records = []
    for clg in colleges:
        for branch in branches:
            for cat in categories:
                for rnd in rounds:
                    rank = get_random_comedk_rank(branch, cat, year)
                    records.append([year, rnd, clg, branch, cat, rank])
    df = pd.DataFrame(records, columns=["Year", "Round", "College", "Branch", "Category", "Cutoff_Rank"])
    df.to_csv(f"data/cutoffs/comedk_{year}.csv", index=False)
    print(f"✅ comedk_{year}.csv saved with {len(df)} rows")

print("\n🎯 All COMEDK cutoff files (2020–2025) generated — total ≈ 86,400 rows")


✅ comedk_2020.csv saved with 13680 rows
✅ comedk_2021.csv saved with 13680 rows
✅ comedk_2022.csv saved with 13680 rows
✅ comedk_2023.csv saved with 13680 rows
✅ comedk_2024.csv saved with 13680 rows
✅ comedk_2025.csv saved with 13680 rows

🎯 All COMEDK cutoff files (2020–2025) generated — total ≈ 86,400 rows


In [8]:
files.download("data/cutoffs/comedk_2020.csv")
files.download("data/cutoffs/comedk_2021.csv")
files.download("data/cutoffs/comedk_2022.csv")
files.download("data/cutoffs/comedk_2023.csv")
files.download("data/cutoffs/comedk_2024.csv")
files.download("data/cutoffs/comedk_2025.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
import pandas as pd
import random
import os

# Load colleges
df_colleges = pd.read_csv("college_list.csv")
colleges = df_colleges["Code"].tolist()

# Config
branches = [
    "Computer Science and Engineering",
    "Information Science and Engineering",
    "Electronics and Communication Engineering",
    "Electrical and Electronics Engineering",
    "Mechanical Engineering",
    "Civil Engineering",
    "Artificial Intelligence and Data Science",
    "Biotechnology",
    "Data Science",
    "Robotics and Automation Engineering",
    "Aerospace Engineering",
    "Chemical Engineering"
]

categories = ["GM", "OBC", "SC", "ST", "Tulu", "Christian"]
exams = ["KCET", "COMEDK"]
years = [2023, 2024]

# Generate synthetic seat matrix
records = []
for clg in colleges:
    for branch in branches:
        for cat in categories:
            for exam in exams:
                for year in years:
                    seats = random.randint(6, 60)
                    records.append([clg, branch, cat, exam, year, seats])

df_seats = pd.DataFrame(records, columns=["College", "Branch", "Category", "Exam", "Year", "Total_Seats"])

# Shuffle and trim to 14,400 rows
df_seats = df_seats.sample(n=14400, random_state=42).reset_index(drop=True)

# Save
os.makedirs("data", exist_ok=True)
df_seats.to_csv("data/seat_matrix.csv", index=False)

print("✅ seat_matrix.csv generated with 14,400 rows.")


✅ seat_matrix.csv generated with 14,400 rows.


In [10]:
files.download("data/seat_matrix.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
import pandas as pd
import random
import os

# Load seat matrix
df_seats = pd.read_csv("data/seat_matrix.csv")

# Helper to assign fee slabs based on exam
def generate_fees(exam):
    if exam == "KCET":
        tuition = random.randint(50000, 85000)
    else:  # COMEDK
        tuition = random.randint(160000, 250000)

    hostel = random.randint(60000, 95000)
    misc = random.randint(8000, 20000)
    one_time = random.randint(10000, 30000)
    total_first_year = tuition + hostel + misc + one_time
    total_annual = tuition + hostel + misc
    scholarship = random.choice(["Yes", "No"])

    return pd.Series([tuition, hostel, misc, one_time, total_first_year, total_annual, scholarship],
                     index=["Tuition_Fee", "Hostel_Fee", "Misc_Fee", "OneTime_Fee",
                            "Total_First_Year", "Total_Annual", "Scholarship_Eligible"])

# Apply fees to each row
df_fees = df_seats[["College", "Branch", "Exam", "Year"]].copy()
df_fees[["Tuition_Fee", "Hostel_Fee", "Misc_Fee", "OneTime_Fee",
         "Total_First_Year", "Total_Annual", "Scholarship_Eligible"]] = df_fees["Exam"].apply(generate_fees)

# Save to CSV
os.makedirs("data", exist_ok=True)
df_fees.to_csv("data/fees.csv", index=False)
print(f"✅ fees.csv generated with {len(df_fees)} rows.")


✅ fees.csv generated with 14400 rows.


In [12]:
files.download("data/fees.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
import pandas as pd
import random
import os

# Load college list
df_colleges = pd.read_csv("college_list.csv")
colleges = df_colleges["Code"].tolist()

branches = [
    "Computer Science and Engineering",
    "Information Science and Engineering",
    "Electronics and Communication Engineering",
    "Electrical and Electronics Engineering",
    "Mechanical Engineering",
    "Civil Engineering",
    "Artificial Intelligence and Data Science",
    "Biotechnology",
    "Data Science",
    "Robotics and Automation Engineering",
    "Aerospace Engineering",
    "Chemical Engineering"
]

years = [2023, 2024]

company_pool = {
    "Tech": ["Google", "Microsoft", "Adobe", "Cisco", "Infosys", "Wipro", "TCS", "SAP Labs", "Amazon", "Flipkart"],
    "Core": ["Bosch", "Tata Motors", "L&T", "Siemens", "ABB", "GE", "HAL", "BEL"],
    "Bio": ["Biocon", "Serum Institute", "Cipla", "Novartis", "Pfizer"],
    "Aero": ["DRDO", "ISRO", "Airbus", "Boeing", "Safran"]
}

def get_companies(branch):
    if "Computer" in branch or "Data" in branch or "AI" in branch:
        return random.sample(company_pool["Tech"], k=3)
    elif "Mech" in branch or "Civil" in branch or "Elec" in branch:
        return random.sample(company_pool["Core"], k=3)
    elif "Bio" in branch or "Chemical" in branch:
        return random.sample(company_pool["Bio"], k=3)
    elif "Aero" in branch or "Robotics" in branch:
        return random.sample(company_pool["Aero"], k=3)
    else:
        return random.sample(company_pool["Tech"] + company_pool["Core"], k=3)

# Generate records
records = []
for clg in colleges:
    for branch in branches:
        for year in years:
            avg_package = round(random.uniform(3.0, 10.0), 1)
            max_package = round(avg_package + random.uniform(5, 45), 1)
            nirf_rank = random.randint(50, 300)
            top_companies = ", ".join(get_companies(branch))
            records.append([clg, branch, year, avg_package, max_package, nirf_rank, top_companies])

# Save DataFrame
df_place = pd.DataFrame(records, columns=[
    "College", "Branch", "Year", "Avg_Package_LPA",
    "Max_Package_LPA", "NIRF_Rank", "Top_Companies"
])

os.makedirs("data", exist_ok=True)
df_place.to_csv("data/placements.csv", index=False)

print(f"✅ placements.csv generated with {len(df_place)} rows.")


✅ placements.csv generated with 1368 rows.


In [14]:
files.download("data/placements.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>