# Create Synthetic Data for Student-College Choices
This dataset will represent the basis for the college recommmendations. In practice, we would gather  
the student-college information for those students who has 'successful' college outcomes at  
selective colleges.


In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors


### Get a list of 'approved' college IDs from a CSV
This CSV comes from the IPEDS information and only those schools that have a Type 1 'selectivity' have  
been chosen for this file. From there, the randomly-assigned top-5 schools from each state are in  
this file.

In [2]:
raw_data_path = '../data/working_data/collegerecs_synthetic/selected_colleges.csv'

selected_colleges = pd.read_csv(raw_data_path)

selected_colleges.head()


Unnamed: 0,school_id,school_name,avg_total_sat,avg_total_act
0,100751,The University of Alabama,1105,27.0
1,100858,Auburn University,1160,27.0
2,100937,Birmingham-Southern College,1085,26.0
3,102049,Samford University,1129,26.0
4,102234,Spring Hill College,1095,25.0


In [14]:
faker = Faker()

# Parameters
num_students = 20000
majors = ["Computer Science", "Biology", "Engineering", "Business", "Psychology", "Nursing", "Art", "Economics"]

# Generate list of synthetic colleges
college_names = selected_colleges['school_name'].unique().tolist()

# Create synthetic student records
students = []
for student_id in range(1, num_students + 1):
    college = random.choice(college_names)
    sat = selected_colleges.loc[selected_colleges['school_name'] == college, 'avg_total_sat'].iloc[0]
    gpa = np.round(np.random.normal(3.4, 0.4), 2)
    city = faker.city()
    state = faker.state_abbr()
    major = random.choice(majors)

    students.append({
        "student_id": student_id,
        "SAT": sat,
        "GPA": max(min(gpa, 4.0), 0.0),  # Clamp to [0, 4.0]
        "city": city,
        "state": state,
        "major": major,
        "college_name": college,
        # Add new importance rating fields (1-5 scale)
        "importance_close_to_home": random.randint(1, 5),
        "importance_school_reputation": random.randint(1, 5),
        "importance_school_cost": random.randint(1, 5)
    })

df_students = pd.DataFrame(students)
df_students.head()

Unnamed: 0,student_id,SAT,GPA,city,state,major,college_name,importance_close_to_home,importance_school_reputation,importance_school_cost
0,1,1230,2.7,Traciehaven,NJ,Psychology,New Mexico Institute of Mining and Technology,3,1,4
1,2,1220,3.88,Nicoleshire,WA,Biology,University of Maryland-Baltimore County,3,2,2
2,3,1315,3.51,Gibsonmouth,SD,Art,United States Air Force Academy,4,2,4
3,4,1371,3.24,New Lindafurt,KS,Biology,Colby College,5,4,1
4,5,1135,3.34,North Zachary,MH,Computer Science,University of Nebraska-Lincoln,4,3,3


## Preprocess features to prep for collaborative filtering

In [15]:
# Encode all categorical features
df_encoded = pd.get_dummies(df_students, columns=[
    "major",
    "importance_close_to_home",
    "importance_school_reputation",
    "importance_school_cost"
])

# Define feature columns to include all encoded columns
feature_cols = ["SAT", "GPA"] + \
               [col for col in df_encoded.columns if col.startswith("major_")] + \
               [col for col in df_encoded.columns if col.startswith("importance_")]

# Standardize numeric features
scaler = StandardScaler()
X = scaler.fit_transform(df_encoded[feature_cols])

## Find similar students

In [16]:
def find_similar_students(target_student, df_encoded, scaler, feature_cols, n_neighbors=25):
    """Find similar students based on academic and preference features.

    Args:
        target_student (dict): Dictionary containing student information with fields matching df_encoded
        df_encoded (pandas.DataFrame): DataFrame containing encoded student features
        scaler (sklearn.preprocessing.StandardScaler): Fitted scaler for numeric features
        feature_cols (list): List of feature column names to use for similarity matching
        n_neighbors (int, optional): Number of similar students to return. Defaults to 25.

    Returns:
        pandas.DataFrame: DataFrame containing information about the n_neighbors most similar students
    """
    # 1. Convert the target student to a DataFrame
    student_df = pd.DataFrame([target_student])
    student_df = pd.get_dummies(student_df)

    # 2. Add missing columns that exist in the training data
    for col in feature_cols:
        if col not in student_df:
            student_df[col] = 0

    # 3. Ensure the column order matches training
    student_df = student_df[feature_cols]

    # 4. Transform with the same scaler
    student_X = scaler.transform(student_df)

    # 5. Run KNN
    knn = NearestNeighbors(n_neighbors=n_neighbors)
    knn.fit(X)  # X is your already scaled training data
    distances, indices = knn.kneighbors(student_X)

    similar_students = df_students.iloc[indices[0]]
    return similar_students

def get_college_stats(similar_students):
    """Calculate aggregate statistics for colleges attended by similar students.

    Args:
        similar_students (pandas.DataFrame): DataFrame containing information about similar students,
            including their SAT scores and college choices

    Returns:
        pandas.DataFrame: DataFrame with college-level statistics including:
            - college_name: Name of the college
            - college_avg_SAT: Mean SAT score of similar students at that college
            - num_similar_students: Count of similar students attending that college
            - avg_importance_close_to_home: Mean importance of location for students at that college
            - avg_importance_school_reputation: Mean importance of reputation for students at that college
            - avg_importance_school_cost: Mean importance of cost for students at that college
    """
    return similar_students.groupby("college_name").agg({
        "SAT": "mean",
        "student_id": "count",
        "importance_close_to_home": "mean",
        "importance_school_reputation": "mean", 
        "importance_school_cost": "mean"
    }).rename(columns={
        "SAT": "college_avg_SAT",
        "student_id": "num_similar_students",
        "importance_close_to_home": "avg_importance_close_to_home",
        "importance_school_reputation": "avg_importance_school_reputation",
        "importance_school_cost": "avg_importance_school_cost"
    }).reset_index()
    
def classify_colleges(college_df, target_sat, top_n=3):
    """Classify colleges into Foundation, Thrive, and Aspire categories based on SAT scores.

    Args:
        college_df (pandas.DataFrame): DataFrame containing college statistics including:
            - college_name: Name of the college
            - college_avg_SAT: Mean SAT score of similar students at that college
            - num_similar_students: Count of similar students attending that college
            - avg_importance_close_to_home: Mean importance of location for students at that college
            - avg_importance_school_reputation: Mean importance of reputation for students at that college
            - avg_importance_school_cost: Mean importance of cost for students at that college
        target_sat (int): The target student's SAT score
        top_n (int, optional): Number of colleges to return per category. Defaults to 3.

    Returns:
        dict: Dictionary containing DataFrames for each category (Foundation, Thrive, Aspire)
            with the top_n colleges sorted by number of similar students. Each DataFrame includes:
            - college_name: Name of the college
            - college_avg_SAT: Mean SAT score of similar students
            - num_similar_students: Number of similar students at the college
            - avg_importance_close_to_home: Mean importance of location
            - avg_importance_school_reputation: Mean importance of reputation
            - avg_importance_school_cost: Mean importance of cost
            - category: Classification category (Foundation/Thrive/Aspire)
    """
    def classify(row):
        delta = row["college_avg_SAT"] - target_sat
        if -40 <= delta <= -20:
            return "Foundation"
        elif -10 <= delta <= 10:
            return "Thrive"
        elif 20 <= delta <= 40:
            return "Aspire"
        else:
            return "Neutral"

    college_df["category"] = college_df.apply(classify, axis=1)

    results = {}
    for category in ["Foundation", "Thrive", "Aspire"]:
        subset = college_df[college_df["category"] == category] \
                    .sort_values(by="num_similar_students", ascending=False) \
                    .head(top_n)
        results[category] = subset[["college_name", "college_avg_SAT", "num_similar_students", 
                                  "avg_importance_close_to_home", "avg_importance_school_reputation",
                                  "avg_importance_school_cost", "category"]]

    return results


### Test Functions with Mock Target Student

In [17]:
target_student = {
    "SAT": 1400,
    "GPA": 3.6,
    "major": majors[3],
    "city": "Seattle",
    "state": "WA",
    "importance_close_to_home": 5,
    "importance_school_reputation": 2,
    "importance_school_cost": 4
}

print(target_student)

similar_students = find_similar_students(target_student, df_encoded, scaler, feature_cols)
college_df = get_college_stats(similar_students)
recommendations = classify_colleges(college_df, target_student["SAT"], top_n=3)

# Show output
for cat, df in recommendations.items():
    print(f"\n📘 {cat} Colleges:")
    print(df.to_string(index=False))


{'SAT': 1400, 'GPA': 3.6, 'major': 'Business', 'city': 'Seattle', 'state': 'WA', 'importance_close_to_home': 5, 'importance_school_reputation': 2, 'importance_school_cost': 4}

📘 Foundation Colleges:
      college_name  college_avg_SAT  num_similar_students  avg_importance_close_to_home  avg_importance_school_reputation  avg_importance_school_cost   category
   Barnard College           1370.0                     3                           2.0                          3.666667                    1.666667 Foundation
      Reed College           1380.0                     3                           2.0                          2.333333                    4.666667 Foundation
Colgate University           1375.0                     2                           4.0                          4.000000                    3.000000 Foundation

📘 Thrive Colleges:
                     college_name  college_avg_SAT  num_similar_students  avg_importance_close_to_home  avg_importance_school_reputation