# Create Synthetic Data for Student-College Choices
This dataset will represent the basis for the college recommendations. In practice, we would gather  
the student-college information for those students who have had 'successful' college outcomes at  
selective colleges.


In [10]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors


### Get a list of 'approved' college IDs from a CSV
This CSV is derived from the IPEDS information; only those schools that have a Type 1 'selectivity' have  
been kept. From those, the randomly-assigned top-5 schools from each state are included in  
this file.

In [42]:
# Relative path to the CSV of pre-selected colleges
raw_data_path = '../data/working_data/collegerecs_synthetic/selected_colleges.csv'

# Load the college table and preview its first five rows
selected_colleges = pd.read_csv(filepath_or_buffer=raw_data_path)
selected_colleges.head(n=5)


Unnamed: 0,school_id,school_name,avg_total_sat,avg_total_act
0,100751,The University of Alabama,1105,27.0
1,100858,Auburn University,1160,27.0
2,100937,Birmingham-Southern College,1085,26.0
3,102049,Samford University,1129,26.0
4,102234,Spring Hill College,1095,25.0


In [47]:
# Seed every random source used below so that "Restart & Run All" regenerates
# the same synthetic students: `random` picks the college and major, numpy
# draws the GPA, and Faker produces the city and state.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)
faker = Faker()

# Parameters
num_students = 10000
majors = ["Computer Science", "Biology", "Engineering", "Business", "Psychology", "Nursing", "Art", "Economics"]

# Colleges a synthetic student can be assigned to (names taken from selected_colleges)
college_names = selected_colleges['school_name'].unique().tolist()

# school_name -> avg_total_sat, built once instead of filtering the whole frame
# for every student. drop_duplicates keeps the first row per name, which is the
# same row the per-student `.iloc[0]` lookup used to pick.
sat_by_college = (
    selected_colleges
    .drop_duplicates(subset='school_name')
    .set_index('school_name')['avg_total_sat']
    .to_dict()
)

# Create synthetic student records
students = []
for student_id in range(1, num_students + 1):
    college = random.choice(college_names)
    # Each student's SAT is set to the average SAT of the college they are assigned to
    sat = sat_by_college[college]
    gpa = np.round(np.random.normal(3.4, 0.4), 2)
    city = faker.city()
    state = faker.state_abbr()
    major = random.choice(majors)

    students.append({
        "student_id": student_id,
        "SAT": sat,
        "GPA": min(max(gpa, 0.0), 4.0),  # Clamp to [0, 4.0]
        "city": city,
        "state": state,
        "major": major,
        "college_name": college
    })

# Build the frame once from the list of records, then preview it
df_students = pd.DataFrame(students)
df_students.head()

Unnamed: 0,student_id,SAT,GPA,city,state,major,college_name
0,1,1383,2.84,Sparkschester,PA,Engineering,University of Illinois Urbana-Champaign
1,2,1430,3.08,West Davidmouth,VI,Computer Science,Carleton College
2,3,1164,2.87,Watkinsland,WY,Computer Science,Butler University
3,4,1045,3.07,Colemanstad,NE,Engineering,Albion College
4,5,1375,3.41,South Sandra,VT,Psychology,Macalester College


## Preprocess features to prep for collaborative filtering

In [48]:
# One-hot encode the categorical `major` column first: the feature list below is
# derived from the encoded frame, so df_encoded must exist before it is read.
df_encoded = pd.get_dummies(df_students, columns=["major"])

# Model inputs: the two numeric scores plus every major_* indicator column
feature_cols = ["SAT", "GPA"] + [col for col in df_encoded.columns if col.startswith("major_")]

# Standardize all feature columns (the major_* indicators are scaled as well).
# The fitted scaler is kept so new students can be transformed the same way.
scaler = StandardScaler()
X = scaler.fit_transform(df_encoded[feature_cols])


## Find similar students

In [49]:
def find_similar_students(target_student, df_encoded, scaler, feature_cols, n_neighbors=25):
    """Return the students in ``df_encoded`` that are closest to ``target_student``.

    The function works only from its arguments; it does not read the notebook
    globals ``X`` or ``df_students``.

    Parameters
    ----------
    target_student : dict
        Raw attributes of the student to match (the example in this notebook
        passes ``SAT``, ``GPA`` and ``major``). String values are one-hot
        encoded with ``pd.get_dummies``.
    df_encoded : pandas.DataFrame
        Reference students; must contain every column in ``feature_cols``.
    scaler : fitted transformer
        Object exposing ``transform``; applied to both the reference students
        and the target so they share the same feature space.
    feature_cols : list of str
        Ordered feature columns used for the distance computation.
    n_neighbors : int, default 25
        Number of neighbours requested; capped at the number of reference rows.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df_encoded``, nearest first. The rows therefore
        carry the encoded ``major_*`` indicator columns.
    """
    # Nothing to search: return an empty frame with the same columns.
    if len(df_encoded) == 0:
        return df_encoded.iloc[0:0]

    # 1. One-hot encode the target student the same way as the reference data
    student_df = pd.get_dummies(pd.DataFrame([target_student]))

    # 2. Add any feature column the target lacks (filled with 0) and force the
    #    column order used when the scaler was fitted
    student_df = student_df.reindex(columns=feature_cols, fill_value=0)

    # 3. Put the target and the reference students through the same scaler
    student_X = scaler.transform(student_df)
    reference_X = scaler.transform(df_encoded[feature_cols])

    # 4. Run KNN; asking for more neighbours than there are rows would raise
    k = min(n_neighbors, len(df_encoded))
    knn = NearestNeighbors(n_neighbors=k)
    knn.fit(reference_X)
    _, indices = knn.kneighbors(student_X)

    # indices are row positions in reference_X, i.e. positions in df_encoded
    return df_encoded.iloc[indices[0]]

def get_college_stats(similar_students):
    """Summarise a set of similar students per college.

    Groups ``similar_students`` by ``college_name`` and returns one row per
    college with the mean ``SAT`` (``college_avg_SAT``) and the number of
    ``student_id`` entries (``num_similar_students``).
    """
    per_college = similar_students.groupby("college_name")
    summary = per_college.agg(
        college_avg_SAT=("SAT", "mean"),
        num_similar_students=("student_id", "count"),
    )
    # Turn the college_name index back into a regular column
    return summary.reset_index()
    
def classify_colleges(college_df, target_sat, top_n=3, thrive_band=10, stretch_gap=20):
    """Bucket colleges relative to a target SAT and keep the top ones per bucket.

    With ``delta = college_avg_SAT - target_sat`` each college is labelled:

    * ``"Foundation"`` when ``delta <= -stretch_gap``
    * ``"Thrive"``     when ``-thrive_band <= delta <= thrive_band``
    * ``"Aspire"``     when ``delta >= stretch_gap``
    * ``"Neutral"``    otherwise (the gaps between the bands); these colleges
      are not returned.

    Parameters
    ----------
    college_df : pandas.DataFrame
        Needs ``college_name``, ``college_avg_SAT`` and ``num_similar_students``
        columns. It is not modified; the labels are added to a copy.
    target_sat : number
        SAT score of the student being advised.
    top_n : int, default 3
        Maximum number of colleges kept per category.
    thrive_band : number, default 10
        Half-width of the "Thrive" band around ``target_sat``.
    stretch_gap : number, default 20
        Minimum distance from ``target_sat`` for "Foundation" / "Aspire".

    Returns
    -------
    dict of str -> pandas.DataFrame
        Keys ``"Foundation"``, ``"Thrive"``, ``"Aspire"``; each frame is sorted
        by ``num_similar_students`` descending and limited to ``top_n`` rows.
    """
    # Work on a copy so the caller's frame does not gain a "category" column
    classified = college_df.copy()
    delta = classified["college_avg_SAT"] - target_sat

    # np.select takes the first matching condition, mirroring an if/elif chain,
    # and also works on an empty frame where a row-wise apply does not.
    classified["category"] = np.select(
        [
            delta <= -stretch_gap,
            delta.between(-thrive_band, thrive_band),
            delta >= stretch_gap,
        ],
        ["Foundation", "Thrive", "Aspire"],
        default="Neutral",
    )

    results = {}
    for category in ["Foundation", "Thrive", "Aspire"]:
        subset = classified[classified["category"] == category] \
                    .sort_values(by="num_similar_students", ascending=False) \
                    .head(top_n)
        results[category] = subset[["college_name", "college_avg_SAT", "num_similar_students", "category"]]

    return results


In [54]:
# Profile of the student we want recommendations for
target_student = dict(SAT=1400, GPA=3.6, major="Nursing")

# Pipeline: nearest students -> per-college summary -> bucketed recommendations
similar_students = find_similar_students(target_student, df_encoded, scaler, feature_cols)
college_df = get_college_stats(similar_students)
recommendations = classify_colleges(college_df, target_student["SAT"], top_n=3)

# Show output
for cat, df in recommendations.items():
    print(f"\n📘 {cat} Colleges:", df.to_string(index=False), sep="\n")



📘 Foundation Colleges:
  college_name  college_avg_SAT  num_similar_students   category
Boston College           1360.0                     2 Foundation
  Reed College           1380.0                     2 Foundation
 Colby College           1371.0                     1 Foundation

📘 Thrive Colleges:
                               college_name  college_avg_SAT  num_similar_students category
                           Emory University           1395.0                     3   Thrive
Georgia Institute of Technology-Main Campus           1410.0                     3   Thrive
          University of California-Berkeley           1405.0                     3   Thrive

📘 Aspire Colleges:
         college_name  college_avg_SAT  num_similar_students category
Georgetown University           1420.0                     1   Aspire
     Grinnell College           1425.0                     1   Aspire
