In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier , plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.impute import SimpleImputer  
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Read the math placement data from the Excel file
df_math_placement = pd.read_excel('math_placement2024-02-29.xlsx')

# Read the edready raw scores data from the CSV file
df_ed_ready = pd.read_csv('edready_raw_scores.csv')

# Load the file of incoming students
#incoming_students = pd.read_csv('incoming_students.csv')will need to change to correct file name


In [2]:
# Create a new dataframe to store the cleaned and imputed data
df_cleaned = df_math_placement.copy()

# Define the mapping of grades to grade categories
grade_mapping = {
    "A": "Successful", 
    "B": "Successful", 
    "B+": "Successful",
    "A-": "Successful", 
    "B*": "Successful", 
    "A*": "Successful",
    "B+*": "Successful", 
    "W": "Unsuccessful", 
    "C-": "Unsuccessful",
    "F": "Unsuccessful", 
    "B-": "Successful", 
    "C+": "Unsuccessful",
    "D": "Unsuccessful", 
    "D*": "Unsuccessful", 
    "C": "Unsuccessful",
    "W*": "Unsuccessful", 
    "P*": "Successful", 
    "C+*": "Unsuccessful",
    "F*": "Unsuccessful", 
    "D+": "Unsuccessful", 
    "P": "Successful",
    "C*": "Unsuccessful", 
    "A-*": "Successful", 
    "I": "Unsuccessful",
    "AU": "Ignore", 
    "B-*": "Successful",
    "D-": "Unsuccessful",
    "NR": "Ignore", 
    "C-*": "Unsuccessful", 
    "I*": "Unsuccessful",
    "NR*": "Ignore"
}

# Map the grades to their categories and append as a new column
df_cleaned['grade_category'] = df_cleaned['grade'].map(grade_mapping)


# Define base courses and adjust the list based on 'campus_code'
base_courses = ['088', '216Q', '132', '161Q', '151Q', '165Q', '171Q']
courses_to_check = ['105Q', '090', '121Q']

# Define a function to determine the course level based on the course number and campus code
for course in courses_to_check:
    if df_cleaned[(df_cleaned['campus_code'] != 'ZGC') & (df_cleaned['course_number'] == course)].shape[0] > 0:
        base_courses.append(course)

# Define course combinations
course_combinations = {
    'Combo1': ('005', '105Q'),
    'Combo2': ('063', '090'),
    'Combo3': ('021', '121Q')
}

# Map courses and combinations to levels
course_levels = {
    '088': 100,
    'Combo2': 100,  # ('063', '090')
    'Combo1': 150,  # ('005', '105Q')
    '090': 150,
    'Combo3': 250,  # ('021', '121Q')
    '105Q': 290,
    '216Q': 300,
    '132': 300,
    '121Q': 300,
    '161Q': 400,
    '151Q': 400,
    '165Q': 500,
    '171Q': 500
}

# Function to determine course level based on the course number or combination
def determine_course_level(course_number, campus_code):
    # Handle special cases for '105Q', '090', '121Q' based on 'campus_code'
    if course_number in ['105Q', '090', '121Q'] and campus_code == 'ZGC':
        # Exclude these courses for ZGC campus
        return None
    for combo_name, combo_courses in course_combinations.items():
        if course_number in combo_courses:
            # Return the level for the course combination
            return course_levels[combo_name]
    # Return the level for individual courses
    return course_levels.get(course_number)

# Apply the function to each row in df_cleaned to create a new 'course_level' column
df_cleaned['course_level'] = df_cleaned.apply(lambda row: determine_course_level(row['course_number'], row['campus_code']), axis=1)


# Removing the last 4 characters of each value in 'test_score' column
df_cleaned['test_score'] = df_cleaned['test_score'].astype(str).str[:-4]

# Make sure the column names match in both dataframes before merging
df_cleaned.rename(columns={'pidm': 'PIDM'}, inplace=True)

# Create a new column to store the PIDM, course number, and term as a single string
df_cleaned['PIDM_course_number_term'] = df_cleaned['PIDM'].astype(str) + '_' + df_cleaned['course_number'].astype(str) + '_' + df_cleaned['term'].astype(str)

# Remove duplicates based on the new column
df_cleaned = df_cleaned.drop_duplicates(subset='PIDM_course_number_term')

# Display the cleaned and imputed dataframe
df_cleaned.head()





Unnamed: 0.1,Unnamed: 0,PIDM,term_code,gid,admit_term,degree.x,major_code,college.x,major.x,stu_type,...,section_number,subj_code,section_type,credit_levl,course_credits,reg_status,grade,grade_category,course_level,PIDM_course_number_term
8,30,149852,202070,-110755,202070.0,Associate of Arts,AA,Gallatin College,Associate of Arts,T,...,922,M,L,UG,1,RE,A*,Successful,100.0,149852_063_202270
9,35,210982,202030,-171885,202030.0,Associate of Science,AS,Gallatin College,Associate of Science,N,...,2,STAT,L,UG,3,RW,B,Successful,300.0,210982_216Q_202170
11,39,324739,202130,-1053980,202130.0,Associate of Arts,AA,Gallatin College,Associate of Arts,T,...,2,M,L,UG,3,RW,A,Successful,150.0,324739_105Q_202270
13,43,372115,202170,-1097291,202170.0,Bachelor of Science,FNSC,College of Education/HHD,Food and Nutrition,T,...,7,M,L,UG,4,RW,A-,Successful,400.0,372115_161Q_202170
15,47,376132,202170,-1100963,202170.0,Bachelor of Science,PBAC,College of Business,Pre-Business,T,...,2,M,L,UG,4,W,W,Unsuccessful,400.0,376132_161Q_202270


In [3]:

# Convert 'term' to a numeric type to ensure correct sorting
df_cleaned['term'] = pd.to_numeric(df_cleaned['term'])

# Find the oldest term for each PIDM
oldest_terms = df_cleaned.groupby('PIDM')['term'].min().reset_index()

# Merge the oldest terms back to the original DataFrame to filter records
df_oldest = pd.merge(df_cleaned, oldest_terms, on=['PIDM', 'term'], how='inner')

# Now, df_oldest contains only the records of the oldest term for each PIDM,
# including cases where there are multiple records for a PIDM within that oldest term

# Sort PIDM
df_oldest.sort_values(by=['PIDM', 'term'], ascending=[True, True], inplace=True)

# Show the updated DataFrame
df_oldest.head()


Unnamed: 0.1,Unnamed: 0,PIDM,term_code,gid,admit_term,degree.x,major_code,college.x,major.x,stu_type,...,section_number,subj_code,section_type,credit_levl,course_credits,reg_status,grade,grade_category,course_level,PIDM_course_number_term
0,30,149852,202070,-110755,202070.0,Associate of Arts,AA,Gallatin College,Associate of Arts,T,...,922,M,L,UG,1,RE,A*,Successful,100.0,149852_063_202270
1,35,210982,202030,-171885,202030.0,Associate of Science,AS,Gallatin College,Associate of Science,N,...,2,STAT,L,UG,3,RW,B,Successful,300.0,210982_216Q_202170
2,39,324739,202130,-1053980,202130.0,Associate of Arts,AA,Gallatin College,Associate of Arts,T,...,2,M,L,UG,3,RW,A,Successful,150.0,324739_105Q_202270
3,43,372115,202170,-1097291,202170.0,Bachelor of Science,FNSC,College of Education/HHD,Food and Nutrition,T,...,7,M,L,UG,4,RW,A-,Successful,400.0,372115_161Q_202170
4,49,376132,202170,-1100963,202170.0,Bachelor of Science,PBAC,College of Business,Pre-Business,T,...,921,M,L,UG,2,RW,A*,Successful,100.0,376132_063_202230


In [4]:
# Trim whitespace from headers
df_oldest.columns = df_oldest.columns.str.strip()
df_ed_ready.columns = df_ed_ready.columns.str.strip()

# Merge the two dataframes based on the PIDM
df_merged = pd.merge(df_oldest, df_ed_ready[['PIDM', 'ERM_SCORE']], on='PIDM', how='left')

# Display the merged dataframe
df_merged.head()


Unnamed: 0.1,Unnamed: 0,PIDM,term_code,gid,admit_term,degree.x,major_code,college.x,major.x,stu_type,...,subj_code,section_type,credit_levl,course_credits,reg_status,grade,grade_category,course_level,PIDM_course_number_term,ERM_SCORE
0,30,149852,202070,-110755,202070.0,Associate of Arts,AA,Gallatin College,Associate of Arts,T,...,M,L,UG,1,RE,A*,Successful,100.0,149852_063_202270,10.0
1,35,210982,202030,-171885,202030.0,Associate of Science,AS,Gallatin College,Associate of Science,N,...,STAT,L,UG,3,RW,B,Successful,300.0,210982_216Q_202170,
2,39,324739,202130,-1053980,202130.0,Associate of Arts,AA,Gallatin College,Associate of Arts,T,...,M,L,UG,3,RW,A,Successful,150.0,324739_105Q_202270,30.0
3,43,372115,202170,-1097291,202170.0,Bachelor of Science,FNSC,College of Education/HHD,Food and Nutrition,T,...,M,L,UG,4,RW,A-,Successful,400.0,372115_161Q_202170,40.0
4,49,376132,202170,-1100963,202170.0,Bachelor of Science,PBAC,College of Business,Pre-Business,T,...,M,L,UG,2,RW,A*,Successful,100.0,376132_063_202230,10.0


In [5]:

# Define thresholds for single scores
erm_score_thresholds = {
    100: 10,
    150: 15,
    250: 25,
    290: 30,
    300: 30,
    400: 40,
    500: 50,
}

# Define thresholds for SAT and ACT as tuples (min, max)
sat_new_math_thresholds = {
    100: (None, 460),
    150: (460, 529),
    250: (530, 539),
    290: (540, 559),
    300: (560, 589),
    400: (590, 639),
    500: (640, 640),
}
act_math_thresholds = {
    100: (None, 17),
    150: (17, 20),
    250: (21, 21),
    290: (22, 22),
    300: (23, 24),
    400: (25, 26),
    500: (27, 27),
}

# Define dual thresholds for SAT new math and high school GPA
dual_sat_gpa_thresholds = {
    150: (380, 519, None, 3.0),
    250: (520, 529, 3.01, 3.5),
    290: (530, 530, 3.01, 3.5),
    300: (530, 559, 3.01, 3.5),
    400: (560, 580, 3.7, 4.5),
}

# Define dual thresholds for ACT math and high school GPA
dual_act_gpa_thresholds = {
    150: (15, 19, None, 3.0),
    250: (20, 20, 3.01, 3.5),
    290: (21, 21, 3.01, 3.5),
    300: (21, 22, 3.01, 3.5),
    400: (23, 24, 3.7, 4.5),
}
# The compliance checking function
def check_compliance(row):
    level = row['test_score']
    erm_score = row.get('ERM_SCORE', 0)
    sat_score = row.get('sat_new_math', 0)
    act_score = row.get('act_math', 0)
    gpa = row.get('hs_gpa', 0)
    
    # Check ERM compliance
    if erm_score < erm_score_thresholds.get(level, float('inf')):
        return 'Uncompliant'
    
    # SAT score compliance
    sat_range = sat_new_math_thresholds.get(level, (None, None))
    if sat_score < (sat_range[0] if sat_range[0] is not None else float('-inf')) or \
       sat_score > (sat_range[1] if sat_range[1] is not None else float('inf')):
        return 'Uncompliant'
    
    # ACT score compliance
    act_range = act_math_thresholds.get(level, (None, None))
    if act_score < (act_range[0] if act_range[0] is not None else float('-inf')) or \
       act_score > (act_range[1] if act_range[1] is not None else float('inf')):
        return 'Uncompliant'
    
    # Dual score compliance checks need to handle GPA and SAT/ACT together
    sat_gpa_range = dual_sat_gpa_thresholds.get(level, (None, None, None, None))
    act_gpa_range = dual_act_gpa_thresholds.get(level, (None, None, None, None))

    # SAT & GPA compliance
    if (sat_gpa_range[0] is None or sat_score >= sat_gpa_range[0]) and \
       (sat_gpa_range[1] is None or sat_score <= sat_gpa_range[1]) and \
       (sat_gpa_range[2] is None or gpa >= sat_gpa_range[2]) and \
       (sat_gpa_range[3] is None or gpa <= sat_gpa_range[3]):
        return 'Compliant'

    # ACT & GPA compliance
    if (act_gpa_range[0] is None or act_score >= act_gpa_range[0]) and \
       (act_gpa_range[1] is None or act_score <= act_gpa_range[1]) and \
       (act_gpa_range[2] is None or gpa >= act_gpa_range[2]) and \
       (act_gpa_range[3] is None or gpa <= act_gpa_range[3]):
        return 'Compliant'
    
    return 'Uncompliant'

# Apply the compliance checking function to each row of the DataFrame
df_merged['Compliance'] = df_merged.apply(check_compliance, axis=1)


# Output the DataFrame to check the compliance results
print(df_merged[['test_score', 'ERM_SCORE', 'sat_new_math', 'act_math', 'hs_gpa', 'Compliance']])


df_merged = df_merged[df_merged['Compliance'] == 'Compliant']


      test_score  ERM_SCORE  sat_new_math  act_math  hs_gpa   Compliance
0            100       10.0           NaN       NaN    1.72  Uncompliant
1            100        NaN           NaN       NaN    2.50    Compliant
2            300       30.0           NaN       NaN     NaN  Uncompliant
3            400       40.0           NaN       NaN     NaN  Uncompliant
4            100       10.0           NaN       NaN     NaN  Uncompliant
...          ...        ...           ...       ...     ...          ...
12353        300       30.0           NaN       NaN     NaN  Uncompliant
12354        300       30.0           NaN       NaN     NaN  Uncompliant
12355        400       40.0           NaN       NaN     NaN  Uncompliant
12356        300       30.0           NaN       NaN    3.00  Uncompliant
12357        250        NaN           NaN      20.0    4.30    Compliant

[12358 rows x 6 columns]


In [6]:
# Count the number of rows in the merged dataframe
num_rows = df_merged.shape[0]
print(f"Number of rows in df_merged: {num_rows}")


Number of rows in df_merged: 7359


In [7]:

# Count the occurrences of each grade category
grade_counts = df_merged['grade_category'].value_counts()

# Display the count of each grade category
print(grade_counts)


grade_category
Successful      4512
Unsuccessful    2811
Ignore            25
Name: count, dtype: int64


In [8]:

# Group by 'course_level' and then count occurrences of each 'grade_category' within each level
grade_category_distribution = df_merged.groupby('test_score')['grade_category'].value_counts().unstack(fill_value=0)


# Add a new column to store the ratio of successful students
grade_category_distribution['successful_ratio'] = grade_category_distribution['Successful'] / (grade_category_distribution['Ignore'] + grade_category_distribution['Successful'] + grade_category_distribution['Unsuccessful'])


# Display the distribution of grade categories within each course level
print(grade_category_distribution)


grade_category  Ignore  Successful  Unsuccessful  successful_ratio
test_score                                                        
                    15         302           252          0.530756
100                  1         140           177          0.440252
150                  5         612           640          0.486874
250                  0         201           142          0.586006
290                  0         111           108          0.506849
300                  3         723           585          0.551487
400                  1         997           497          0.666890
450                  0         185            35          0.840909
500                  0        1241           375          0.767946


In [9]:
# Group by 'course_level' and then count occurrences of each 'grade_category' within each level
grade_category_distribution = df_merged.groupby('course_number')['grade_category'].value_counts().unstack(fill_value=0)

grade_category_distribution['successful_ratio'] = grade_category_distribution['Successful'] / (grade_category_distribution['Ignore'] + grade_category_distribution['Successful'] + grade_category_distribution['Unsuccessful'])


# Display the distribution of grade categories within each course level
# Filter the grade_category_distribution for base courses and combos
filtered_distribution = grade_category_distribution.loc[base_courses + courses_to_check]

# Print the filtered grade_category_distribution
print(filtered_distribution)



grade_category  Ignore  Successful  Unsuccessful  successful_ratio
course_number                                                     
088                  0          31            27          0.534483
216Q                 0        1049           696          0.601146
132                  0          80            22          0.784314
161Q                 0         842           474          0.639818
151Q                 0         417           244          0.630862
165Q                 0          43            33          0.565789
171Q                 0         584           357          0.620616
105Q                14         219            99          0.659639
090                  0          31            39          0.442857
121Q                11         479           419          0.526953
105Q                14         219            99          0.659639
090                  0          31            39          0.442857
121Q                11         479           419          0.52

In [10]:
df_merged[['hs_gpa','act_math','sat_new_math', 'ERM_SCORE', 'test_score', 'grade_category']].describe()


Unnamed: 0,hs_gpa,act_math,sat_new_math,ERM_SCORE
count,6524.0,4443.0,2559.0,0.0
mean,3.512922,23.204366,583.442751,
std,0.498882,4.566395,73.820592,
min,0.0,12.0,310.0,
25%,3.21,20.0,540.0,
50%,3.65,24.0,580.0,
75%,3.94,26.0,630.0,
max,4.73,36.0,800.0,


### Model 1: Logistic Regression Model - Course Levels

In [11]:
# Define base courses and course combinations
base_courses = ['088', '216Q', '132', '161Q', '151Q', '165Q', '171Q']
course_combinations = {
    'Combo1': ('005', '105Q'),
    'Combo2': ('063', '090'),
    'Combo3': ('021', '121Q')
}

# Add '105Q', '090', '121Q' conditionally based on campus_code
courses_to_check = ['105Q', '090', '121Q']
for course in courses_to_check:
    if df_merged[(df_merged['campus_code'] != 'ZGC') & (df_merged['course_number'] == course)].shape[0] > 0:
        base_courses.append(course)

# Combine individual courses with combination labels for iteration
included_courses = base_courses + list(course_combinations.keys())

# Filtering DataFrame based on updated logic
def filter_df(df, course_key):
    # For combinations, select rows matching any of the combination courses
    if course_key in course_combinations:
        return df[df['course_number'].isin(course_combinations[course_key])]
    # For individual courses, simply filter by the course number
    else:
        return df[df['course_number'] == course_key]

# Define the combinations of X variables you want to explore
x_variable_combinations = [
    ['ERM_SCORE'],
    ['hs_gpa', 'ERM_SCORE'],
    ['sat_new_math'],
    ['act_math'],
    ['hs_gpa', 'sat_new_math'],
    ['hs_gpa', 'act_math']
]

# Initialize model storage with an additional level for variable combinations
models = {}

for course_key in included_courses:
    df_filtered = filter_df(df_merged, course_key)
    if df_filtered.empty:
        print(f"No data for {course_key}")
        continue

    # Iterate through each combination of X variables
    for x_vars in x_variable_combinations:
        X = df_filtered[x_vars].dropna()  # Ensure no NaN values in predictors
        y = df_filtered.loc[X.index, 'grade_category']
        
        # Drop rows with NaN in any of the specified 'X' variables
        y = df_filtered.loc[X.index, 'grade_category']
        
        # Ensure 'y' is aligned with 'X' after dropping NaNs
        X = X.dropna()
        
        
        if X.empty or y.isnull().any():
            print(f"Skipping {course_key} with variables {x_vars} due to NaN values.")
            continue

        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)

        # Check if y_encoded contains more than one unique class
        if len(np.unique(y_encoded)) < 2:
            print(f"Skipping {course_key} with variables {x_vars} because the data contains only one class after encoding.")
            continue

        X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.20, random_state=42)

        log_reg = LogisticRegression(max_iter=1000)
        try:
            log_reg.fit(X_train, y_train)
        except ValueError as e:
            print(f"Error fitting model for {course_key} with variables {x_vars}: {e}")
            continue
        
        y_pred = log_reg.predict(X_test)

        unique_y_test = set(y_test)
        unique_y_pred = set(y_pred)
        unique_classes = unique_y_test.union(unique_y_pred)
        target_names = label_encoder.inverse_transform(list(unique_classes))

        print(f'Course Level: {course_key}, Variables: {x_vars}')
        print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')
        print(classification_report(y_test, y_pred, labels=list(unique_classes), target_names=target_names, zero_division=1))
        print("--------------------\n")

        models[(course_key, tuple(x_vars))] = log_reg


Skipping 088 with variables ['ERM_SCORE'] due to NaN values.
Skipping 088 with variables ['hs_gpa', 'ERM_SCORE'] due to NaN values.
Error fitting model for 088 with variables ['sat_new_math']: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1
Course Level: 088, Variables: ['act_math']
Accuracy: 0.50
              precision    recall  f1-score   support

  Successful       0.56      0.83      0.67         6
Unsuccessful       0.00      0.00      1.00         4

    accuracy                           0.50        10
   macro avg       0.28      0.42      0.83        10
weighted avg       0.33      0.50      0.80        10

--------------------

Error fitting model for 088 with variables ['hs_gpa', 'sat_new_math']: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1
Course Level: 088, Variables: ['hs_gpa', 'act_math']
Accuracy: 0.89
              precision    recall  f1-score   support

  Su

### Model 2: Logistic Regression Model - Courses/Combos

In [12]:
# Define the combinations of X variables you want to explore
x_variable_combinations = [
    ['ERM_SCORE'],
    ['hs_gpa','ERM_SCORE'],
    ['sat_new_math'],
    ['act_math'],
    ['hs_gpa', 'sat_new_math'],
    ['hs_gpa', 'act_math']
]

# Initialize model storage with an additional level for variable combinations
models = {}

for course_key in included_courses:
    df_filtered = filter_df(df_merged, course_key)
    if df_filtered.empty:
        print(f"No data for course number {course_key}")
        continue

    # Iterate through each combination of X variables
    for x_vars in x_variable_combinations:
        # Attempt to select specified variables; handle missing variables by skipping
        try:
            X = df_filtered[x_vars]
        except KeyError:
            print(f"Skipping {course_key} due to missing one of the variables: {x_vars}")
            continue

        # Drop rows with NaN in any of the specified 'X' variables
        X = X.dropna(how='any')
        
        # Ensure 'y' is aligned with 'X' after dropping NaNs
        y = df_filtered.loc[X.index, 'grade_category'].dropna()
        
        # Ensure the same indices are used for both X and y after dropna to avoid issues in train_test_split
        X = X.loc[y.index]

        if X.empty or y.empty:
            print(f"Skipping {course_key} with variables {x_vars} due to insufficient data after dropping NaNs.")
            continue

        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)

        # Check if y_encoded contains more than one unique class
        if len(np.unique(y_encoded)) < 2:
            print(f"Skipping {course_key} with variables {x_vars} because the data contains only one class after encoding.")
            continue

        X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.20, random_state=42)

        log_reg = LogisticRegression(max_iter=1000)
        try:
            log_reg.fit(X_train, y_train)
        except ValueError as e:
            print(f"Error fitting model for {course_key} with variables {x_vars}: {e}")
            continue

        y_pred = log_reg.predict(X_test)

        unique_y_test = set(y_test)
        unique_y_pred = set(y_pred)
        unique_classes = unique_y_test.union(unique_y_pred)
        target_names = label_encoder.inverse_transform(list(unique_classes))

        print(f'Course/Combo: {course_key}, Variables: {x_vars}')
        print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')
        print(classification_report(y_test, y_pred, labels=list(unique_classes), target_names=target_names, zero_division=1))
        print("--------------------\n")

        # Store the trained model with variable combination as part of the key
        models[(course_key, tuple(x_vars))] = log_reg



Skipping 088 with variables ['ERM_SCORE'] due to insufficient data after dropping NaNs.
Skipping 088 with variables ['hs_gpa', 'ERM_SCORE'] due to insufficient data after dropping NaNs.
Error fitting model for 088 with variables ['sat_new_math']: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1
Course/Combo: 088, Variables: ['act_math']
Accuracy: 0.50
              precision    recall  f1-score   support

  Successful       0.56      0.83      0.67         6
Unsuccessful       0.00      0.00      1.00         4

    accuracy                           0.50        10
   macro avg       0.28      0.42      0.83        10
weighted avg       0.33      0.50      0.80        10

--------------------

Error fitting model for 088 with variables ['hs_gpa', 'sat_new_math']: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1
Course/Combo: 088, Variables: ['hs_gpa', 'act_math']
Accuracy: 0.89
     

## Model 3: Random Forest

In [13]:
# Now prepare your predictors (X) and target (y) using the cleaned DataFrame
X = df_merged[['hs_gpa', 'ERM_SCORE']]
y = df_merged['course_level']

# Ensure no NaN values in predictors
X = X.dropna()

# Training and testing split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a decision tree classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict the target
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))



ValueError: Found input variables with inconsistent numbers of samples: [0, 7359]

## Model 4:  Decision Tree

In [None]:

# Encode the target variable 'course_number'
course_number_encoder = LabelEncoder()
df_merged['encoded_course_level'] = course_number_encoder.fit_transform(df_merged['course_level'])

# Predictors
X = df_merged[['hs_gpa', 'ERM_SCORE']]

# Target variable is'course_number_encoded'
y_course_number = df_merged['encoded_course_level']

# Ensure no NaN values in predictors
X = X.dropna()


# Split the data into training and testing sets
X_train, X_test, y_train_course_number, y_test_course_number = train_test_split(X, y_course_number, test_size=0.3, random_state=42)

# Initialize the DecisionTreeClassifier with parameters to control tree complexity
clf_course_number = DecisionTreeClassifier(
    max_depth=5,               # Limit the depth of the tree
    min_samples_split=40,      # Require at least 40 samples to split a node
    min_samples_leaf=20,       # Each leaf node must contain at least 20 samples
    max_leaf_nodes=15,         # Limit the total number of leaf nodes
    random_state=42
)
# Fit the classifier to the training data
clf_course_number.fit(X_train, y_train_course_number)

# Predict on the test data
y_pred_course_number = clf_course_number.predict(X_test)


In [None]:
# Decode the predicted course numbers back to the original encoding
predicted_courses = course_number_encoder.inverse_transform(y_pred_course_number)

# Add the predicted courses to test DataFrame
X_test.loc[:,'test_score'] = predicted_courses

# Join the original 'grade_category' to the test DataFrame for evaluation
X_test = X_test.join(df_merged['grade_category'], how='left')

# Evaluate the success by checking the grade_category of the predicted placements
successful_placements = X_test[X_test['grade_category'] == 'Successful']
success_rate = len(successful_placements) / len(X_test)
print(f'Success rate of placements: {success_rate:.2f}')


In [None]:
# Visualize the decision tree
plt.figure(figsize=(40,10))
plot_tree(clf_course_number, 
          feature_names=X_train.columns, 
          class_names=course_number_encoder.classes_.astype(str),  # Ensure class names are string
          filled=True, rounded=True, 
          fontsize=12)


# Display the figure
plt.show()

## Model 5: XG Boost

In [None]:

#Encode the target variable
course_number_encoder = LabelEncoder()
df_merged['course_level_encoded'] = course_number_encoder.fit_transform(df_merged['test_score'])

#Prepare the data
X = df_merged[['ERM_SCORE', 'hs_gpa']]
y = df_merged['course_level_encoded'] 

# Ensure no NaN values in predictors
X = X.dropna()

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Continue with initializing and fitting your XGBoost classifier
xgb_model_course_number = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model_course_number.fit(X_train, y_train)

# Predict on the test data
y_pred_course_number = xgb_model_course_number.predict(X_test)

# Decode the predicted course numbers back to the original course numbers for interpretability
predicted_course_numbers = course_number_encoder.inverse_transform(y_pred_course_number)

# Evaluate the predictions
accuracy = accuracy_score(y_test, y_pred_course_number)  # This comparison should now be valid
print(f'Accuracy of the XGBoost model for course number prediction: {accuracy:.2f}')


In [None]:
# Convert the unique encoded labels back to original string labels
# Ensure original_class_labels are strings
original_class_labels_str = [str(label) for label in course_number_encoder.inverse_transform(df_merged['course_level_encoded'].unique())]

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_course_number, target_names=original_class_labels_str))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_course_number)
plt.figure(figsize=(10,7))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=course_number_encoder.classes_, yticklabels=course_number_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()