In [73]:
from pathlib import Path

import pandas as pd

# Load the data into a DataFrame
data = "data/student_data.csv"
df = pd.read_csv(data)

df['Graduate'] = df['Target'].apply(lambda x: 1 if x == 'Graduate' else 0)
df['Enrolled'] = df['Target'].apply(lambda x: 1 if x == 'Enrolled' else 0)
df['Dropout'] = df['Target'].apply(lambda x: 1 if x == 'Dropout' else 0)
df = df.drop('Enrolled', axis=1)
df = df.drop('Target', axis=1)

# TODO: Remove all rows with "Enrolled" in Target column

# Handle missing values
df.fillna(df.mean(), inplace=True)

# Convert categorical variables to numerical using one-hot encoding
df = pd.get_dummies(df, columns=['Marital status'])

# Split the data into features and target variables
targets = df[['Graduate', 'Dropout']]
student_train = df.drop(['Graduate', 'Dropout'], axis=1)

# Normalize the feature variables using Z-score normalization
student_train = (student_train - student_train.mean()) / student_train.std()

# Save the preprocessed data to a file
student_train.to_csv('output/student_train_preprocessed.csv', index=False)
targets.to_csv('output/targets.csv', index=False)


In [74]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Load the preprocessed data
student_train = pd.read_csv('output/student_train_preprocessed.csv')
targets = pd.read_csv('output/targets.csv')

# Split the data into training and test sets
X_train, X_test, y_train_grad, y_test_grad, y_train_drop, y_test_drop = train_test_split(student_train, targets['Graduate'], targets['Dropout'], test_size=0.2, random_state=42)

# Train Logistic Regression model for 'Graduate'
logreg_model_grad = LogisticRegression()
logreg_model_grad.fit(X_train, y_train_grad)

# Train Decision Tree model for 'Graduate'
dt_model_grad = DecisionTreeClassifier()
dt_model_grad.fit(X_train, y_train_grad)

# Train Random Forest model for 'Graduate'
rf_model_grad = RandomForestClassifier()
rf_model_grad.fit(X_train, y_train_grad)

# Train Logistic Regression model for 'Dropout'
logreg_model_drop = LogisticRegression()
logreg_model_drop.fit(X_train, y_train_drop)

# Train Decision Tree model for 'Dropout'
dt_model_drop = DecisionTreeClassifier()
dt_model_drop.fit(X_train, y_train_drop)

# Train Random Forest model for 'Dropout'
rf_model_drop = RandomForestClassifier()
rf_model_drop.fit(X_train, y_train_drop)

# Evaluate the models for 'Graduate'
logreg_score_grad = logreg_model_grad.score(X_test, y_test_grad)
dt_score_grad = dt_model_grad.score(X_test, y_test_grad)
rf_score_grad = rf_model_grad.score(X_test, y_test_grad)

# Evaluate the models for 'Dropout'
logreg_score_drop = logreg_model_drop.score(X_test, y_test_drop)
dt_score_drop = dt_model_drop.score(X_test, y_test_drop)
rf_score_drop = rf_model_drop.score(X_test, y_test_drop)

# Print the accuracy scores for 'Graduate'
print("Logistic Regression Accuracy (Graduate):", logreg_score_grad)
print("Decision Tree Accuracy (Graduate):", dt_score_grad)
print("Random Forest Accuracy (Graduate):", rf_score_grad)

# Print the accuracy scores for 'Dropout'
print("Logistic Regression Accuracy (Dropout):", logreg_score_drop)
print("Decision Tree Accuracy (Dropout):", dt_score_drop)
print("Random Forest Accuracy (Dropout):", rf_score_drop)


Logistic Regression Accuracy (Graduate): 0.86
Decision Tree Accuracy (Graduate): 0.8125
Random Forest Accuracy (Graduate): 0.85625
Logistic Regression Accuracy (Dropout): 0.8825
Decision Tree Accuracy (Dropout): 0.8
Random Forest Accuracy (Dropout): 0.87


In [75]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import numpy as np

kf = KFold(n_splits=4, shuffle=True)
scores = cross_val_score(logreg_model_grad, X_train, y_train_grad, cv=kf, scoring='accuracy')

for i, score in enumerate(scores):
    print(f"{i}: {score}")

print(f"Mean: {scores.mean()}")
print(f"std: {np.std(scores)}")

0: 0.85875
1: 0.84
2: 0.8325
3: 0.83375
Mean: 0.84125
std: 0.01049553476484167


In [76]:
from sklearn.metrics import confusion_matrix

y_pred = logreg_model_grad.predict(X_test);
confusion_matrix(y_true=y_test_grad, y_pred=y_pred)

array([[345,  57],
       [ 55, 343]], dtype=int64)