In [4]:
# imports

import torch
import pandas as pd
import matplotlib.pyplot as plt

from utils import ProjectFolders

In [5]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

# Load the dataset
df_final = pd.read_csv(ProjectFolders.FINAL_DATASET_FILE)

# Column names of the two targets
DECILE_LABEL = 'decile_score'
TWO_YEAR_REC_LABEL = 'two_year_recid'

# Extract labels and features. The decile score is taken exactly as stored in
# the CSV; no rescaling is applied in this cell.
y = df_final[DECILE_LABEL].values
y_two_year_recid = df_final[TWO_YEAR_REC_LABEL].values  # Binary label
X = df_final.drop(columns=[DECILE_LABEL, TWO_YEAR_REC_LABEL]).values  # Features

# First, split into train+validation (80%) and test (20%) datasets.
# Passing all three arrays to one call keeps their rows aligned.
X_train_val, X_test, y_train_val, y_test, y_two_year_recid_train_val, y_two_year_recid_test = train_test_split(
    X, y, y_two_year_recid, test_size=0.2, random_state=42
)

# Then, split train+validation into train and validation datasets.
# 0.25 of the 80% train+validation set = 20% of the total, giving a 60/20/20 split.
X_train, X_val, y_train, y_val, y_two_year_recid_train, y_two_year_recid_val = train_test_split(
    X_train_val, y_train_val, y_two_year_recid_train_val, test_size=0.25, random_state=42
)

# Sanity check: the three splits must partition the full dataset.
assert len(X_train) + len(X_val) + len(X_test) == len(df_final)

# Convert datasets to PyTorch tensors. A later cell reads y_train_tensor and
# y_two_year_recid_train_tensor, so these must be defined on a fresh kernel.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Convert the binary labels (two_year_recid) to tensors
y_two_year_recid_train_tensor = torch.tensor(y_two_year_recid_train, dtype=torch.float32)
y_two_year_recid_val_tensor = torch.tensor(y_two_year_recid_val, dtype=torch.float32)
y_two_year_recid_test_tensor = torch.tensor(y_two_year_recid_test, dtype=torch.float32)


In [6]:
from utils import compare_score


# Run the project helper on the training split; judging by the names, it
# compares the decile-score tensor against the two-year recidivism tensor
# (presumably producing the confusion-matrix style summary — TODO confirm in utils).
# NOTE(review): both tensors must already exist in the kernel; confirm the cell
# that builds them is active (not commented out) before running this one.
compare_score(y_train_tensor, y_two_year_recid_train_tensor)




TN: 1795
FP: 540
FN: 941
TP: 1052
Senstivity: 0.5278474661314602
Specificity: 0.7687366167023555
Precision: 0.6608040201005025
Accuracy: 0.6578096118299446


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score



# Baseline k-NN regressor with a hand-picked neighbourhood size.
k = 10
knn = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)  # fit() returns the estimator itself

# Score the held-out test split.
y_pred = knn.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both regression metrics.
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)


from sklearn.model_selection import GridSearchCV

# Search k = 1..20 with 5-fold cross-validation on the training split.
# The scorer is negated MSE, so the sign is flipped again when reporting.
param_grid = dict(n_neighbors=range(1, 21))
grid_search = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
).fit(X_train, y_train)

print("Best k:", grid_search.best_params_['n_neighbors'])
print("Best cross-validated MSE:", -grid_search.best_score_)



Mean Squared Error (MSE): 5.3094802494802495
R-squared (R2): 0.3769249115066393
Best k: 19
Best cross-validated MSE: 4.883754652782956


In [11]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score



# Depth-limited regression tree, seeded for reproducibility and fitted in one step.
tree_reg = DecisionTreeRegressor(max_depth=5, random_state=42).fit(X_train, y_train)

# Predict on the held-out test split and compute the regression metrics.
y_pred = tree_reg.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)

Mean Squared Error (MSE): 5.103915162245261
R-squared (R2): 0.4010482680126972


In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report



# Initialize and train the decision tree classifier.
# The target passed here is y_train (the decile score), so each distinct score
# value is treated as its own unordered class.
tree_clf = DecisionTreeClassifier(max_depth=5, random_state=42)
tree_clf.fit(X_train, y_train)

# Make predictions on the held-out test split
y_pred = tree_clf.predict(X_test)

# Evaluate the model.
# zero_division=0 reports 0.0 for classes the tree never predicts (same values
# as the default) without emitting an UndefinedMetricWarning for each metric.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))


from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the decision tree classifier: 3 x 3 x 3 combinations,
# each scored by accuracy under 5-fold cross-validation on the training split.
param_grid = dict(
    max_depth=[3, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
)
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
).fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)


Accuracy: 0.2591822591822592
Classification Report:
               precision    recall  f1-score   support

           1       0.42      0.85      0.57       291
           2       0.15      0.16      0.16       197
           3       0.17      0.02      0.03       167
           4       0.11      0.19      0.14       119
           5       0.00      0.00      0.00       145
           6       0.15      0.24      0.18       125
           7       0.15      0.19      0.17       106
           8       0.15      0.07      0.09        91
           9       0.17      0.02      0.03       111
          10       0.26      0.12      0.17        91

    accuracy                           0.26      1443
   macro avg       0.17      0.19      0.15      1443
weighted avg       0.20      0.26      0.20      1443



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Parameters: {'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 2}
