<a href="https://colab.research.google.com/github/ajeunel/ajeunel/blob/main/LogisticRegressionHW9_11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Cleaning dataset

In [1]:
## Cleaning the dataset to only include demographic factors, housing destinantion and total score.

import pandas as pd

# 1) Read raw data
df = pd.read_csv("SPDAT_Dataset.csv")

# 1.1) Rename 'intervention' → 'housing_destination'
df.rename(columns={'Intervention': 'Housing_destination'}, inplace=True)

# 2) Filter only Male/Female
df = df[df['Gender'].isin(['Male','Female'])]

# 3) Filter to wanted races
only_races = ['White','Black','Latino','Asian']
df = df[df['Race/Ethnicity'].isin(only_races)]

df = df.dropna(subset=['Age_group_at_assessment'])

remove_other_age_groups = ['Under 18']
df = df[~df['Age_group_at_assessment'].isin(remove_other_age_groups)]

# 5) Select only the columns you need
df = df[
    ['Unique_client_ID',
     'Race/Ethnicity',
     'Gender',
     'Age_group_at_assessment',
     'TOTAL_SCORE',
     'Housing_destination']
]

# 6) Reset index
df.reset_index(drop=True, inplace=True)

# 7) Export cleaned CSV
df.to_csv('vispdat_cleaned.csv', index=False)



## DATA PREPARATION/ ADDING DUMMY DATA


In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 1) Load and prepare
df = pd.read_csv("vispdat_cleaned.csv")
df['is_perm_housed'] = df['Housing_destination'].isin(["Permanently Housed"]).astype(int)
df["Race/Ethnicity"] = pd.Categorical(
    df["Race/Ethnicity"],
    categories=["White","Black","Latino","Asian"],
    ordered=False
)
df["Age_group_at_assessment"] = pd.Categorical(
    df["Age_group_at_assessment"],
    categories=["18-24", "26-44", "45-59", "60+"],
    ordered=False
)

# 2) Define features/target
X = df[['TOTAL_SCORE','Race/Ethnicity','Gender','Age_group_at_assessment']]
y = df['is_perm_housed']

# 3) Creating dummy data. This is lengthy but I ran into issues with encoding dropping the first catagory
#.. and not allowing me to specify the reference groups (see paper for explanation)
df['Gender_Female'] = (df['Gender'] == 'Female').astype(int)

# 3) Create race dummies, omitting 'White' as the reference
df['Race_Black']    = (df['Race/Ethnicity'] == 'Black').astype(int)
df['Race_Latino'] = (df['Race/Ethnicity'] == 'Latino').astype(int)
df['Race_Asian']    = (df['Race/Ethnicity'] == 'Asian').astype(int)

#Age Dummy data
df['Age_Youth'] = (df['Age_group_at_assessment'] == '18-24').astype(int)
df['Age_GenX'] = (df['Age_group_at_assessment'] == '45-59').astype(int)
df['Age_Boomer'] = (df['Age_group_at_assessment'] == '60+').astype(int)


# 4) Now df has four new columns: Gender_Male, Race_Black, Race_Hispanic, Race_Asian

print(df[['Gender_Female', 'Race_Black', 'Race_Latino', 'Race_Asian','Age_Youth','Age_GenX','Age_Boomer']].head())
df.to_csv('SPDAT_scores_with_dummy.csv', index=False)


   Gender_Female  Race_Black  Race_Latino  Race_Asian  Age_Youth  Age_GenX  \
0              1           0            1           0          0         0   
1              0           0            1           0          0         0   
2              0           0            0           0          0         1   
3              0           1            0           0          0         1   
4              1           1            0           0          0         0   

   Age_Boomer  
0           0  
1           0  
2           0  
3           0  
4           1  


In [3]:
# 4) Fit & inspect coefficients
import pandas as pd
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

# 1) Load your filtered, dummy-augmented DataFrame
df = df.drop(columns=["Gender", "Race/Ethnicity","Age_group_at_assessment"])

# 3) Define X & y
features = [
    "Gender_Female",
    "Race_Black",
    "Race_Latino",
    "Race_Asian",
    "TOTAL_SCORE",
    "Age_Youth",
    "Age_GenX",
    "Age_Boomer"
]
X = df[features]
y = df['Housing_destination'].isin(["Permanently Housed"]).astype(int)

# 4a) Quick sklearn fit
clf = LogisticRegression(max_iter=1000)
clf.fit(X, y)

print("Sklearn coefficients:")
for feat, coef in zip(features, clf.coef_[0]):
    print(f"  {feat}: {coef:.3f}")

# 4b) Or for a statsmodels summary
X2 = sm.add_constant(X)   # adds intercept term
logit = sm.Logit(y, X2).fit(disp=False)
print(logit.summary())



Sklearn coefficients:
  Gender_Female: 0.133
  Race_Black: 0.315
  Race_Latino: -0.074
  Race_Asian: 0.012
  TOTAL_SCORE: 0.006
  Age_Youth: 0.414
  Age_GenX: 0.384
  Age_Boomer: 0.597
                            Logit Regression Results                           
Dep. Variable:     Housing_destination   No. Observations:               129294
Model:                           Logit   Df Residuals:                   129285
Method:                            MLE   Df Model:                            8
Date:                 Mon, 28 Apr 2025   Pseudo R-squ.:                 0.01471
Time:                         11:36:07   Log-Likelihood:                -65571.
converged:                        True   LL-Null:                       -66550.
Covariance Type:             nonrobust   LLR p-value:                     0.000
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const      

## Logistic Regression

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model     import LogisticRegression
from sklearn.metrics          import (
    confusion_matrix,
    classification_report,
    roc_auc_score,
    roc_curve
)
import matplotlib.pyplot       as plt

# a) Split out a test set - 80/20 split
X = df[["Gender_Female","Race_Black","Race_Latino","Race_Asian","Age_Youth","Age_GenX","Age_Boomer","TOTAL_SCORE"]]
y = df['Housing_destination'].isin(["Permanently Housed"]).astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# b) Fit logistic regression
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# c) Predict and get probabilities
y_pred     = clf.predict(X_test)
y_proba    = clf.predict_proba(X_test)[:,1]

# d) Confusion matrix & classification report
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", cm, "\n")
print(classification_report(y_test, y_pred,
      target_names=["Not Permanently Housed","Permanently Housed"]))

# e) ROC AUC
auc = roc_auc_score(y_test, y_proba)
print(f"Test AUC: {auc:.3f}")




Confusion matrix:
 [[20414     0]
 [ 5445     0]] 

                        precision    recall  f1-score   support

Not Permanently Housed       0.79      1.00      0.88     20414
    Permanently Housed       0.00      0.00      0.00      5445

              accuracy                           0.79     25859
             macro avg       0.39      0.50      0.44     25859
          weighted avg       0.62      0.79      0.70     25859

Test AUC: 0.586


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Module 9 Homework: KNN and SVM

In [5]:
##KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Use stanard scaler
scaler = StandardScaler()

# Fit the scaler
X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

y_proba_knn = knn.predict_proba(X_test_scaled)[:, 1]
print("KNN AUC:", roc_auc_score(y_test, y_proba_knn))

# d) Confusion matrix & classification report
print(classification_report(y_test, y_pred,
      target_names=["Not Permanently Housed","Permanently Housed"]))


KNN AUC: 0.533759102105246
                        precision    recall  f1-score   support

Not Permanently Housed       0.79      1.00      0.88     20414
    Permanently Housed       0.00      0.00      0.00      5445

              accuracy                           0.79     25859
             macro avg       0.39      0.50      0.44     25859
          weighted avg       0.62      0.79      0.70     25859



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score

# 1) Instantiate a linear SVM (no kernels → much faster)
svm = LinearSVC(C=1.0, max_iter=10_000)

# 2) Fit on your scaled training data
svm.fit(X_train_scaled, y_train)

# 3) Score via decision function (for AUC)
y_scores = svm.decision_function(X_test_scaled)

# 4) Compute AUC
print("Linear SVM AUC:", roc_auc_score(y_test, y_scores))
# d) Confusion matrix & classification report
print(classification_report(y_test, y_pred,
      target_names=["Not Permanently Housed","Permanently Housed"]))


Linear SVM AUC: 0.5860564190854455
                        precision    recall  f1-score   support

Not Permanently Housed       0.83      0.57      0.68     20414
    Permanently Housed       0.26      0.57      0.36      5445

              accuracy                           0.57     25859
             macro avg       0.55      0.57      0.52     25859
          weighted avg       0.71      0.57      0.61     25859



#Module 10 Homework: Tree Based Models

In [7]:
#decision tree
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

y_proba_dt = dt.predict_proba(X_test)[:, 1]   # use un-scaled for trees
print("Decision Tree AUC:", roc_auc_score(y_test, y_proba_dt))

# Confusion matrix and report
print(classification_report(y_test, y_pred,
      target_names=["Not Permanently Housed","Permanently Housed"]))


Decision Tree AUC: 0.5920721820483126
                        precision    recall  f1-score   support

Not Permanently Housed       0.79      1.00      0.88     20414
    Permanently Housed       0.00      0.00      0.00      5445

              accuracy                           0.79     25859
             macro avg       0.39      0.50      0.44     25859
          weighted avg       0.62      0.79      0.70     25859



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
from sklearn.ensemble      import RandomForestClassifier
from sklearn.metrics       import confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit a Random Forest with class weights
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    class_weight="balanced",
    random_state=42
)
rf.fit(X_train, y_train)

y_pred  = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:,1]

print("RF AUC:",   roc_auc_score(y_test, y_proba))
# Confusion matrix and report
print(classification_report(y_test, y_pred,
      target_names=["Not Permanently Housed","Permanently Housed"]))

RF AUC: 0.5916580502604354
                        precision    recall  f1-score   support

Not Permanently Housed       0.83      0.57      0.68     20414
    Permanently Housed       0.26      0.57      0.36      5445

              accuracy                           0.57     25859
             macro avg       0.55      0.57      0.52     25859
          weighted avg       0.71      0.57      0.61     25859



## Module 11 Homework: ADABoost, GradientBoost,XBBoost

In [13]:
#ada boost
from sklearn.ensemble import AdaBoostClassifier

ab = AdaBoostClassifier(n_estimators=100, random_state=42)
ab.fit(X_train, y_train)

y_proba_ab = ab.predict_proba(X_test)[:, 1]
print("AdaBoost AUC:", roc_auc_score(y_test, y_proba_ab))

# Confusion matrix and report
print(classification_report(y_test, y_pred,
      target_names=["Not Permanently Housed","Permanently Housed"]))

AdaBoost AUC: 0.5909572852063298
                        precision    recall  f1-score   support

Not Permanently Housed       0.83      0.57      0.68     20414
    Permanently Housed       0.26      0.57      0.36      5445

              accuracy                           0.57     25859
             macro avg       0.55      0.57      0.52     25859
          weighted avg       0.71      0.57      0.61     25859



In [14]:
#gradient boost
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb.fit(X_train, y_train)

y_proba_gb = gb.predict_proba(X_test)[:, 1]
print("Gradient Boosting AUC:", roc_auc_score(y_test, y_proba_gb))

# Confusion matrix and report
print(classification_report(y_test, y_pred,
      target_names=["Not Permanently Housed","Permanently Housed"]))


Gradient Boosting AUC: 0.59581994315466
                        precision    recall  f1-score   support

Not Permanently Housed       0.83      0.57      0.68     20414
    Permanently Housed       0.26      0.57      0.36      5445

              accuracy                           0.57     25859
             macro avg       0.55      0.57      0.52     25859
          weighted avg       0.71      0.57      0.61     25859



In [15]:
##xg boost
from xgboost import XGBClassifier

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)

y_proba_xgb = xgb.predict_proba(X_test)[:, 1]
print("XGBoost AUC:", roc_auc_score(y_test, y_proba_xgb))

# Confusion matrix and report
print(classification_report(y_test, y_pred,
      target_names=["Not Permanently Housed","Permanently Housed"]))

Parameters: { "use_label_encoder" } are not used.



XGBoost AUC: 0.5937668948811035
                        precision    recall  f1-score   support

Not Permanently Housed       0.83      0.57      0.68     20414
    Permanently Housed       0.26      0.57      0.36      5445

              accuracy                           0.57     25859
             macro avg       0.55      0.57      0.52     25859
          weighted avg       0.71      0.57      0.61     25859

