In [38]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import folium
from sklearn.neighbors import BallTree
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [4]:
# Upload the data file
df = pd.read_csv("df_cleaned.csv")

# Remove homes with unknown service line material
df = df[df["System-Owned Portion Service Line Material Classification "] != "MU"]

# Create binary classification for lead or non-lead pipes
df["is_lead"] = (df["System-Owned Portion Service Line Material Classification "]=="PB").astype(int)

  df = pd.read_csv("df_cleaned.csv")


In [15]:
# Logistic regression on the data set 

# Features and target
X = df[['YEARBLT']]  # Make sure it's a 2D array
y = df['is_lead']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit model
model = LogisticRegression(class_weight='balanced')
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.87      0.93     36936
           1       0.38      0.89      0.54      3340

    accuracy                           0.87     40276
   macro avg       0.69      0.88      0.73     40276
weighted avg       0.94      0.87      0.89     40276



In [None]:
# comprare to random guess
lead_counts_city = df["System-Owned Portion Service Line Material Classification "].eq('PB').sum()
nonlead_counts_city = df["System-Owned Portion Service Line Material Classification "].isin(['CU','OT','PL']).sum()
gal_counts_city = df["System-Owned Portion Service Line Material Classification "].eq('GAL').sum()
p_lead = lead_counts_city / (lead_counts_city+nonlead_counts_city+gal_counts_city)

# Create table of random guess stats
TP_guess = p_lead**2
FP_guess = p_lead*(1-p_lead)
FN_guess = (1-p_lead)*p_lead
TN_guess = (1-p_lead)**2

accuracy_guess = p_lead**2 + (1-p_lead)**2
precision_guess = p_lead
recall_guess = p_lead
specificity_guess = TN_guess/(TN_guess+FP_guess)

print(f"Lead pipes prevalence: \t \t {p_lead*100:.2f}%")
print(f"Non-lead pipes prevalence: \t {(1-p_lead)*100:.2f}%")
print(f"Random Guess Accuracy: \t \t {accuracy_guess*100:.2f}%")


Lead pipes prevalence: 	 	 8.56%
Non-lead pipes prevalence: 	 91.44%
Random Guess Accuracy: 	 	 84.35%


In [None]:
# Features and target
X = df[['YEARBLT']]
y = df['is_lead']

# Initialize Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store metrics
accuracies = []
precisions = []
recalls = []
f1s = []
aucs = []

# Perform cross-validation
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # normalize/scale YEARBLT
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Initialize and fit logistic regression model
    model = LogisticRegression(class_weight='balanced')  # Useful if target is imbalanced
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Evaluate
    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))
    aucs.append(roc_auc_score(y_test, y_proba))

# Print average results
print("Average Metrics:")
print(f"Accuracy:  {np.mean(accuracies):.3f} ({np.mean(accuracies)-accuracy_guess:.3f})")
print(f"Precision: {np.mean(precisions):.3f} ({np.mean(precisions)-precision_guess:.3f})")
print(f"Recall:    {np.mean(recalls):.3f} ({np.mean(recalls)-recall_guess:.3f})")
#print(f"F1 Score:  {np.mean(f1s):.3f} ({np.mean(f1s)-accuracy_guess):.3f})")
#print(f"ROC AUC:   {np.mean(aucs):.3f} ({np.mean(accuracies-accuracy_guess):.3f})")

Average Metrics:
Accuracy:  0.873 (0.030)
Precision: 0.394 (0.308)
Recall:    0.887 (0.801)
