# Pulling in both Children with lead in their blood and Schools with lead in the plumbing 
In this notebook I first read in the data, dropped columns that did not have testable data in them, then split the data into Training and testing sets to check the accuracy of the following question:
    Does the Zip Code and location play a factor in identifying whether or not a child could have lead in their blood? 

In [49]:
# Dependencies
import numpy as np
import pandas as pd
import datetime as dt
import requests
import json
from pprint import pprint
from matplotlib import style
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler

In [50]:
#  Import and read the combined csv
df=pd.read_csv(Path('../brittany_analysis/data/combined_testing_data.csv'))
df.head()

Unnamed: 0,County,Zip,Year of Birth,Tests,Less than 5 mcg/dL,5-10 mcg/dL,10-15 mcg/dL,15+ mcg/dL,Total Elevated Blood Levels,"Rate per 1,000",...,County Location_x,School,Type of Organization,Any Building with Lead-Free Plumbing?,Number of Outlets that Require Sampling,Results ≤ 15 ppb,Results ≤ 15 ppb.1,Results ≤ 15 ppb.2,Results ≤ 15 ppb.3,County Location_y
0,Albany,12009,2019,30,30.0,0.0,0.0,0.0,0.0,0.0,...,"(42.588271, -73.974014)",ALTAMONT ES,Public School,No,63.0,63.0,0.0,63.0,0.0,"(42.678066, -73.814233)"
1,Albany,12084,2018,30,30.0,0.0,0.0,0.0,0.0,0.0,...,"(42.588271, -73.974014)",GUILDERLAND ES,Public School,No,95.0,95.0,0.0,95.0,0.0,"(42.678066, -73.814233)"
2,Albany,12084,2018,30,30.0,0.0,0.0,0.0,0.0,0.0,...,"(42.588271, -73.974014)",GUILDERLAND MS,Public School,No,218.0,218.0,0.0,218.0,0.0,"(42.678066, -73.814233)"
3,Albany,12110,2017,10,10.0,0.0,0.0,0.0,0.0,0.0,...,"(42.588271, -73.974014)",BLUE CREEK SCHOOL,Public School,No,82.0,81.0,1.0,81.0,1.0,"(42.678066, -73.814233)"
4,Albany,12110,2017,10,10.0,0.0,0.0,0.0,0.0,0.0,...,"(42.588271, -73.974014)",FORTS FERRY SCHOOL,Public School,No,82.0,81.0,1.0,81.0,1.0,"(42.678066, -73.814233)"


In [51]:
#Select your features (columns)
df.columns

Index(['County', 'Zip', 'Year of Birth', 'Tests', 'Less than 5 mcg/dL',
       '5-10 mcg/dL', '10-15 mcg/dL', '15+ mcg/dL',
       'Total Elevated Blood Levels', 'Rate per 1,000', 'Percent',
       'Zip Code Location', 'County Location_x', 'School',
       'Type of Organization', 'Any Building with Lead-Free Plumbing?',
       'Number of Outlets that Require Sampling', 'Results ≤ 15 ppb',
       'Results ≤ 15 ppb.1', 'Results ≤ 15 ppb.2', 'Results ≤ 15 ppb.3',
       'County Location_y'],
      dtype='object')

In [52]:
#Drop specific columns that are not needed 
df=df.drop(columns=['County','County Location_x', 'Zip Code Location','Type of Organization','School','County Location_y','Any Building with Lead-Free Plumbing?'])

In [53]:
df

Unnamed: 0,Zip,Year of Birth,Tests,Less than 5 mcg/dL,5-10 mcg/dL,10-15 mcg/dL,15+ mcg/dL,Total Elevated Blood Levels,"Rate per 1,000",Percent,Number of Outlets that Require Sampling,Results ≤ 15 ppb,Results ≤ 15 ppb.1,Results ≤ 15 ppb.2,Results ≤ 15 ppb.3
0,12009,2019,30,30.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,63.0,0.0,63.0,0.0
1,12084,2018,30,30.0,0.0,0.0,0.0,0.0,0.0,0.0,95.0,95.0,0.0,95.0,0.0
2,12084,2018,30,30.0,0.0,0.0,0.0,0.0,0.0,0.0,218.0,218.0,0.0,218.0,0.0
3,12110,2017,10,10.0,0.0,0.0,0.0,0.0,0.0,0.0,82.0,81.0,1.0,81.0,1.0
4,12110,2017,10,10.0,0.0,0.0,0.0,0.0,0.0,0.0,82.0,81.0,1.0,81.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4711,10805,2016,17,17.0,0.0,0.0,0.0,0.0,0.0,0.0,65.0,57.0,8.0,57.0,8.0
4712,10805,2016,17,17.0,0.0,0.0,0.0,0.0,0.0,0.0,92.0,79.0,13.0,79.0,13.0
4713,14011,2018,28,28.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,17.0,1.0,17.0,1.0
4714,14527,2017,13,13.0,0.0,0.0,0.0,0.0,0.0,0.0,44.0,43.0,1.0,43.0,1.0


In [54]:
df.columns

Index(['Zip', 'Year of Birth', 'Tests', 'Less than 5 mcg/dL', '5-10 mcg/dL',
       '10-15 mcg/dL', '15+ mcg/dL', 'Total Elevated Blood Levels',
       'Rate per 1,000', 'Percent', 'Number of Outlets that Require Sampling',
       'Results ≤ 15 ppb', 'Results ≤ 15 ppb.1', 'Results ≤ 15 ppb.2',
       'Results ≤ 15 ppb.3'],
      dtype='object')

# Split the Data into Training and Testing Sets

In [55]:
#Define the value of X and y
y=df['Zip']

X=df.drop(columns='Zip')


print(X.shape,y.shape )

(4716, 14) (4716,)


In [56]:
#Check the balance of our target values
y.value_counts()

11717    96
11510    66
11746    60
11758    60
11791    60
         ..
13030     1
14543     1
11941     1
14613     1
12009     1
Name: Zip, Length: 573, dtype: int64

In [57]:
X[:5]

Unnamed: 0,Year of Birth,Tests,Less than 5 mcg/dL,5-10 mcg/dL,10-15 mcg/dL,15+ mcg/dL,Total Elevated Blood Levels,"Rate per 1,000",Percent,Number of Outlets that Require Sampling,Results ≤ 15 ppb,Results ≤ 15 ppb.1,Results ≤ 15 ppb.2,Results ≤ 15 ppb.3
0,2019,30,30.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,63.0,0.0,63.0,0.0
1,2018,30,30.0,0.0,0.0,0.0,0.0,0.0,0.0,95.0,95.0,0.0,95.0,0.0
2,2018,30,30.0,0.0,0.0,0.0,0.0,0.0,0.0,218.0,218.0,0.0,218.0,0.0
3,2017,10,10.0,0.0,0.0,0.0,0.0,0.0,0.0,82.0,81.0,1.0,81.0,1.0
4,2017,10,10.0,0.0,0.0,0.0,0.0,0.0,0.0,82.0,81.0,1.0,81.0,1.0


In [58]:
#create a Train Test Split 
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42)

## Logistical Regression Model

In [59]:
#Create a Logistic Regression Model by using the training data 
lr_model=LogisticRegression(random_state=42)
#Fit the model using the training Data 
training_model=lr_model.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [60]:
#make a prediction using testing data 
testing_predictions=training_model.predict(X_test)

In [61]:
#Show the balanced accuracy score of the model
balanced_model=balanced_accuracy_score(y_test, testing_predictions) 
print(balanced_model)

0.00730234846558754


In [62]:
# Generate a confusion matrix for the model
matrix_model=confusion_matrix(y_test,testing_predictions)
print(matrix_model)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [63]:
# Print the classification report for the model
classification_model=classification_report(y_test,testing_predictions)
print(classification_model)

              precision    recall  f1-score   support

       10504       0.00      0.00      0.00         4
       10510       0.00      0.00      0.00         3
       10511       0.00      0.00      0.00         1
       10512       0.00      0.00      0.00         4
       10514       0.00      0.00      0.00         3
       10516       0.00      0.00      0.00         1
       10518       0.00      0.00      0.00         1
       10520       0.00      0.00      0.00         1
       10522       0.00      0.00      0.00         1
       10532       0.00      0.00      0.00         2
       10536       0.00      0.00      0.00         5
       10538       0.00      0.00      0.00         3
       10541       0.00      0.00      0.00         1
       10543       0.00      0.00      0.00         6
       10547       0.00      0.00      0.00         3
       10560       0.00      0.00      0.00         1
       10567       0.00      0.00      0.00         4
       10570       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Predicting a Logistical Regression Model with Resampled Training Data 
using RandomOverSampler

In [64]:
# RandomOverSampler to resample the data 
# # Assign a random_state parameter of 42
random_oversampler_model=RandomOverSampler(random_state=42)

# Fit the original training data to the random_oversampler model
X_oversample, y_oversample = random_oversampler_model.fit_resample(X_train,y_train)

In [65]:
# Count the distinct values of the resampled labels data
y_oversample.value_counts()

11554    68
14568    68
12533    68
14150    68
11786    68
         ..
13027    68
11719    68
11757    68
10583    68
14712    68
Name: Zip, Length: 534, dtype: int64

In [66]:
#Logistic Regression Model
random_state_model=LogisticRegression(solver='lbfgs', random_state=42)

# Fit the model using the resampled training data
random_state_model.fit(X_oversample,y_oversample)

# Make a prediction using the testing data
y_prediction=random_state_model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [67]:
# Print the balanced_accuracy score of the model 
balanced_accuracy_score(y_test, y_prediction)



0.0361774186966989

In [68]:
# Generate a confusion matrix for the model
print(confusion_matrix(y_test, y_prediction))

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]]


In [69]:
# Print the classification report for the model
print(classification_report(y_test, y_prediction))

              precision    recall  f1-score   support

       10504       0.00      0.00      0.00         4
       10510       0.00      0.00      0.00         3
       10511       0.00      0.00      0.00         1
       10512       0.00      0.00      0.00         4
       10514       0.00      0.00      0.00         3
       10516       0.00      0.00      0.00         1
       10518       0.00      0.00      0.00         1
       10520       0.00      0.00      0.00         1
       10522       0.00      0.00      0.00         1
       10532       0.00      0.00      0.00         2
       10536       0.00      0.00      0.00         5
       10538       0.00      0.00      0.00         3
       10541       0.00      0.00      0.00         1
       10543       0.00      0.00      0.00         6
       10547       0.00      0.00      0.00         3
       10560       0.00      0.00      0.00         1
       10566       0.00      0.00      0.00         0
       10567       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Neural Networks