## Creating the K-Nearest Neighbors Model
## Isabel Delgado and Aniyah McWilliams 

In [1]:
# One-time setup, intentionally left commented out: uncomment to install the
# package behind the `plotly.express` import that this notebook uses for plotting.
# ! pip install plotly.express

In [6]:
! pip install nbformat



In [2]:
# importing all the necessary items 
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
import plotly.express as px

In [3]:
# Read the CDC CSV into a DataFrame and preview its first rows.
DATA_FILE = 'CDC-2019-2021-2023-DATA.csv'
data = pd.read_csv(DATA_FILE)
data.head()

  data = pd.read_csv('CDC-2019-2021-2023-DATA.csv')


Unnamed: 0.1,Unnamed: 0,BIRTHSEX,MENTHLTH,POORHLTH,ADDEPEV3,DECIDE,DIFFALON,ACEDEPRS,ACEDRINK,ACEDRUGS,ACEPRISN,ACEDIVRC,ACEPUNCH,ACEHURT1,ACESWEAR,ACETOUCH,ACETTHEM,ACEHVSEX,IYEAR
0,0,,0.0,0.0,0.0,No,No,No,No,No,No,Yes,Never,More than once,Never,Never,Never,Never,2019
1,1,,0.0,10.0,0.0,No,No,No,Yes,No,No,No,Never,Once,More than once,Never,Never,Never,2019
2,2,,30.0,0.0,0.0,No,No,No,No,No,No,No,,Never,Never,Never,Never,Never,2019
3,3,,0.0,0.0,0.0,No,,,,,,,,,,,,,2019
4,4,,0.0,,0.0,No,No,No,No,No,No,No,Never,Never,Never,Never,Never,Never,2019


In [4]:
# Drop the 'Unnamed: 0' column (it looks like a row index saved into the CSV).
# `errors='ignore'` keeps this cell idempotent: re-running it after the column
# is already gone no longer raises a KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# Keep only fully observed rows (a NaN in any column removes the row),
# then report the remaining (rows, columns).
data = data.dropna(axis=0, how='any')
print(data.shape)

(15790, 18)


In [6]:
# Selecting the features (X) and the target (y); the following cells inspect the data first.

In [7]:
# Number of remaining rows for each IYEAR value.
data['IYEAR'].value_counts()

IYEAR
2023    9322
2021    3652
2019    2816
Name: count, dtype: int64

In [8]:
# Class balance of the ADDEPEV3 column (used as the target below).
data['ADDEPEV3'].value_counts()

ADDEPEV3
0.0    11213
1.0     4577
Name: count, dtype: int64

In [9]:
# Keep only the rows whose ADDEPEV3 value is present.
data = data[data['ADDEPEV3'].notna()]

In [10]:
# Sanity check: counts how many ADDEPEV3 entries are missing (True) vs. present (False).
data['ADDEPEV3'].isna().value_counts()

ADDEPEV3
False    15790
Name: count, dtype: int64

In [11]:
# Target column.
y = data['ADDEPEV3']

# Predictor columns, listed explicitly so the feature set is easy to audit.
feature_cols = [
    'BIRTHSEX',
    'MENTHLTH',
    'POORHLTH',
    'DECIDE',
    'DIFFALON',
    'IYEAR',
    'ACEDEPRS',
    'ACEDRINK',
    'ACEDRUGS',
    'ACEPRISN',
    'ACEDIVRC',
    'ACEPUNCH',
    'ACEHURT1',
    'ACESWEAR',
    'ACETOUCH',
    'ACETTHEM',
    'ACEHVSEX',
]
X = data[feature_cols]

In [12]:
# Columns passed to the model as numeric values.
nums = [
    'POORHLTH',
    'MENTHLTH',
]

# Columns treated as categorical (one-hot encoded later).
cats = [
    'IYEAR',
    'BIRTHSEX',
    'ACEDEPRS',
    'DECIDE',
    'DIFFALON',
    'ACEDRINK',
    'ACEDRUGS',
    'ACEPRISN',
    'ACEDIVRC',
    'ACEPUNCH',
    'ACEHURT1',
    'ACESWEAR',
    'ACETOUCH',
    'ACETTHEM',
    'ACEHVSEX',
]

In [13]:
# Print the level frequencies of every categorical column.
for col_name in cats:
    level_counts = data[col_name].value_counts()
    print(level_counts)

IYEAR
2023    9322
2021    3652
2019    2816
Name: count, dtype: int64
BIRTHSEX
Female    9138
Male      6652
Name: count, dtype: int64
ACEDEPRS
No     11815
Yes     3975
Name: count, dtype: int64
DECIDE
No     13135
Yes     2655
Name: count, dtype: int64
DIFFALON
No     13947
Yes     1843
Name: count, dtype: int64
ACEDRINK
No     11396
Yes     4394
Name: count, dtype: int64
ACEDRUGS
No     13693
Yes     2097
Name: count, dtype: int64
ACEPRISN
No     14307
Yes     1483
Name: count, dtype: int64
ACEDIVRC
No                     11059
Yes                     4459
Parents not married      272
Name: count, dtype: int64
ACEPUNCH
Never             12761
More than once     2241
Once                788
Name: count, dtype: int64
ACEHURT1
Never             11211
More than once     3342
Once               1237
Name: count, dtype: int64
ACESWEAR
Never             9277
More than once    5508
Once              1005
Name: count, dtype: int64
ACETOUCH
Never             13528
More than once     1444
Onc

In [14]:
# Check each column's dtype before building the preprocessing step.
data.dtypes

BIRTHSEX     object
MENTHLTH    float64
POORHLTH    float64
ADDEPEV3    float64
DECIDE       object
DIFFALON     object
ACEDEPRS     object
ACEDRINK     object
ACEDRUGS     object
ACEPRISN     object
ACEDIVRC     object
ACEPUNCH     object
ACEHURT1     object
ACESWEAR     object
ACETOUCH     object
ACETTHEM     object
ACEHVSEX     object
IYEAR         int64
dtype: object

In [15]:
# Column-wise preprocessing:
#   * `cats` -> one-hot encoded, dropping the first level of each feature;
#   * `nums` -> passed through unchanged.
# `sparse_threshold=0` forces a dense output. With the default (0.3) the
# stacked result becomes a scipy sparse matrix whenever its density falls
# below the threshold, and a mean-centering StandardScaler placed after this
# step raises on sparse input.
preprocess = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(drop='first'), cats),
        ('numeric', 'passthrough', nums),
    ],
    sparse_threshold=0,
)

In [16]:
# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [17]:
# Pipeline used for tuning: preprocess -> standardize -> KNN.
# The KNN step uses weights="distance"; n_neighbors is left at its default here.
pipe = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(weights="distance")),
    ]
)

In [18]:
# Tune k (the number of neighbors) before fitting the final model.
# Candidates are the odd values 1, 3, ..., 39; each is scored with 5-fold
# cross-validated balanced accuracy on the training split (n_jobs=-1).
param_grid = {"knn__n_neighbors": range(1, 41, 2)}
grid = GridSearchCV(pipe, param_grid, cv=5, scoring="balanced_accuracy", n_jobs=-1)
grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...'distance'))])
,param_grid,"{'knn__n_neighbors': range(1, 41, 2)}"
,scoring,'balanced_accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('encoder', ...), ('numeric', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_neighbors,5
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [19]:
# Collect the grid-search results; `k` and `mean_score` are short aliases of
# the cv_results_ columns that the plot uses.
results_df = pd.DataFrame(grid.cv_results_).assign(
    k=lambda frame: frame["param_knn__n_neighbors"],
    mean_score=lambda frame: frame["mean_test_score"],
)

# Selected k and its mean cross-validated score.
best_k = grid.best_params_["knn__n_neighbors"]
best_score = grid.best_score_

axis_labels = {"k": "Number of Neighbors (k)", "mean_score": "Mean CV Balanced Accuracy"}
fig = px.line(
    results_df,
    x="k",
    y="mean_score",
    markers=True,
    labels=axis_labels,
    title=f"Cross-Validated Balanced Accuracy vs. K (best k = {best_k})",
)

# Mark the selected k on top of the curve.
fig.add_scatter(
    x=[best_k],
    y=[best_score],
    mode="markers+text",
    text=[f"Best k = {best_k}"],
    textposition="top center",
    name="Best k",
)

fig.update_layout(hovermode="x unified")
fig.show()

In [20]:
# Final model: preprocess -> standardize -> KNN with k fixed to the tuned value.
final_knn = KNeighborsClassifier(n_neighbors=best_k, weights="distance")
pipe2 = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("scaler", StandardScaler()),
        ("knn", final_knn),
    ]
)

In [21]:
# Fit on the training split, then predict labels for the held-out test split.
# (`fit` returns the fitted pipeline, so the two calls can be chained.)
y_pred = pipe2.fit(X_train, y_train).predict(X_test)

In [22]:
# Plain and balanced accuracy of the test-set predictions.
bal_acc = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

print(
    f"Accuracy: {acc:.3f}",
    f"Balanced accuracy: {bal_acc:.3f}",
    sep="\n",
)

Accuracy: 0.727
Balanced accuracy: 0.628


In [23]:
from sklearn.metrics import roc_curve, roc_auc_score

In [24]:
# Predicted probability from column 1 of predict_proba, one value per test row.
class_probs = pipe2.predict_proba(X_test)
prob_test = class_probs[:, 1]
prob_test

array([1. , 0. , 0. , ..., 0.2, 0.5, 0. ], shape=(3158,))

In [25]:
# ROC points for the test split: false-positive rates, true-positive rates,
# and the score thresholds that produce them.
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=prob_test)

In [26]:
import plotly.graph_objects as go

In [27]:
# ROC curve of the final model on the test split.
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode="lines", name='ROC Curves'))
# Diagonal reference: the curve a random-guess classifier would trace.
fig.add_trace(
    go.Scatter(x=[0, 1], y=[0, 1], mode="lines", name="Chance", line=dict(dash="dash"))
)
# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title="ROC Curve (test set)",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
)
fig

In [28]:
# Area under the ROC curve for the test-split probabilities.
roc_auc_score(y_true=y_test, y_score=prob_test)

0.6999342215855522