In [7]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('cancer_reg_updated.csv')

# Impute missing values with the mean of each numeric column.
# numeric_only=True is needed on pandas >= 2.0, where DataFrame.mean() raises
# TypeError if the frame holds any non-numeric column; such columns are simply
# left un-imputed, which matches what older pandas versions did implicitly.
df.fillna(df.mean(numeric_only=True), inplace=True)

# Label-encode the 'avgDeathsPerYear' column: every distinct value is replaced
# by an integer code in 0..n_distinct-1, so everything computed from this
# column afterwards is in encoded units rather than raw deaths per year.
# NOTE(review): LabelEncoder is intended for categorical labels; confirm that
# encoding this column (rather than a categorical feature) is what was meant.
le = LabelEncoder()
df['avgDeathsPerYear'] = le.fit_transform(df['avgDeathsPerYear'])

# Separate the features (every remaining column) from the target variable
X = df.drop('avgDeathsPerYear', axis=1)
y = df['avgDeathsPerYear']

# Hold out 20% of the rows for evaluation, using a fixed random_state for the split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 5-nearest-neighbours regressor and fit it on the training rows
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)

# Predict the held-out rows and report the mean squared error
knn_predictions = knn.predict(X_test)
knn_mse = mean_squared_error(y_true=y_test, y_pred=knn_predictions)
print('KNN MSE:', knn_mse)

# Median of the target over the full dataset; used as the class threshold
median = y.median()

# Derive a binary target from the label-encoded 'avgDeathsPerYear' column:
# 1 when the value is strictly above the median, 0 otherwise. The same
# threshold is applied to the full, training and test targets.
y_binary = (y > median).astype(int)
y_train_binary = (y_train > median).astype(int)
y_test_binary = (y_test > median).astype(int)

# Create and fit the KNN classifier (KNeighborsClassifier lives in
# sklearn.neighbors and must be imported alongside KNeighborsRegressor)
knn_binary = KNeighborsClassifier(n_neighbors=5)
knn_binary.fit(X_train, y_train_binary)

# Predict the test set and compute accuracy, F1 score and confusion matrix
knn_binary_predictions = knn_binary.predict(X_test)
knn_binary_accuracy = accuracy_score(y_test_binary, knn_binary_predictions)
knn_binary_f1 = f1_score(y_test_binary, knn_binary_predictions)
knn_binary_cm = confusion_matrix(y_test_binary, knn_binary_predictions)
print('KNN Accuracy:', knn_binary_accuracy)
print('KNN F1 Score:', knn_binary_f1)
print('KNN Confusion Matrix:')
print(knn_binary_cm)


KNN MSE: 6330.599344262295
KNN Accuracy: 0.8360655737704918
KNN F1 Score: 0.8299319727891157
KNN Confusion Matrix:
[[266  30]
 [ 70 244]]
