a)Here, we load the dataset using pandas and create a new column called 'cancer_probability_cat'. We use the pandas 'cut' function to divide the cancer_probability into 5 categories based on the probability range.

In [37]:
import pandas as pd

# Load the data
data = pd.read_csv('cancer.csv')

# Create 5 categories for cancer probabilities
prob_cancer = data['prob_cancer'].mean()
data['prob_cancer'].fillna(prob_cancer, inplace=True)
data['cancer_probability_cat'] = pd.cut(data['prob_cancer'], bins=[0, 0.2, 0.4, 0.6, 0.8, 1], labels=[1, 2, 3, 4, 5])
data.describe()


Unnamed: 0,weight,height,salads_per_week,veggies_fruits_per_day,aerobic_per_week,sports_per_week,survey.month,prob_cancer
count,375.0,129.0,375.0,374.0,381.0,381.0,385.0,385.0
mean,165.018667,116.945736,1.509333,2.15508,2.052493,0.947507,2008.557221,0.490485
std,106.373835,328.996042,1.554463,1.458088,1.904945,1.512187,0.470432,0.284649
min,0.0,60.0,0.0,0.0,0.0,0.0,2008.09,0.004486
25%,129.5,64.0,0.0,1.0,0.0,0.0,2008.1,0.248034
50%,148.0,68.0,1.0,2.0,2.0,0.0,2008.12,0.490485
75%,180.0,70.0,2.0,3.0,3.0,1.0,2009.04,0.721626
max,1450.0,2801.0,9.0,7.0,7.0,6.0,2009.06,0.998722


b)First, we check for missing or invalid data in the height column. If there's missing data, we calculate the mean height and fill the missing values with the mean.

In [38]:
import seaborn as sns
# Check for missing or invalid height data
print(data['height'].isnull().sum())
print(data['height'].describe())
print(data['weight'].describe())
print("----------")
abnormal_condition_height = data['height'] > 82 
filtered_data_height = data.loc[~abnormal_condition_height, 'height'] 
mean_height = filtered_data_height.mean()
data['height'].fillna(mean_height, inplace=True)
data = data[data['height'] <= 82]


#  Fill missing or invalid weight data with the mean weight
abnormal_condition_weight = data['weight'] > 350 
filtered_data_weight = data.loc[~abnormal_condition_weight, 'weight'] 
mean_weight = filtered_data_weight.mean()
data['weight'].fillna(mean_weight, inplace=True)
data = data[data['weight'] <=350 ]
print(data.describe())



256
count     129.000000
mean      116.945736
std       328.996042
min        60.000000
25%        64.000000
50%        68.000000
75%        70.000000
max      2801.000000
Name: height, dtype: float64
count     375.000000
mean      165.018667
std       106.373835
min         0.000000
25%       129.500000
50%       148.000000
75%       180.000000
max      1450.000000
Name: weight, dtype: float64
----------
           weight      height  salads_per_week  veggies_fruits_per_day  \
count  379.000000  379.000000       370.000000              369.000000   
mean   157.810298   67.420635         1.451351                2.162602   
std     41.516642    2.395270         1.434783                1.461688   
min     13.000000   60.000000         0.000000                0.000000   
25%    130.000000   67.420635         0.000000                1.000000   
50%    150.000000   67.420635         1.000000                2.000000   
75%    180.000000   67.420635         2.000000                3.000000   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['weight'].fillna(mean_weight, inplace=True)


c)We remove rows with empty or invalid data in the 'current_smoking' column using the dropna function.

In [39]:
data.dropna(subset=['current_smoking'], inplace=True)


D)We fill missing or invalid data in the other columns with their respective mean values.

In [40]:
print(data['salads_per_week'].value_counts())
print("****")
print(data['veggies_fruits_per_day'].value_counts())
print("****")
print(data['healthy_diet'].value_counts())
print("****")
print(data['aerobic_per_week'].value_counts())
print("****")
print(data['sports_per_week'].value_counts())
print("****")
print(data['current_smoking'].value_counts())
print("****")
print(data['survey.month'].value_counts())

0.0    112
1.0    110
2.0     68
3.0     37
4.0     20
5.0     15
6.0      3
Name: salads_per_week, dtype: int64
****
1.0    126
2.0    109
3.0     55
4.0     30
0.0     18
5.0     10
6.0      9
7.0      7
Name: veggies_fruits_per_day, dtype: int64
****
Average           142
Below average      84
Healthy            79
Unhealthy          48
Very healthy        8
Very unhealthy      5
Name: healthy_diet, dtype: int64
****
0.0    94
1.0    82
2.0    62
3.0    55
4.0    27
5.0    24
6.0    17
7.0    10
Name: aerobic_per_week, dtype: int64
****
0.0    221
1.0     62
2.0     43
3.0     13
5.0     11
6.0     11
4.0     10
Name: sports_per_week, dtype: int64
****
Never              361
Once in a while     10
Some days            1
Name: current_smoking, dtype: int64
****
2008.09    66
2008.12    64
2008.10    63
2009.04    62
2009.06    60
2009.03    57
Name: survey.month, dtype: int64


In [41]:
additional_columns = ['current_smoking', 'salads_per_week']
data.drop(columns=additional_columns, inplace=True)
data['healthy_diet'] = data['healthy_diet'].map({'Average': 1, 'Below average': 2, 'Healthy': 3, 'Unhealthy': 4,'Very healthy':5,'Very unhealthy':6})

In [42]:
data.fillna(data.mean(), inplace=True)

  data.fillna(data.mean(), inplace=True)


F)We split the data into training and testing sets using a 70/30 ratio and standardize the data using StandardScaler.

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Select features and target
X = data.drop('cancer_probability_cat', axis=1)
y = data['cancer_probability_cat']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


G)We train the KNN classifier with k=5 and use it to classify the test data.

In [45]:
from sklearn.neighbors import KNeighborsClassifier

# Train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Classify the test data
y_pred = knn.predict(X_test)


H)We calculate and print the confusion matrix, accuracy, and R2 score for the model's performance on the test set.

In [48]:
from sklearn.metrics import confusion_matrix, accuracy_score, r2_score

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# R2 Score
r2 = r2_score(y_test, y_pred)
print("R2 Score:", r2)


Confusion Matrix:
 [[11 11  1  0  0]
 [ 6  9  4  1  0]
 [ 1  6 14  1  2]
 [ 0  1 11  6  2]
 [ 0  0  3  5 17]]
Accuracy: 0.5089285714285714
R2 Score: 0.6463339494762785


There are several preprocessing methods you can try to improve the results. Here, we'll perform data normalization using MinMaxScaler.

In [49]:
from sklearn.preprocessing import MinMaxScaler

# Normalize the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

# Train the KNN classifier with normalized data
knn_normalized = KNeighborsClassifier(n_neighbors=5)
knn_normalized.fit(X_train_normalized, y_train)

# Classify the test data
y_pred_normalized = knn_normalized.predict(X_test_normalized)

# Confusion matrix
cm_normalized = confusion_matrix(y_test, y_pred_normalized)
print("Confusion Matrix (Normalized):\n", cm_normalized)

# Accuracy
accuracy_normalized = accuracy_score(y_test, y_pred_normalized)
print("Accuracy (Normalized):", accuracy_normalized)

# R2 Score
r2_normalized = r2_score(y_test, y_pred_normalized)
print("R2 Score (Normalized):", r2_normalized)


Confusion Matrix (Normalized):
 [[13  9  1  0  0]
 [ 4  9  7  0  0]
 [ 0  3 19  1  1]
 [ 0  0  9  9  2]
 [ 0  0  1  6 18]]
Accuracy (Normalized): 0.6071428571428571
R2 Score (Normalized): 0.7714109673444238


Finding the Best K for normalized data

In [56]:
from sklearn.model_selection import GridSearchCV
param_grid = {'n_neighbors': list(range(1, 31))}

# Initialize the KNN Regressor model
knn = KNeighborsClassifier()

# Set up the GridSearchCV
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)

# Fit the model to the training data
grid_search.fit(X_train_normalized, y_train)

# Find the best 'k' value
best_k = grid_search.best_params_['n_neighbors']
print("Best k value:", best_k)

# Use the best KNN model
best_knn = grid_search.best_estimator_
y_pred_KNN = best_knn.predict(X_test_normalized)
r2_KNN = r2_score(y_test, y_pred_KNN)
# Confusion matrix
cm_normalized_KNN = confusion_matrix(y_test, y_pred_KNN)
print("Confusion Matrix (Normalized):\n", cm_normalized_KNN)
# Accuracy
accuracy_normalized_KNN = accuracy_score(y_test, y_pred_KNN)
print("Accuracy (Normalized):", accuracy_normalized_KNN)

# R2 Score
r2_normalized_KNN = r2_score(y_test, y_pred_KNN)
print("R2 Score (Normalized):", r2_KNN)

Best k value: 10
Confusion Matrix (Normalized):
 [[16  5  2  0  0]
 [ 2 12  6  0  0]
 [ 0  4 18  1  1]
 [ 0  0 10  7  3]
 [ 0  0  0  5 20]]
Accuracy (Normalized): 0.6517857142857143
R2 Score (Normalized): 0.7929759704251387
