In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
## load the raw dataset from the CSV file (the path is relative to the notebook's working directory)
data = pd.read_csv("project4.csv")

In [2]:
## "?" entries in node-caps are filled with "no", which is the "mode" (most frequent value) of the column
data["node-caps"] = data["node-caps"].mask(data["node-caps"].eq("?"), "no")

In [3]:
## "?" entries in breast-quad are filled with "left_low", which is the "mode" (most frequent value) of the column
data["breast-quad"] = data["breast-quad"].mask(data["breast-quad"].eq("?"), "left_low")

In [4]:
## cast every nominal column to the category dtype ahead of one-hot encoding
for column in ('class', 'menopause', 'node-caps', 'breast', 'breast-quad', 'irradiat'):
    data[column] = data[column].astype("category")

In [5]:
## print each column's dtype and non-null count
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   class        286 non-null    category
 1   age          286 non-null    object  
 2   menopause    286 non-null    category
 3   tumor-size   286 non-null    object  
 4   inv-nodes    286 non-null    object  
 5   node-caps    286 non-null    category
 6   deg-malig    286 non-null    int64   
 7   breast       286 non-null    category
 8   breast-quad  286 non-null    category
 9   irradiat     286 non-null    category
dtypes: category(6), int64(1), object(3)
memory usage: 11.6+ KB


In [6]:
## one-hot encode the categorical columns; drop_first=True omits the first level of each encoded column
data = pd.get_dummies(
    data,
    columns=["class", "menopause", "node-caps", "breast", "breast-quad", "irradiat"],
    drop_first=True,
)

In [7]:
## doing data type conversion on age, tumor-size, and inv-nodes by replacing ranges with their midpoint
def _range_midpoint(label):
    """Return the midpoint of a "low-high" range label as a digit string.

    The midpoint is rounded up to the next whole number ('20-29' -> '25',
    '0-4' -> '2', '3-5' -> '4'), which reproduces every entry of the
    hand-written lookup tables this helper replaces. Any value that is not
    two runs of decimal digits joined by a single "-" is returned unchanged,
    so unexpected values are not silently rewritten and still surface when
    the columns are converted to int.
    """
    low, dash, high = str(label).partition("-")
    if dash and low.isdecimal() and high.isdecimal():
        return str((int(low) + int(high) + 1) // 2)
    return label


## the midpoint is computed instead of enumerated, so a range bin that was never
## listed by hand (e.g. '55-59') no longer slips through as an unconverted string
for column in ("age", "tumor-size", "inv-nodes"):
    data[column] = data[column].map(_range_midpoint)

In [8]:
## cast the three range-derived columns to integers in a single call
data = data.astype({"age": int, "tumor-size": int, "inv-nodes": int})

In [9]:
## separate the target column from the features, then make a stratified 70/30 train/test split
y = data['class_recurrence-events']
X = data.drop(columns=['class_recurrence-events'])
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

In [10]:
pip install xgboost

[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


In [11]:
import xgboost as xgb

In [14]:
# Candidate values for each hyperparameter; the search below tries every combination
param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_child_weight=[1, 3, 5],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
    gamma=[0, 0.1, 0.2],
    reg_alpha=[0, 0.1, 0.5],
    reg_lambda=[1, 1.5, 2],
)

# Base XGBoost classifier with default settings
xgb_model = xgb.XGBClassifier()

# Exhaustive grid search (GridSearchCV, not a randomized search):
# 5-fold cross-validation, candidates ranked by accuracy, n_jobs=-1 for parallel fits
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)

# Report the winning combination and its mean cross-validated score
print("Best set of hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Best set of hyperparameters:  {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 3, 'min_child_weight': 5, 'reg_alpha': 0.1, 'reg_lambda': 1.5, 'subsample': 0.9}
Best score:  0.7699999999999999


In [15]:
from sklearn.metrics import classification_report

In [17]:
# Best estimator from the accuracy-scored search, scored on the held-out and the training data
acc_xgb = grid_search.best_estimator_
acc_test_report = classification_report(y_test, acc_xgb.predict(x_test))
acc_train_report = classification_report(y_train, acc_xgb.predict(x_train))
print(f"Performance on TEST\n*******************\n{acc_test_report}")
print(f"Performance on TRAIN\n********************\n{acc_train_report}")


Performance on TEST
*******************
              precision    recall  f1-score   support

       False       0.72      0.85      0.78        60
        True       0.40      0.23      0.29        26

    accuracy                           0.66        86
   macro avg       0.56      0.54      0.54        86
weighted avg       0.62      0.66      0.63        86

Performance on TRAIN
********************
              precision    recall  f1-score   support

       False       0.81      0.95      0.87       141
        True       0.79      0.46      0.58        59

    accuracy                           0.81       200
   macro avg       0.80      0.70      0.73       200
weighted avg       0.80      0.81      0.79       200



In [27]:

# Exhaustive grid search (GridSearchCV, not a randomized search) over param_grid:
# 5-fold cross-validation, candidates ranked by recall instead of accuracy
grid_recall = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
)
grid_recall.fit(x_train, y_train)

# Report the winning combination and its mean cross-validated recall
print("Best set of hyperparameters: ", grid_recall.best_params_)
print("Best score: ", grid_recall.best_score_)

Best set of hyperparameters:  {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 5, 'min_child_weight': 1, 'reg_alpha': 0.5, 'reg_lambda': 1, 'subsample': 0.8}
Best score:  0.4909090909090909


In [28]:
# Best estimator from the recall-scored search, scored on the held-out and the training data
recall_xgb = grid_recall.best_estimator_
recall_test_report = classification_report(y_test, recall_xgb.predict(x_test))
recall_train_report = classification_report(y_train, recall_xgb.predict(x_train))
print(f"Performance on TEST\n*******************\n{recall_test_report}")
print(f"Performance on TRAIN\n********************\n{recall_train_report}")


Performance on TEST
*******************
              precision    recall  f1-score   support

       False       0.73      0.80      0.76        60
        True       0.40      0.31      0.35        26

    accuracy                           0.65        86
   macro avg       0.56      0.55      0.55        86
weighted avg       0.63      0.65      0.64        86

Performance on TRAIN
********************
              precision    recall  f1-score   support

       False       0.92      0.99      0.95       141
        True       0.98      0.78      0.87        59

    accuracy                           0.93       200
   macro avg       0.95      0.89      0.91       200
weighted avg       0.93      0.93      0.93       200



In [29]:
# Second search space: colsample_bytree and gamma are pinned to a single value each,
# while max_depth is widened to every integer from 3 through 19
param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=np.arange(3, 20),
    min_child_weight=[1, 3, 5],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8],
    gamma=[0],
    reg_alpha=[0, 0.1, 0.5],
    reg_lambda=[1, 1.5, 2],
)

In [30]:
# Exhaustive grid search (GridSearchCV, not a randomized search) over the current param_grid:
# 5-fold cross-validation, candidates ranked by recall
grid_recall2 = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
)
grid_recall2.fit(x_train, y_train)

# Report the winning combination and its mean cross-validated recall
print("Best set of hyperparameters: ", grid_recall2.best_params_)
print("Best score: ", grid_recall2.best_score_)

Best set of hyperparameters:  {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 5, 'min_child_weight': 1, 'reg_alpha': 0.5, 'reg_lambda': 1, 'subsample': 0.8}
Best score:  0.4909090909090909


In [32]:
# Best estimator from the second recall-scored search, scored on the held-out and the training data
recall_xgb2 = grid_recall2.best_estimator_
recall2_test_report = classification_report(y_test, recall_xgb2.predict(x_test))
recall2_train_report = classification_report(y_train, recall_xgb2.predict(x_train))
print(f"Performance on TEST\n*******************\n{recall2_test_report}")
print(f"Performance on TRAIN\n********************\n{recall2_train_report}")


Performance on TEST
*******************
              precision    recall  f1-score   support

       False       0.73      0.80      0.76        60
        True       0.40      0.31      0.35        26

    accuracy                           0.65        86
   macro avg       0.56      0.55      0.55        86
weighted avg       0.63      0.65      0.64        86

Performance on TRAIN
********************
              precision    recall  f1-score   support

       False       0.92      0.99      0.95       141
        True       0.98      0.78      0.87        59

    accuracy                           0.93       200
   macro avg       0.95      0.89      0.91       200
weighted avg       0.93      0.93      0.93       200



In [33]:
import pickle

# Serialize the accuracy-tuned model to the file 'acc_xgb' in the working directory
with open('acc_xgb', mode='wb') as model_file:
    pickle.dump(acc_xgb, model_file)

In [34]:
# Serialize the recall-tuned model to the file 'recall_xgb' in the working directory
with open('recall_xgb', mode='wb') as model_file:
    pickle.dump(recall_xgb, model_file)

In [35]:
# Serialize the second recall-tuned model to the file 'recall_xgb2' in the working directory
with open('recall_xgb2', mode='wb') as model_file:
    pickle.dump(recall_xgb2, model_file)