# ML: Breast Cancer

In [2]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting statsmodels>=0.9.0 (from category_encoders)
  Downloading statsmodels-0.14.2-cp312-cp312-win_amd64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.1 (from category_encoders)
  Downloading patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
   ---------------------------------------- 0.0/81.9 kB ? eta -:--:--
   --------------- ------------------------ 30.7/81.9 kB 1.3 MB/s eta 0:00:01
   --------------- ------------------------ 30.7/81.9 kB 1.3 MB/s eta 0:00:01
   --------------- ------------------------ 30.7/81.9 kB 1.3 MB/s eta 0:00:01
   ------------------------------ --------- 61.4/81.9 kB 363.1 kB/s eta 0:00:01
   ------------------------------ --------- 61.4/81.9 kB 363.1 kB/s eta 0:00:01
   ------------------------------ --------- 61.4/81.9 kB 363.1 kB/s eta 0:00:01
   ------------------------------ -----

In [1]:
import numpy as np
import pandas as pd
import sklearn
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import category_encoders as ce
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from imblearn.combine import SMOTEENN

### Function to plot the learning curve 

In [2]:
def plot_learning_curves(model, x_train, y_train, x_val, y_val):
    train_sizes, train_scores, val_scores = learning_curve(model, x_train, y_train, cv=5, scoring='accuracy', train_sizes=np.linspace(0.1, 1.0, 10))

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    val_scores_mean = np.mean(val_scores, axis=1)
    val_scores_std = np.std(val_scores, axis=1)

    plt.figure(figsize=(10, 6))
    plt.title("Learning Curves")
    plt.xlabel("Training Examples")
    plt.ylabel("Score")
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, val_scores_mean - val_scores_std, val_scores_mean + val_scores_std, alpha=0.1, color="g")

    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, val_scores_mean, 'o-', color="g", label="Cross-validation score")

    plt.legend(loc="best")
    plt.show()

### Function to split the data 

In [3]:
# Used locally 
path = "C:\\Users\\aidaf\\OneDrive\\Documents\\University\\Spring 2023-2024\\CMPS 396V Machine Learning\\Project\\ML_Breast_Cancer\\Aida\\"
# path = "D:\\Desktop\\ML\\Project\\ML_Breast_Cancer\\Models\\"

# Used to load and save models for later use

def saveModel(fileName, model):
  with open(path + fileName, 'wb') as f:
      pickle.dump(model, f)

def getModel(fileName):
  with open(path + fileName, 'rb') as f:
      loaded_model = pickle.load(f)
  return loaded_model

In [4]:
results = [] # Array used to store tuples of (model, accuracy_score)

In [5]:
data = pd.read_csv("data.csv", encoding="utf-8")
data.shape

y = data["cancer_type"]
x = data.drop(columns = ["cancer_type", "patient_id"])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, shuffle=True, random_state=3)

  data = pd.read_csv("data.csv", encoding="utf-8")


In [82]:
data.isna().sum()
data.dropna(axis=0, inplace=True)
data.reset_index(inplace=True)

(1346, 686)

In [None]:
def get_categorical_columns(df):
    return list(df.select_dtypes(include=['object', 'category']).columns)
def get_numerical_columns(df):
    return list(df.select_dtypes(include=['number']).columns)

categorical_columns = get_categorical_columns(x_train)
numerical_columns = get_numerical_columns(x_train)

In [83]:
# Use LabelEncoder for encoding target variable
label_encoder_y = LabelEncoder()
y_train_encoded = label_encoder_y.fit_transform(y_train)

encoder = ce.TargetEncoder(cols=categorical_columns)
x_train = encoder.fit_transform(x_train, y_train_encoded)

In [None]:
smote_enn = SMOTEENN()
x_train, y_train = smote_enn.fit_resample(x_train, y_train)

In [84]:
# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rfe = RFE(estimator=rf_classifier, n_features_to_select=10)

# Fit the classifier to the data
rfe.fit(x_train, y_train)

selected_features_indices = np.where(rfe.support_)[0]
x_train = x_train.iloc[:, selected_features_indices]

KeyboardInterrupt: 

## Part 3: Models

### XGBoost

In [56]:
x_train.shape

(1346, 343)

In [58]:
y_train = LabelEncoder().fit_transform(y_train)

# Hyperparameter Tuning using GridSearchCV
param_grid = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 200, 300],
}

xgb_model = XGBClassifier()
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='f1_macro')
grid_search.fit(x_train, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

best_score = grid_search.best_score_
print("Best Cross-Validation Score:", best_score)

best_model = grid_search.best_estimator_
print("Best Model:", best_model)

# Save the best model
saveModel('best_xgb_model.pkl', best_model)

Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 100}
Best Cross-Validation Score: 0.5982582804799356
Best Model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=9, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)


## Part 5: Testing the best model on the test data

In [59]:
categorical_columns = get_categorical_columns(x_test)
numerical_columns = get_numerical_columns(x_test)

numerical_imputer = SimpleImputer(strategy='mean')  # You can also use 'median', 'most_frequent', or 'constant'
x_test[numerical_columns] = numerical_imputer.fit_transform(x_test[numerical_columns])

categorical_imputer = SimpleImputer(strategy='most_frequent')  # You can also use 'median', 'most_frequent', or 'constant'
x_test[categorical_columns] = categorical_imputer.fit_transform(x_test[categorical_columns])


In [62]:
x_test_cleaned = clean_categorical_columns(x_test, categorical_columns)
y_test_cleaned = clean_target_variable(y_test)

y_test = LabelEncoder().fit_transform(y_test)

encoder = ce.TargetEncoder(cols=categorical_columns)
x_test = encoder.fit_transform(x_test_cleaned, y_test)

In [63]:
x_test = rfe.transform(x_test)

In [64]:
test_pred = best_model.predict(x_test)

accuracy = accuracy_score(y_test, test_pred)
precision = precision_score(y_test, test_pred, average="macro")
recall = recall_score(y_test, test_pred, average="macro")
f1 = f1_score(y_test, test_pred, average="macro")

print("accuracy =", accuracy)
print("precision =", precision)
print("recall =", recall)
print("f1-score =", f1)

accuracy = 0.23333333333333334
precision = 0.2342032967032967
recall = 0.2630145278450363
f1-score = 0.1296145809147899
