# ML: Breast Cancer

In [2]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting statsmodels>=0.9.0 (from category_encoders)
  Downloading statsmodels-0.14.2-cp312-cp312-win_amd64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.1 (from category_encoders)
  Downloading patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
   ---------------------------------------- 0.0/81.9 kB ? eta -:--:--
   --------------- ------------------------ 30.7/81.9 kB 1.3 MB/s eta 0:00:01
   --------------- ------------------------ 30.7/81.9 kB 1.3 MB/s eta 0:00:01
   --------------- ------------------------ 30.7/81.9 kB 1.3 MB/s eta 0:00:01
   ------------------------------ --------- 61.4/81.9 kB 363.1 kB/s eta 0:00:01
   ------------------------------ --------- 61.4/81.9 kB 363.1 kB/s eta 0:00:01
   ------------------------------ --------- 61.4/81.9 kB 363.1 kB/s eta 0:00:01
   ------------------------------ -----

In [2]:
import numpy as np
import pandas as pd
import sklearn
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import category_encoders as ce
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer


### Function to plot the learning curve 

In [2]:
def plot_learning_curves(model, x_train, y_train, x_val, y_val):
    train_sizes, train_scores, val_scores = learning_curve(model, x_train, y_train, cv=5, scoring='accuracy', train_sizes=np.linspace(0.1, 1.0, 10))

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    val_scores_mean = np.mean(val_scores, axis=1)
    val_scores_std = np.std(val_scores, axis=1)

    plt.figure(figsize=(10, 6))
    plt.title("Learning Curves")
    plt.xlabel("Training Examples")
    plt.ylabel("Score")
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, val_scores_mean - val_scores_std, val_scores_mean + val_scores_std, alpha=0.1, color="g")

    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, val_scores_mean, 'o-', color="g", label="Cross-validation score")

    plt.legend(loc="best")
    plt.show()

### Function to split the data 

In [3]:
def train_val_test_split(x, y):
    # Splitting into training (80%) and temporary set (20%)
    x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=3)
    # Splitting the temporary set into validation (50%) and test (50%)
    x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, shuffle=True, random_state=3)
    return x_train, y_train, x_val, y_val, x_test, y_test

In [4]:
# Used locally 
path = "C:\\Users\\aidaf\\OneDrive\\Documents\\University\\Spring 2023-2024\\CMPS 396V Machine Learning\\Project\\ML_Breast_Cancer\\Aida\\"
# path = "D:\\Desktop\\ML\\Project\\ML_Breast_Cancer\\Models\\"

# Used to load and save models for later use

def saveModel(fileName, model):
  with open(path + fileName, 'wb') as f:
      pickle.dump(model, f)

def getModel(fileName):
  with open(path + fileName, 'rb') as f:
      loaded_model = pickle.load(f)
  return loaded_model

In [5]:
results = [] # Array used to store tuples of (model, accuracy_score)

In [6]:
data = pd.read_csv("data.csv", encoding="utf-8")
data.shape

y = data["cancer_type"]
x = data.drop(columns = ["cancer_type", "patient_id"])

x_train, y_train, x_val, y_val, x_test, y_test = train_val_test_split(x, y)

  data = pd.read_csv("data.csv", encoding="utf-8")


In [7]:
def get_categorical_columns(df):
    return list(df.select_dtypes(include=['object', 'category']).columns)
def get_numerical_columns(df):
    return list(df.select_dtypes(include=['number']).columns)

categorical_columns = get_categorical_columns(x_train)
numerical_columns = get_numerical_columns(x_train)

numerical_imputer = SimpleImputer(strategy='mean')  # You can also use 'median', 'most_frequent', or 'constant'
x_train[numerical_columns] = numerical_imputer.fit_transform(x_train[numerical_columns])

categorical_imputer = SimpleImputer(strategy='most_frequent')  # You can also use 'median', 'most_frequent', or 'constant'
x_train[categorical_columns] = categorical_imputer.fit_transform(x_train[categorical_columns])

x_train.shape

(1196, 686)

In [8]:
# Clean categorical columns to remove duplicates and unexpected formats
def clean_categorical_columns(df, cat_columns):
    cleaned_df = df.copy()
    for col in cat_columns:
        cleaned_df[col] = cleaned_df[col].apply(lambda x: str(x).strip())  # Strip leading/trailing spaces
        cleaned_df[col] = cleaned_df[col].apply(lambda x: ' '.join(sorted(set(x.split()))))  # Remove duplicate words
    return cleaned_df

def clean_target_variable(y):
    cleaned_y = y.copy()
    cleaned_y = cleaned_y.apply(lambda x: str(x).strip())  # Strip leading/trailing spaces
    cleaned_y = cleaned_y.apply(lambda x: ' '.join(sorted(set(x.split()))))  # Remove duplicate words
    return cleaned_y

x_train_cleaned = clean_categorical_columns(x_train, categorical_columns)
y_train_cleaned = clean_target_variable(y_train)

# Use LabelEncoder for encoding target variable
label_encoder_y = LabelEncoder()
y_train_encoded = label_encoder_y.fit_transform(y_train_cleaned)

encoder = ce.TargetEncoder(cols=categorical_columns)
x_train = encoder.fit_transform(x_train_cleaned, y_train_encoded)


In [9]:
# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rfe = RFE(estimator=rf_classifier, n_features_to_select=None)

# Fit the classifier to the data
rfe.fit(x_train, y_train)

selected_features_indices = np.where(rfe.support_)[0]
x_train = x_train.iloc[:, selected_features_indices]


In [10]:
x_val_cleaned = clean_categorical_columns(x_val, categorical_columns)
y_val_cleaned = clean_target_variable(y_val)

y_val_encoded = label_encoder_y.fit_transform(y_val_cleaned)

encoder = ce.TargetEncoder(cols=categorical_columns)
x_val = encoder.fit_transform(x_val_cleaned, y_val_encoded)

## Part 3: Models

### XGBoost

In [11]:
y_train = LabelEncoder().fit_transform(y_train)
y_val = LabelEncoder().fit_transform(y_val)
# Hyperparameter Tuning using GridSearchCV
param_grid = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 200, 300],
}

xgb_model = XGBClassifier()
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='f1_macro')
grid_search.fit(x_train, y_train)

# Best hyperparameters
best_params = grid_search.best_params_

# Refit the model with best hyperparameters
best_xgb_model = XGBClassifier(**best_params)
best_xgb_model.fit(x_val, y_val)

# Predict on validation data
x_val_encoded = encoder.transform(x_val)
xgb_predictions = best_xgb_model.predict(x_val_encoded)

best_score = grid_search.best_score_
print("Cross-validation score: ", best_score)

# Calculate performance metrics
accuracy = accuracy_score(y_val, xgb_predictions)
precision = precision_score(y_val, xgb_predictions, average="macro")
recall = recall_score(y_val, xgb_predictions, average="macro")
f1 = f1_score(y_val, xgb_predictions, average="macro")

print("accuracy =", accuracy)
print("precision =", precision)
print("recall =", recall)
print("f1-score =", f1)

# Save the best model
saveModel('best_xgb_model.pkl', best_xgb_model)

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.

Cross-validation score:  0.6101320180837355
accuracy = 0.8733333333333333
precision = 0.9547619047619048
recall = 0.557142857142857
f1-score = 0.6035121328224776


## Part 5: Testing the best model on the test data

In [12]:
x_test_cleaned = clean_categorical_columns(x_test, categorical_columns)
y_test_cleaned = clean_target_variable(y_test)

y_test_encoded = label_encoder_y.fit_transform(y_test_cleaned)

encoder = ce.TargetEncoder(cols=categorical_columns)
x_test = encoder.fit_transform(x_test_cleaned, y_test_encoded)
y_test = LabelEncoder().fit_transform(y_test)

test_pred = best_xgb_model.predict(x_test)

accuracy = accuracy_score(y_test, test_pred)

print("accuracy =", accuracy)

accuracy = 0.7866666666666666
precision = 0.20774647887323944
recall = 0.24180327868852458
f1-score = 0.22348484848484848


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
