# Model Experimentation

**Problem statement:** \
The main goal of this project is to predict whether the tumor is Benign or Malignant. The problem is classification problem.

### Import required packages and data

In [1]:
import numpy as np
import pandas as pd

# preprocessing and metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# modelling
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [2]:
data = pd.read_csv('../data/data_cleaned.csv')
data.head()

Unnamed: 0,diagnosis,smoothness_mean,symmetry_mean,fractal_dimension_mean,texture_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,smoothness_worst,symmetry_worst,fractal_dimension_worst
0,M,0.1184,0.2419,0.07871,0.9053,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,0.1622,0.4601,0.1189
1,M,0.08474,0.1812,0.05667,0.7339,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,0.1238,0.275,0.08902
2,M,0.1096,0.2069,0.05999,0.7869,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,0.1444,0.3613,0.08758
3,M,0.1425,0.2597,0.09744,1.156,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,0.2098,0.6638,0.173
4,M,0.1003,0.1809,0.05883,0.7813,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,0.1374,0.2364,0.07678


### Data Preprocessing

#### Splitting the data into independent and dependent variables

In [3]:
# splitting the data into features and target
X, y = data.drop("diagnosis", axis=1), data["diagnosis"]

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

len(X_train), len(y_train), len(X_test), len(y_test)

(455, 455, 114, 114)

#### Transforming the data

In [4]:
# num features
num_features = list(X.columns)

# initialise the preprocessors
scaler = StandardScaler()
label_enc = LabelEncoder()


# Create a transformer
preprocessor = ColumnTransformer(
    [
        ("num_transformer", scaler, num_features)
    ]
)

# Transform the features
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
y_train = label_enc.fit_transform(y_train)
y_test = label_enc.transform(y_test)

In [6]:
label_enc.classes_

array(['B', 'M'], dtype=object)

#### Modelling Experimentation

In [5]:
def evaluate_model(true, predicted):
    acc = accuracy_score(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    return (acc, precision, recall)

In [6]:
models = {
    "Logistic Regression": LogisticRegression(),
    "SVC": SVC(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "CatBoost": CatBoostClassifier(verbose=False)
}

hyperparameters = {
    "Logistic Regression": {
        "penalty": ["l1", "l2"],
        "C": [0.1, 1, 10]
    },
    "SVC": {
        "C": [0.1, 1, 10],
        "kernel": ["linear", "rbf"]
    },
    "K-Neighbors Classifier": {
        "n_neighbors": [3, 5, 7],
        "weights": ["uniform", "distance"]
    },
    "Naive Bayes": {},
    "Decision Tree": {
        "max_depth": [None, 5, 10],
        "min_samples_split": [2, 5, 10]
    },
    "Random Forest": {
        "n_estimators": [100, 200, 300],
        "max_depth": [None, 5, 10],
        "min_samples_split": [2, 5, 10]
    },
    "AdaBoost": {
        "n_estimators": [50, 100, 150],
        "learning_rate": [0.1, 0.5, 1]
    },
    "Gradient Boosting": {
        "n_estimators": [50, 100, 150],
        "learning_rate": [0.1, 0.5, 1],
        "max_depth": [3, 5, 7]
    },
    "XGBoost": {
        "n_estimators": [50, 100, 150],
        "learning_rate": [0.1, 0.5, 1],
        "max_depth": [3, 5, 7]
    },
    "CatBoost": {
        "iterations": [50, 100, 150],
        "learning_rate": [0.1, 0.5, 1]
    }
}

model_list = []
acc_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    param = hyperparameters[list(models.keys())[i]]

    # hyperparameter tuning
    gs_model = GridSearchCV(estimator=model, param_grid=param, cv=3, refit=True,
                            n_jobs=-1, verbose=False)
    gs_model.fit(X_train, y_train)

    model.set_params(**gs_model.best_params_)

    # fitting the best model
    model.fit(X_train, y_train)

    # making predictions
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)

    train_acc, train_precision, train_recall = evaluate_model(
        y_train, train_preds)
    test_acc, test_precision, test_recall = evaluate_model(y_test, test_preds)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for Training set')
    print("- Accuracy score: {:.4f}".format(train_acc))
    print("- Precision score: {:.4f}".format(train_precision))
    print("- Recall Score: {:.4f}".format(train_recall))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Accuracy score: {:.4f}".format(test_acc))
    print("- Precision score: {:.4f}".format(test_precision))
    print("- Recall Score: {:.4f}".format(test_recall))
    acc_list.append(test_acc)

    print('='*35)
    print('\n')

9 fits failed out of a total of 18.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Documents\Internships\CodersCave\breast_cancer_prediction\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Documents\Internships\CodersCave\breast_cancer_prediction\venv\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\Documents\Internships\CodersCave\breast_cancer_prediction\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.pena

Logistic Regression
Model performance for Training set
- Accuracy score: 0.9231
- Precision score: 0.9036
- Recall Score: 0.8876
----------------------------------
Model performance for Test set
- Accuracy score: 0.8947
- Precision score: 0.8605
- Recall Score: 0.8605


SVC
Model performance for Training set
- Accuracy score: 0.9451
- Precision score: 0.9615
- Recall Score: 0.8876
----------------------------------
Model performance for Test set
- Accuracy score: 0.8772
- Precision score: 0.8919
- Recall Score: 0.7674


K-Neighbors Classifier
Model performance for Training set
- Accuracy score: 0.9121
- Precision score: 0.8957
- Recall Score: 0.8639
----------------------------------
Model performance for Test set
- Accuracy score: 0.8509
- Precision score: 0.8250
- Recall Score: 0.7674


Naive Bayes
Model performance for Training set
- Accuracy score: 0.7802
- Precision score: 0.7285
- Recall Score: 0.6509
----------------------------------
Model performance for Test set
- Accuracy sc

#### Results

In [7]:
pd.DataFrame(list(zip(model_list, acc_list)), columns=[
             'Model Name', 'Accuracy Score']).sort_values(by=["Accuracy Score"], ascending=False)

Unnamed: 0,Model Name,Accuracy Score
0,Logistic Regression,0.894737
5,Random Forest,0.894737
6,AdaBoost,0.885965
1,SVC,0.877193
7,Gradient Boosting,0.877193
8,XGBoost,0.877193
9,CatBoost,0.859649
2,K-Neighbors Classifier,0.850877
4,Decision Tree,0.833333
3,Naive Bayes,0.745614


Logistic Regression and Random Forest are best performing models. So let's select the Random Forest model as the best model as it can learn complex relations among the data.