## <font color = darkblue> Tree Based Models
    - Make sure to pip install xgboost
    - This program uses a breast cancer related data that comes with sklearn
    - For more information:
        https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(diagnostic)

<font color = red> The program also shows how to save (pickle) a model and call back later


In [1]:
# -------------------------
# Import dependencies
# -------------------------

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import class_weight

# -------------------------
# Load breast cancer data
# -------------------------

cancer = load_breast_cancer()

# --------------------------------------------------
# Note that the data comes as a util.bunch (dictionary)
# --------------------------------------------------
type(cancer)

# --------------------------------------------------
# Listing various attributes of the data structure
# --------------------------------------------------
print(list(cancer.keys()))

# cancerdf = 

['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module']


In [2]:
# ------------------------------
# Creating the dataframe (df)
# ------------------------------

cancerdf = pd.DataFrame(cancer.data, columns = cancer.feature_names) 
cancerdf.head()

print(cancer.DESCR)

# ------------------------------
# Response variable
# 0 = Malignant, 1 = Benign
# ------------------------------
cancerdf["Response"]= cancer.target
cancerdf["Response"].value_counts()



.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

1    357
0    212
Name: Response, dtype: int64

## <font color = darkred> Details
    - As can be seen above that the response classes are slightly imbalanced (~ in the ratio of 4:7) 
    - Code for various tree based models is provided below

In [4]:
# ------------------------------------------------------
# Split the dataset into training and testing sets
# ------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(cancer.data[:,15:20], cancer.target, test_size=0.2, random_state=42)



## <font color = green> Decision tree
    - Single tree 
    - The example belows shows modeling for a binary classification problem


In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.utils import class_weight

# ---------------------------------------------------------------
# Train a decision tree classifier on the balanced dataset
# ---------------------------------------------------------------

clf = DecisionTreeClassifier(criterion='gini',
                             min_samples_split=5,
                             min_samples_leaf=5)
clf.fit(X_train, y_train)

# --------------------------------------------
# Evaluate the classifier on the testing set
# --------------------------------------------
y_pred_dt = clf.predict(X_test)
print(classification_report(y_test, y_pred_dt))

confusion_matrix(y_test, y_pred_dt)

# Confusion Matrix
pd.crosstab(y_pred_dt, y_test, rownames =['y_pred_dt'], colnames = ['y_test'] )


# -------------------------------------------------------
# Notes for reading the classification report
# -------------------------------------------------------

# --------------
# Precision: 
# --------------
# Measures the proportion of true positives (TP) out of all predicted positives (TP + false positives (FP)). 
# A high precision means the model makes few false positive predictions.

# --------------
# Recall: 
# --------------
# Measures the proportion of true positives (TP) out of all actual positives (TP + false negatives (FN)). \
# A high recall indicates that the model makes few false negative predictions.

# --------------
# F1-score: 
# --------------
# It is the harmonic mean of precision and recall, calculated as (2 * precision * recall) / (precision + recall). 
# It is useful especially when the classes are imbalanced 

# --------------
# Support: 
# --------------
# It is the number of observations in each class.

# --------------
# Weighted Average: 
# --------------
# It is calculated as the average of the precision, recall etc. weighted by the number of samples in each class.



              precision    recall  f1-score   support

           0       0.69      0.77      0.73        43
           1       0.85      0.79      0.82        71

    accuracy                           0.78       114
   macro avg       0.77      0.78      0.77       114
weighted avg       0.79      0.78      0.78       114



y_test,0,1
y_pred_dt,Unnamed: 1_level_1,Unnamed: 2_level_1
0,33,15
1,10,56


## <font color = green> Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ------------------
# Define the RF model
# ------------------
rf = RandomForestClassifier(n_estimators=1000,
                            criterion='entropy',
                            min_samples_split=5,
                            min_samples_leaf=5,
                            random_state=100)

# ------------------
# Train the models
# ------------------
rf.fit(X_train, y_train)

# ------------------------------------
# Make predictions on the test set
# ------------------------------------
y_pred_rf = rf.predict(X_test)

# ------------------------------------------------------------------------
# Evaluate the model using accuracy, precision, recall, and F1 score
# ------------------------------------------------------------------------
print('Random Forest Accuracy:', accuracy_score(y_test, y_pred_rf))
print('Random Forest Precision:', precision_score(y_test, y_pred_rf))
print('Random Forest Recall:', recall_score(y_test, y_pred_rf))
print('Random Forest F1 Score:', f1_score(y_test, y_pred_rf))

print(classification_report(y_test, y_pred_rf))

# Confusion Matrix
pd.crosstab(y_pred_rf, y_test, rownames =['y_pred_rf'], colnames = ['y_test'] )


Random Forest Accuracy: 0.8508771929824561
Random Forest Precision: 0.8648648648648649
Random Forest Recall: 0.9014084507042254
Random Forest F1 Score: 0.8827586206896552
              precision    recall  f1-score   support

           0       0.82      0.77      0.80        43
           1       0.86      0.90      0.88        71

    accuracy                           0.85       114
   macro avg       0.84      0.83      0.84       114
weighted avg       0.85      0.85      0.85       114



y_test,0,1
y_pred_rf,Unnamed: 1_level_1,Unnamed: 2_level_1
0,33,7
1,10,64


## <font color = darkgreen> Gradient Boosting Classifier

In [7]:
from sklearn.ensemble import GradientBoostingClassifier

# ------------------
# Define the GB model
# ------------------
gb = GradientBoostingClassifier(n_estimators=500,
                            min_samples_split=5,
                            learning_rate=0.5,
                            min_samples_leaf=5,
                            random_state=100)

# ------------------
# Train the model
# ------------------
gb.fit(X_train, y_train)

# ------------------------------------
# Make predictions on the test set
# ------------------------------------
y_pred_gb = gb.predict(X_test)

# ------------------------------------------------------------------------
# Evaluate the model using accuracy, precision, recall, and F1 score
# ------------------------------------------------------------------------

print('Gradient Boosting Accuracy:', accuracy_score(y_test, y_pred_gb))
print('Gradient Boosting Precision:', precision_score(y_test, y_pred_gb))
print('Gradient Boosting Recall:', recall_score(y_test, y_pred_gb))
print('Gradient Boosting F1 Score:', f1_score(y_test, y_pred_gb))

print(classification_report(y_test, y_pred_gb))
# Confusion Matrix
pd.crosstab(y_pred_gb, y_test, rownames =['y_pred_gb'], colnames = ['y_test'] )


Gradient Boosting Accuracy: 0.7807017543859649
Gradient Boosting Precision: 0.8108108108108109
Gradient Boosting Recall: 0.8450704225352113
Gradient Boosting F1 Score: 0.8275862068965518
              precision    recall  f1-score   support

           0       0.72      0.67      0.70        43
           1       0.81      0.85      0.83        71

    accuracy                           0.78       114
   macro avg       0.77      0.76      0.76       114
weighted avg       0.78      0.78      0.78       114



y_test,0,1
y_pred_gb,Unnamed: 1_level_1,Unnamed: 2_level_1
0,29,11
1,14,60


## <font color = Green> XGBoost
    - Documentation and tutorial
    https://xgboost.readthedocs.io/en/stable/tutorials/model.html
    
### <font color = red> Make sure to pip install xgboost library
    !pip install xgboost
 

In [None]:
# help(xgb)

In [8]:
import xgboost as xgb

# ------------------
# Define the XGB model
# ------------------
xgb = xgb.XGBClassifier(n_estimators=100,
                        max_depth = 10,
                        eta= 0.01,
                        min_child_weight = 5,
                        random_state=100)

# ------------------
# Train the model
# ------------------
xgb.fit(X_train, y_train)

# ------------------------------------
# Make predictions on the test set
# ------------------------------------
y_pred_xgb = xgb.predict(X_test)

# ------------------------------------------------------------------------
# Evaluate the model using accuracy, precision, recall, and F1 score
# ------------------------------------------------------------------------

print('XGB:', accuracy_score(y_test, y_pred_xgb))
print('XGB:', precision_score(y_test, y_pred_xgb))
print('XGB:', recall_score(y_test, y_pred_xgb))
print('XGB:', f1_score(y_test, y_pred_xgb))

print(classification_report(y_test, y_pred_xgb))
# Confusion Matrix
pd.crosstab(y_pred_xgb, y_test, rownames =['y_pred_xgb'], colnames = ['y_test'] )


XGB: 0.7894736842105263
XGB: 0.8133333333333334
XGB: 0.8591549295774648
XGB: 0.8356164383561644
              precision    recall  f1-score   support

           0       0.74      0.67      0.71        43
           1       0.81      0.86      0.84        71

    accuracy                           0.79       114
   macro avg       0.78      0.77      0.77       114
weighted avg       0.79      0.79      0.79       114



y_test,0,1
y_pred_xgb,Unnamed: 1_level_1,Unnamed: 2_level_1
0,29,10
1,14,61


## <font color= blue> How to "pickle" a model and call back later 

In [24]:
import pickle

%cd '/Users/prashantmittal/Roux'


# Serialize the random forest classifier to a file
# Note that the file rf.pickle is created in the working directory

with open('rf.pickle', 'wb') as handle:
    pickle.dump(rf, handle, protocol=pickle.HIGHEST_PROTOCOL)

# # Deserialize the classifier from the file
with open('rf.pickle', 'rb') as handle:
    classifier_loaded = pickle.load(handle)

#----------------------------------------------------------------
# # Use the loaded RF classifier to make predictions
#----------------------------------------------------------------
predicted_y = classifier_loaded.predict(X_test)

print(predicted_y)


/Users/prashantmittal/Roux
[1 0 0 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1
 1 0 1 1 0 0 1 1 1 1 1 1 0 0 0 1 0 1 1 1 0 0 1 1 0 0 1 1 1 0 0 1 1 0 0 1 0
 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 1 1 0 0 1 1 1 0 0 1 0
 1 1 0]
