<a href="https://colab.research.google.com/github/armandordorica/Advanced-Python/blob/master/boost_of_power_solved.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 # Boost of Power

 Add some power to your fraud detection algorithm by using scikit-learn's `GradientBoostingClassifier` to leverage ensemble learning.

In [None]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

 ### Preprocessing

In [None]:
# Read the encoded loans CSV; the path is resolved relative to the
# notebook's working directory.
file_path = Path("../Resources") / "sba_loans_encoded.csv"
df_loans = pd.read_csv(file_path)
df_loans.head()

Unnamed: 0,Year,Month,Amount,Term,Zip,CreateJob,NoEmp,RealEstate,RevLineCr,UrbanRural,...,City_WILLITS,City_WILMINGTON,City_WINDSOR,City_WINNETKA,City_WOODLAND,City_WOODLAND HILLS,City_WRIGHTWOOD,City_Watsonville,City_YORBA LINDA,City_YUBA CITY
0,2001,11,32812,36,92801,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2001,4,30000,56,90505,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2001,4,30000,36,92103,0,10,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2003,10,50000,36,92108,0,6,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2006,7,343000,240,91345,3,65,1,0,2,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Feature matrix: every column of df_loans except the "Default" target.
# drop() returns a new DataFrame, so df_loans itself is left untouched.
X = df_loans.drop(columns="Default")
X.head()

Unnamed: 0,Year,Month,Amount,Term,Zip,CreateJob,NoEmp,RealEstate,RevLineCr,UrbanRural,...,City_WILLITS,City_WILMINGTON,City_WINDSOR,City_WINNETKA,City_WOODLAND,City_WOODLAND HILLS,City_WRIGHTWOOD,City_Watsonville,City_YORBA LINDA,City_YUBA CITY
0,2001,11,32812,36,92801,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2001,4,30000,56,90505,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2001,4,30000,36,92103,0,10,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2003,10,50000,36,92108,0,6,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2006,7,343000,240,91345,3,65,1,0,2,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Target as a column vector of shape (n_samples, 1): selecting with a
# one-element list keeps the result two-dimensional.
y = df_loans[["Default"]].values
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [None]:
# Hold out a test split using train_test_split's default proportions;
# the fixed random_state makes the shuffle reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [None]:
# Standardizing scaler with its default behavior spelled out:
# center each feature on its mean and scale it to unit variance.
scaler = StandardScaler(with_mean=True, with_std=True)

In [None]:
# Learn the scaling statistics from the training split only, so nothing
# about the test split influences the transformation.
scaler.fit(X_train)

# fit() returns the estimator itself, so X_scaler is the same fitted object.
X_scaler = scaler

In [None]:
# Apply the scaler fitted on the training data to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

 ### Choose Optimal Learning Rate

In [None]:
# Sweep candidate learning rates and report train/validation accuracy for each.
# NOTE(review): the "validation" split used here is X_test/y_test, the same
# split the final model is evaluated on; a separate validation split or
# cross-validation would keep the test set out of hyperparameter selection.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    # All other settings are held fixed so only the learning rate varies
    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=100,
        max_depth=3,
        max_features=2,
        random_state=0,
    )
    # Targets are stored as column vectors; flatten them to 1-D for the estimator
    model.fit(X_train_scaled, y_train.ravel())
    print("Learning rate: ", learning_rate)

    # Mean accuracy on the data the model was fit on vs. the held-out split
    train_accuracy = model.score(X_train_scaled, y_train.ravel())
    validation_accuracy = model.score(X_test_scaled, y_test.ravel())
    print("Accuracy score (training): {0:.3f}".format(train_accuracy))
    print("Accuracy score (validation): {0:.3f}".format(validation_accuracy))
    print()

Learning rate:  0.05
Accuracy score (training): 0.683
Accuracy score (validation): 0.657

Learning rate:  0.1
Accuracy score (training): 0.716
Accuracy score (validation): 0.670

Learning rate:  0.25
Accuracy score (training): 0.856
Accuracy score (validation): 0.764

Learning rate:  0.5
Accuracy score (training): 0.926
Accuracy score (validation): 0.821

Learning rate:  0.75
Accuracy score (training): 0.928
Accuracy score (validation): 0.819

Learning rate:  1
Accuracy score (training): 0.927
Accuracy score (validation): 0.844



 ### Build Model with Optimal Learning Rate

In [None]:
# Create GradientBoostingClassifier model with the chosen learning rate.
# NOTE(review): the learning-rate sweep was run with n_estimators=100 and
# max_features=2, while this model uses 500 and 5; confirm that the chosen
# learning rate is still the best one under these settings.
model = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=1,
    max_features=5,
    max_depth=3,
    random_state=0)

# Fit the model (targets are column vectors; flatten them to 1-D)
model.fit(X_train_scaled, y_train.ravel())

# Score the model. The targets are flattened here as well, so the score
# calls receive the same 1-D labels as fit() and as the sweep cell.
print("Accuracy score (training): {0:.3f}".format(
    model.score(
        X_train_scaled,
        y_train.ravel())))
print("Accuracy score (validation): {0:.3f}".format(
    model.score(
        X_test_scaled,
        y_test.ravel())))

Accuracy score (training): 1.000
Accuracy score (validation): 0.848


In [None]:
# Predict the class label for every row of the scaled test split
predictions = model.predict(X_test_scaled)

# Fraction of test rows whose predicted label matches the true label
accuracy_score(y_true=y_test, y_pred=predictions)

0.8476190476190476

 ### Model Evaluation

In [None]:
# Generating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Wrap the raw counts in a labeled frame: rows are the actual classes,
# columns are the predicted classes.
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,307,37
Actual 1,43,138


In [None]:
# Per-class precision, recall, F1 and support for the test predictions
print(
    classification_report(
        y_true=y_test,
        y_pred=predictions,
    )
)

              precision    recall  f1-score   support

           0       0.88      0.89      0.88       344
           1       0.79      0.76      0.78       181

    accuracy                           0.85       525
   macro avg       0.83      0.83      0.83       525
weighted avg       0.85      0.85      0.85       525

