# Module Import

In [None]:
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

warnings.simplefilter(action="ignore", category=UserWarning)

## Dataset Import

Using the Diabetes dataset.

In [None]:
# Load the diabetes dataset from "diabetes.csv" (resolved relative to the
# notebook's working directory) into the DataFrame "df".
# index_col=False tells pandas not to use the first column as the index.
df = pd.read_csv(
    "diabetes.csv",
    index_col=False,
)

In [None]:
# Print a summary of df: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Preview the first six rows of the dataset.
df.head(6)

In [None]:
# Class balance of the target: the share of rows for each value of "Outcome".
# normalize=True makes value_counts return proportions instead of raw counts.
outcomes = df["Outcome"].value_counts(normalize=True)
print(outcomes)

## Training and Test Datasets

Let's split the data 70/30 into a training set (which we will use to build models) and a test set (on which we will evaluate any model we build).

In [None]:
# Separate the features from the target:
#   "X" - every column except "Outcome" (the independent variables)
#   "y" - the "Outcome" column (the variable to predict)
# The train/test split is intentionally not done in this cell. It is performed
# further down (test_size=0.3, random_state=7) once "y" has been label-encoded,
# so splitting here would only produce values that are overwritten before use.
X = df.drop(columns="Outcome")
y = df["Outcome"]

In [None]:
# Encode the class values in "y" as integers to avoid errors in newer versions
# of XGBoost. LabelEncoder is imported with the other modules at the top of the
# notebook. fit_transform learns the label -> integer mapping and applies it in
# one step; the fitted encoder is kept so the mapping can be inverted later.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [None]:
# Hold out 30% of the rows as a test set; random_state=7 makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
)

# Evaluation set handed to XGBoost during fitting: a list of (features, labels) pairs.
# NOTE(review): this reuses the test split, so anything tuned against eval_set
# (e.g. early stopping) sees the same rows the final metrics are computed on -
# consider a separate validation split.
eval_set = [(X_test, y_test)]

# Sanity check: (rows, columns) of the training and test feature matrices.
print(X_train.shape, X_test.shape)

In [None]:
print('Initializing xgboost.sklearn.XGBClassifier and starting training...')

# Record the start time so the training duration can be reported below.
st = datetime.now()

# Build the classifier "clf" with:
#     objective: "binary:logistic"
#     learning_rate: 0.05
#     seed: 9616
#     max_depth: 20
#     gamma: 10
#     n_estimators: 500
# eval_metric ("auc") and early_stopping_rounds (20) are configured here on the
# estimator rather than in fit(): passing them to fit() has been deprecated
# since XGBoost 1.6 and raises a TypeError on releases where those fit()
# arguments were removed.
clf = xgboost.sklearn.XGBClassifier(
    objective="binary:logistic",
    learning_rate=0.05,
    seed=9616,
    max_depth=20,
    gamma=10,
    n_estimators=500,
    eval_metric="auc",
    early_stopping_rounds=20,
)

# Fit on the training data. eval_set is scored with the configured metric after
# each boosting round, and training stops once it has not improved for
# early_stopping_rounds consecutive rounds. verbose=False suppresses the
# per-round evaluation log.
clf.fit(X_train, y_train, eval_set=eval_set, verbose=False)

# Report the elapsed training time.
print(datetime.now() - st)

# Predicted class labels for the test set.
y_pred = clf.predict(X_test)

# Accuracy: fraction of test rows whose predicted label equals the true label.
# y_test is flattened to a 1-D array before being compared with y_pred.
accuracy = accuracy_score(np.asarray(y_test).ravel(), y_pred)
print("Accuracy: %.10f%%" % (100.0 * accuracy))

# ROC-AUC for the model on the test set.
# ROC-AUC measures how well the model ranks positives above negatives, so it
# must be computed from continuous scores. Use the predicted probability of the
# positive class (column 1 of predict_proba) rather than the thresholded 0/1
# labels from predict(), which would reduce the ROC curve to a single
# operating point and misreport the AUC.
y_score = clf.predict_proba(X_test)[:, 1]

accuracy_per_roc_auc = roc_auc_score(np.array(y_test).flatten(), y_score)
print("ROC-AUC: %.10f%%" % (accuracy_per_roc_auc * 100))



#### View the results of your model

In [None]:
# Plot the importance of each feature as computed from the fitted trees in clf.
xgboost.plot_importance(clf)