In [None]:
# Import important library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split

### Read the input file and check the data dimension

In [None]:
# Load the German credit data from a CSV file expected in the current working directory.
df = pd.read_csv('german_credit.csv')

In [None]:
# The dataset is available at https://www.kaggle.com/uciml/german-credit
# Read the input file and understand the data
# "default" is the dependent (target) variable

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Column dtypes and non-null counts, to spot non-numeric columns and missing values.
df.info()

### Q1 Randomly select 50% data for this use case( 1 Marks)
###### Hint: Use train_test_split

In [None]:
# Separate the target column from the predictors, then keep a random half of the rows.
y = df['default']
X = df.drop(columns='default')
X1, X2, y1, y2 = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=8,
)

In [None]:
# Let's build an ensemble model, but first the dataset needs to be modified


### Q2.Prepare the model data by converting non-numeric to dummy ( 1 Marks)
##### Hint: Use get_dummies

In [None]:
# Print Shape of model data

In [None]:
# One-hot encode the categorical columns of the 50% sample and report the resulting size.
X1 = pd.get_dummies(data=X1)
print("The Shape of the model data X is:", X1.shape)

### Check for highly correlated variables (no treatment is required for this use case)

In [None]:
# Largest absolute pairwise correlation for each feature of the encoded sample.
corr = X1.corr().abs()
# Blank out only the diagonal (each feature against itself). Masking by value
# (`corr == 1`) would also hide distinct features that are perfectly correlated,
# e.g. the two dummies of a binary category, which are exactly the pairs this
# check is meant to surface.
self_pairs = np.eye(len(corr), dtype=bool)
corr = corr.mask(self_pairs, 0)
corr_max = corr.max().sort_values(ascending=False)
display(corr_max[corr_max > 0.5])

### Drop the original variables which are converted to dummy

In [None]:
# Re-encode the sampled rows with drop_first=True so one redundant level per categorical
# variable is dropped. X1 has already been one-hot encoded at this point, so calling
# get_dummies on it again finds nothing left to encode and leaves the shape unchanged;
# the same rows are therefore looked up again in X by their index labels.
X1 = pd.get_dummies(X.loc[X1.index], drop_first=True)
print("The shape of the model data X after dropping the original variables is :", X1.shape)

### Q3 Split Train/Test data 70:30 ratio( 1 Marks)
##### Hint:from sklearn.model_selection import train_test_split

In [None]:
# Dummy-encode the full predictor set, dropping the first level of each categorical column.
X = pd.get_dummies(data=X, drop_first=True)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=8,
)

Instead of splitting the 50% sample, I split the full dataset in a 70:30 ratio.

### Q4 Build Random Forest Model( 1 Marks)
#### Hint:from sklearn.ensemble import RandomForestClassifier using n_jobs=2,n_estimators=500,criterion="entropy",random_state=9999

In [None]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

# Random forest configured with the settings given in the exercise hint.
forest_params = dict(
    n_jobs=2,
    n_estimators=500,
    criterion="entropy",
    random_state=9999,
)
rfcl = RandomForestClassifier(**forest_params)
rfcl.fit(X_train, y_train)  # fit() trains in place and returns the same estimator
y_predict = rfcl.predict(X_test)

### Q5 Calculate Confusion Matrix and Accuracy score (1 Marks)
##### Hint: Use confusion_matrix and accuracy_score

In [None]:
# Accuracy and confusion matrix of the fitted forest on the held-out test rows.
test_accuracy = rfcl.score(X_test, y_test)
test_confusion = metrics.confusion_matrix(y_test, y_predict)

print("The Accuracy Score for Random Forest Model is :")
print(test_accuracy)
print("The Confusion Matrix for Random Forest Model is :")
print(test_confusion)

### Q6 Show the list of the features importance( 1 Marks)

In [None]:
# Importance of each training column, labelled by column name.
importance_table = pd.DataFrame(
    {"Imp": rfcl.feature_importances_},
    index=X_train.columns,
)
print(importance_table)

### Q7 K-fold cross-validation( 2 Marks)
##### k-fold cross validation( without stratification)
##### Usually k is set as 10-20 in practical settings, depends on data set size

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

In [None]:
# Use below values
num_folds = 10  # number of folds (k) for cross-validation
seed = 77  # random seed, fixed so shuffled runs are repeatable

In [None]:
#Validate the Random Forest model build above using k fold

In [None]:
# Score the forest with shuffled k-fold cross-validation (no stratification) on the full encoded data.
kf = KFold(
    n_splits=num_folds,
    shuffle=True,
    random_state=seed,
)
scores = cross_val_score(estimator=rfcl, X=X, y=y, cv=kf)
print(scores)

In [None]:
#Calculate Mean score

In [None]:
# Average of the per-fold scores.
mean_score = scores.mean()
print("The Mean Score for The Random Forest Model build using K Fold is :", mean_score)

In [None]:
# Calculate score standard deviation using std()

In [None]:
# Spread of the per-fold scores (population standard deviation).
score_std = scores.std()
print("The Standard Deviation Of The Calculated Score is :", score_std)

# Q8 Print the confusion matrix( 1 Marks)

In [None]:
# Cross-validated predictions for every row, compared against the true labels.
y_pred = cross_val_predict(estimator=rfcl, X=X, y=y, cv=kf)
conf_mat = metrics.confusion_matrix(y_true=y, y_pred=y_pred)

print("The Confusion Matrix :")
print(conf_mat)

# Q9.Classification accuracy: 
percentage of correct predictions and Calculate sensitivity (or True Positive Rate or Recall) and Precision.
( 1 Marks)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Text report of per-class precision, recall (sensitivity) and F1 for the predictions in y_pred.
print(classification_report(y, y_pred))

# Q10.Plot Receiver Operating Characteristic (ROC) Curves( 1 Marks)

In [None]:
#Hint: Use roc_curve

In [None]:
from sklearn.metrics import roc_curve,auc

# Score only the held-out test rows. rfcl was fitted on X_train, so building the
# curve from all of X would mostly grade the model on rows it has already seen
# and make the ROC curve (and the AUC derived from it) look far too good.
y_pred_prob = rfcl.predict_proba(X_test)
# Column 1 holds the probability of the second class in rfcl.classes_.
# NOTE(review): assumes "default" is coded 0/1 with 1 as the positive class - confirm.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob[:, 1])

In [None]:
# Draw the ROC curve with the chance diagonal as a baseline.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve')
ax.plot([0, 1], [0, 1], 'k--', label='Guess')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc="lower right");

ROC curve can help you to choose a threshold that balances sensitivity and specificity in a way that makes sense for your particular context

# Q11. Calculate AUC(the percentage of the ROC plot that is underneath the curve) - optional

In [None]:
# Area under the ROC curve, computed from the fpr/tpr points; the bare name displays the value.
roc_auc = auc(fpr, tpr)
roc_auc

### Bootstrapping ( Bonus)
##### Given a dataset of size n, a bootstrap sample is created by sampling n instances uniformly from the data (with/without replacement)
##### Create a model with each bootstrap sample and validate it with the test set
##### Final result is calculated by averaging the accuracy of models

In [None]:
# Number of iterations for bootstrapping
bootstrap_iteration = 10  # how many bootstrap samples to draw and fit
accuracy = []  # filled with one accuracy score per bootstrap iteration

In [None]:
from sklearn.base import clone
from sklearn.utils import resample
from sklearn.metrics import accuracy_score

# Start from an empty list so that re-running this cell does not pile new scores
# on top of the ones left over from a previous run.
accuracy = []

for i in range(bootstrap_iteration):
    # Draw the bootstrap sample (with replacement) from the training rows only,
    # seeding each draw so the whole experiment is repeatable.
    X_, y_ = resample(X_train, y_train, random_state=seed + i)

    # Fit a fresh copy of the forest so the model held in `rfcl` is left as it was.
    boot_model = clone(rfcl).fit(X_, y_)

    # Validate on the held-out test set. Predicting on the same rows the sample
    # was drawn from would reward memorisation and report inflated accuracy.
    acc = accuracy_score(y_test, boot_model.predict(X_test))
    accuracy.append(acc)

In [None]:
# Summarise the bootstrap accuracies: mean and standard deviation across iterations.
accuracy = np.array(accuracy)
print('Accuracy Score')
print('Average: ', accuracy.mean())  # label was misspelled 'Avearge' in the printed output
print('Standard deviation: ', accuracy.std())