In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo

In [None]:
# Download the Spambase dataset (UCI ML Repository id 94).
spambase = fetch_ucirepo(id=94)
spam_data = spambase.data

# Features are kept as provided; the target table is flattened to a 1-D array.
X, y = spam_data.features, np.ravel(spam_data.targets)

# Optional inspection of the dataset description:
#print(spambase.metadata)
#print(spambase.variables)

In [None]:
# Column names, dtypes and non-null counts of the feature table.
X.info()

In [None]:
# Preview the first five rows of the features.
X.head(5)

In [None]:
# Summary statistics, transposed so that each feature is a row.
X.describe().T

## I. **Are there any missing values?**

In [None]:
# Number of null entries in each column.
X.isnull().sum()

In [None]:
"""
There is no missing data and there are no categorical columns to encode. All feature types are int64 or float64.
"""

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

## II. **What's the difference between the ROC Curve and AUC?**

### What is ROC Curve?

* 1. Receiver Operating Characteristic Curve
* 2. Plot of true positive rate (recall) vs false positive rate at various threshold settings.
* 3. Points above the diagonal represent better-than-random classification.
* 4. The ideal curve would pass through the point in the upper-left corner (true positive rate 1, false positive rate 0).
* 5. The more the curve bends toward the upper-left corner, the better.

### What is AUC?

* 1. Area Under the Curve.
* 2. Equal to probability that a classifier will rank a randomly chosen positive instance higher than a randomly chosen negative one.
* 3. ROC AUC of 0.5 is a useless classifier, 1.0 is perfect.
* 4. Commonly used metric for comparing classifiers.

## III. **Modeling with RandomForestClassifier.**

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
from sklearn.ensemble import RandomForestClassifier

I chose RandomForestClassifier because it is easy to use, fast and robust. When splitting a node, it also searches for the best feature among a random subset of features. This creates a wide diversity among the trees, which generally results in a better model.

In [None]:
# Fix the seed so the cross-validation scores and the fitted forest are
# repeatable across runs (same seed value as the train/test split).
randomforest = RandomForestClassifier(random_state=42)

## IV. **Cross-Validation Checks.**

In [None]:
# 5-fold cross-validation on the full dataset, scored with ROC AUC.
scores = cross_val_score(randomforest, X, y, scoring='roc_auc', cv=5)

# Report the average of the per-fold scores.
mean_score = scores.mean()
print(f'Random Forest Cross-Validation: {mean_score:.2f}')

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## V. **RandomForestClassifier Model**

In [None]:
# Train on the training split, then predict labels for the held-out rows.
rf_predict = randomforest.fit(X_train, y_train).predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
rf_accuracy = accuracy_score(y_test, rf_predict)
print(f'RandomForest Accuracy Score: {rf_accuracy:.2f}')

## VI. **ROC Curve & AUC**

In [None]:
from sklearn.metrics import roc_curve, auc, RocCurveDisplay

# Predicted probability of the positive class (second column of predict_proba).
y_scores = randomforest.predict_proba(X_test)[:, 1]

# ROC points and the area under them, kept as named variables for reuse.
fpr, tpr, _ = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)

# Build the plot with from_predictions rather than the RocCurveDisplay
# constructor: the constructor's `estimator_name` keyword is deprecated in
# recent scikit-learn releases, while `name=` here works on old and new ones.
fig, ax = plt.subplots(figsize=(8, 6))
RocCurveDisplay.from_predictions(y_test, y_scores,
                                 name="RandomForestClassifier",
                                 ax=ax)

# Diagonal reference line: the expected curve of a random classifier.
ax.plot([0, 1], [0, 1], linestyle="--", color="grey", label="Chance (AUC = 0.50)")
ax.set_title("ROC Curve - RandomForestClassifier", fontweight="bold")
ax.legend(loc="lower right")

plt.show()

## VII. **Confusion Matrix.**

In [None]:
# Spambase encodes the target as 0 = not spam, 1 = spam. Pin the class order
# explicitly so the rows/columns of the matrix and the tick labels cannot drift
# apart (the previous labels were in the reverse order of the matrix).
class_order = [0, 1]
target_values = ['Not Spam', 'Spam']

# normalize='true' divides each row by its total, so every row (true class)
# sums to 1 and the diagonal holds the per-class recall.
matrix = confusion_matrix(y_test, rf_predict, labels=class_order, normalize='true')

# Heatmap Plot
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(matrix, annot=True, annot_kws={'size':12, 'fontweight':'bold'},
            cmap=plt.cm.Reds, linewidths=0.3,
            xticklabels=target_values, yticklabels=target_values, ax=ax)

# Keep the y tick labels horizontal for readability.
ax.tick_params(axis='y', labelrotation=0)

ax.set_xlabel('Values Predicted', fontweight='bold')
ax.set_ylabel('True Label', fontweight='bold')
ax.set_title('Confusion Matrix RandomForest Heatmap', fontweight='bold')

plt.show()

## VIII. **Classification Report.**

In [None]:
# Per-class precision, recall and F1 on the held-out rows, under a heading line.
report_text = classification_report(y_test, rf_predict)
print('\t\t\t Classification Report')
print(report_text)