In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo

In [None]:
# Download the Spambase dataset (UCI Machine Learning Repository id 94).
spambase = fetch_ucirepo(id=94)

# Features are kept as a pandas DataFrame; the targets are flattened to a
# 1-D array so they can be passed directly as `y` to the estimators below.
spam_data = spambase.data
X = spam_data.features
y = np.ravel(spam_data.targets)

# Optional: uncomment to inspect dataset metadata and per-variable information.
# print(spambase.metadata)
# print(spambase.variables)

In [None]:
# Summarise the feature frame: column names, dtypes, non-null counts and memory usage.
X.info()

In [None]:
# Preview the first five rows of the feature frame.
X.head(n=5)

In [None]:
# Summary statistics, transposed so each feature is a row (easier to read with many columns).
X.describe().transpose()

In [None]:
# Count missing values per feature column.
X.isna().sum()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

### ROC Curve.

* Receiver Operating Characteristic Curve
* Plot of true positive rate (recall) vs false positive rate at various threshold settings.
* Points above the diagonal represent good (better-than-random) classification.
* An ideal curve would just be a point in the upper-left corner (true positive rate 1, false positive rate 0).
* The more the curve bends toward the upper left, the better.

### AUC.

* Area Under the Curve.
* Equal to the probability that a classifier will rank a randomly chosen positive instance higher than a randomly chosen negative one.
* A ROC AUC of 0.5 is a useless classifier, 1.0 is perfect.
* Commonly used metric for comparing classifiers.

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seed the forest so its bootstrap sampling and feature sub-sampling are repeatable;
# without random_state the cross-validation scores, accuracy and confusion matrix
# below change on every run. 42 matches the seed used for train_test_split.
randomforest = RandomForestClassifier(random_state=42)

In [None]:
# 5-fold cross-validated ROC AUC of the random forest on the full dataset;
# report the mean over the folds.
scores = cross_val_score(randomforest, X, y, scoring='roc_auc', cv=5)

print('Random Forest', scores.mean())

### Confusion Matrix

* Allows us to understand true positives and true negatives, as well as false positives and false negatives.

In [None]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit on the training split (fit returns the estimator itself), predict the
# held-out split, then report plain accuracy.
rf_predict = randomforest.fit(X_train, y_train).predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predict)
print('RandomForest:', rf_accuracy)

In [None]:
# Row-normalised confusion matrix for the random forest, drawn as a heatmap.
#
# confusion_matrix lays out rows (true class) and columns (predicted class) in the
# order given by `labels`. Spambase encodes the target as 0 = not spam, 1 = spam
# (per the UCI dataset description), so the display names must follow that order;
# listing 'Spam' first would swap the labels on both axes.
class_labels = [0, 1]
target_values = ['Not Spam', 'Spam']

matrix = confusion_matrix(y_test, rf_predict, labels=class_labels)
# Divide every row by its total so each true class sums to 1 and the diagonal
# reads as per-class recall.
matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]

# Heatmap Plot: tick labels are passed straight to seaborn so they stay attached
# to the right cells instead of being overridden afterwards.
plt.figure(figsize=(8, 6))
sns.heatmap(matrix, annot=True, annot_kws={'size': 10},
            cmap=plt.cm.Reds, linewidths=0.2,
            xticklabels=target_values, yticklabels=target_values)

# Keep the y tick labels horizontal for readability.
plt.yticks(rotation=0)

plt.xlabel('Values Predicted')
plt.ylabel('True Label')
plt.title('Confusion Matrix RandomForest Heatmap', fontweight='bold')

plt.show()

### Classification Report.

In [None]:
# Per-class precision, recall, F1 and support for the test-set predictions.
report = classification_report(y_test, rf_predict)
print(report)