# Classification Metrics

## 1. Classification Accuracy

In [1]:
# Classification accuracy of a logistic regression model, estimated with
# 10-fold cross-validation on the Pima Indians diabetes CSV.
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

# Column names are supplied explicitly through `names`.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

df = pd.read_csv(url, names=names)
arr = df.values

# Columns 0-7 are used as features, column 8 ('class') as the target.
X, y = arr[:, :8], arr[:, 8]

# Shuffled 10-fold splitter with a fixed seed.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
model = LogisticRegression(solver='liblinear')
scoring = 'accuracy'

# `results` holds one score per fold; the next cell summarises it.
results = model_selection.cross_val_score(
    model, X, y, scoring=scoring, cv=kfold
)

In [2]:
# Report the fold scores as "mean (standard deviation)".
mean_accuracy = results.mean()
std_accuracy = results.std()
print("Accuracy: %.3f (%.3f)" % (mean_accuracy, std_accuracy))

Accuracy: 0.771 (0.051)


## 2. Log Loss

In [3]:
# Log loss of a logistic regression model, estimated with 10-fold
# cross-validation on the Pima Indians diabetes CSV.
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

# Column names are supplied explicitly through `names`.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

df = pd.read_csv(url, names=names)
arr = df.values

# Columns 0-7 are used as features, column 8 ('class') as the target.
X, y = arr[:, :8], arr[:, 8]

# Shuffled 10-fold splitter with a fixed seed.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
model = LogisticRegression(solver='liblinear')

# The scorer is the *negated* log loss, so the per-fold values are <= 0.
scoring = 'neg_log_loss'
results = model_selection.cross_val_score(
    model, X, y, scoring=scoring, cv=kfold
)

In [4]:
# Report the fold scores as "mean (standard deviation)". The scores come
# from the 'neg_log_loss' scorer, so the mean is printed with a minus sign.
logloss_summary = "Logloss: %.3f (%.3f)" % (results.mean(), results.std())
print(logloss_summary)

Logloss: -0.494 (0.042)


## 3. Area Under ROC Curve

In [5]:
# Area under the ROC curve for a logistic regression model, estimated with
# 10-fold cross-validation on the Pima Indians diabetes CSV.
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

# Column names are supplied explicitly through `names`.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

df = pd.read_csv(url, names=names)
arr = df.values

# Columns 0-7 are used as features, column 8 ('class') as the target.
X, y = arr[:, :8], arr[:, 8]

# Shuffled 10-fold splitter with a fixed seed.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
model = LogisticRegression(solver='liblinear')
scoring = 'roc_auc'

# `results` holds one AUC value per fold; the next cell summarises it.
results = model_selection.cross_val_score(
    model, X, y, scoring=scoring, cv=kfold
)

In [6]:
# Report the fold scores as "mean (standard deviation)".
mean_auc = results.mean()
std_auc = results.std()
print("AUC: %.3f (%.3f)" % (mean_auc, std_auc))

AUC: 0.826 (0.050)


## 4. Confusion Matrix

In [7]:
# Confusion matrix for a logistic regression model on the Pima Indians
# diabetes CSV. This cell uses a single hold-out train/test split rather
# than cross-validation.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Column names are supplied explicitly through `names`.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

df = pd.read_csv(url, names=names)
arr = df.values

# Columns 0-7 are used as features, column 8 ('class') as the target.
X, y = arr[:, :8], arr[:, 8]

# Hold out 33% of the rows for evaluation; the seed fixes the split.
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=7
)

# Fit on the training portion only.
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

# Compare held-out labels with the model's predictions.
predicted = model.predict(X_test)
matrix = confusion_matrix(y_test, predicted)

In [8]:
# Bare expression: the notebook displays the confusion matrix as the cell
# output. NOTE: scikit-learn documents rows as true labels and columns as
# predicted labels.
matrix

array([[141,  21],
       [ 41,  51]], dtype=int64)

## 5. Classification Report

In [9]:
# Per-class precision / recall / F1 report for a logistic regression model
# on the Pima Indians diabetes CSV. This cell uses a single hold-out
# train/test split rather than cross-validation.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Column names are supplied explicitly through `names`.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

df = pd.read_csv(url, names=names)
arr = df.values

# Columns 0-7 are used as features, column 8 ('class') as the target.
X, y = arr[:, :8], arr[:, 8]

# Hold out 33% of the rows for evaluation; the seed fixes the split.
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=7
)

# Fit on the training portion only.
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

# `report` is a formatted text table built from the held-out predictions.
predicted = model.predict(X_test)
report = classification_report(y_test, predicted)

In [10]:
# Bare expression: the notebook shows the string's repr, so line breaks
# appear as literal "\n" in the output; print(report) would render the
# table layout instead.
report

'              precision    recall  f1-score   support\n\n         0.0       0.77      0.87      0.82       162\n         1.0       0.71      0.55      0.62        92\n\n    accuracy                           0.76       254\n   macro avg       0.74      0.71      0.72       254\nweighted avg       0.75      0.76      0.75       254\n'

# Regression Metrics

## 1. Mean Absolute Error

In [11]:
# Mean absolute error of a linear regression model, estimated with 10-fold
# cross-validation on the housing data file.
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LinearRegression

url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
# Column names are supplied explicitly through `names`.
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

# The file is whitespace-delimited. `sep=r"\s+"` replaces the
# `delim_whitespace=True` flag, which pandas deprecated in 2.2; both select
# the same whitespace-splitting behaviour.
df = pd.read_csv(url, sep=r"\s+", names=names)
arr = df.values

# Columns 0-12 are used as features, column 13 ('MEDV') as the target.
X = arr[:, 0:13]
y = arr[:, 13]

# Shuffled 10-fold splitter with a fixed seed.
kfold = model_selection.KFold(n_splits=10, random_state=7, shuffle=True)
model = LinearRegression()

# The scorer is the *negated* MAE, so the per-fold values are <= 0.
scoring = 'neg_mean_absolute_error'
results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)

In [12]:
# Report the fold scores as "mean (standard deviation)". The scores come
# from the 'neg_mean_absolute_error' scorer, hence the minus sign.
mae_summary = "MAE: %.3f (%.3f)" % (results.mean(), results.std())
print(mae_summary)

MAE: -3.387 (0.667)


## 2. Mean Squared Error

In [13]:
# Mean squared error of a linear regression model, estimated with 10-fold
# cross-validation on the housing data file.
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LinearRegression

url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
# Column names are supplied explicitly through `names`.
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

# The file is whitespace-delimited. `sep=r"\s+"` replaces the
# `delim_whitespace=True` flag, which pandas deprecated in 2.2; both select
# the same whitespace-splitting behaviour.
df = pd.read_csv(url, sep=r"\s+", names=names)
arr = df.values

# Columns 0-12 are used as features, column 13 ('MEDV') as the target.
X = arr[:, 0:13]
y = arr[:, 13]

# Shuffled 10-fold splitter with a fixed seed.
kfold = model_selection.KFold(n_splits=10, random_state=7, shuffle=True)
model = LinearRegression()

# The scorer is the *negated* MSE, so the per-fold values are <= 0.
scoring = 'neg_mean_squared_error'
results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)

In [14]:
# Report the fold scores as "mean (standard deviation)". The scores come
# from the 'neg_mean_squared_error' scorer, hence the minus sign.
mean_mse = results.mean()
std_mse = results.std()
print("MSE: %.3f (%.3f)" % (mean_mse, std_mse))

MSE: -23.747 (11.143)


## 3. R^2 Metric

In [15]:
# R^2 (coefficient of determination) of a linear regression model,
# estimated with 10-fold cross-validation on the housing data file.
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LinearRegression

url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
# Column names are supplied explicitly through `names`.
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

# The file is whitespace-delimited. `sep=r"\s+"` replaces the
# `delim_whitespace=True` flag, which pandas deprecated in 2.2; both select
# the same whitespace-splitting behaviour.
df = pd.read_csv(url, sep=r"\s+", names=names)
arr = df.values

# Columns 0-12 are used as features, column 13 ('MEDV') as the target.
X = arr[:, 0:13]
y = arr[:, 13]

# Shuffled 10-fold splitter with a fixed seed.
kfold = model_selection.KFold(n_splits=10, random_state=7, shuffle=True)
model = LinearRegression()
scoring = 'r2'

# `results` holds one R^2 value per fold; the next cell summarises it.
results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)

In [16]:
# Report the fold scores as "mean (standard deviation)".
r2_summary = "R^2: %.3f (%.3f)" % (results.mean(), results.std())
print(r2_summary)

R^2: 0.718 (0.099)
