In [27]:
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, model_selection
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import metrics

In [17]:
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
dataframe = pd.read_csv(url, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:,0:13]
Y = array[:,13]
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = LinearRegression()

# Regression

## Mean Absolute Error
The Mean Absolute Error (or MAE) is the sum of the absolute differences between predictions and actual values.

In [22]:
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring='neg_median_absolute_error')
print("MAE: {:.3f} ({:.3f})".format(results.mean(), results.std()))

MAE: -3.398 (1.675)


## Mean Squared Error
The Mean Squared Error (or MSE) is much like the mean absolute error in that it provides a gross idea of the magnitude of error.
https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

In [23]:
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring='neg_mean_squared_error')
print("MSE: {:.3f} ({:.3f})".format(results.mean(), results.std()))

MSE: -34.705 (45.574)


## R Squared

In [24]:
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring='r2')
print("R^2: {:.3f} ({:.3f})".format(results.mean(), results.std()))

R^2: 0.203 (0.595)


# Classification

In [28]:
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = LogisticRegression()

## Classification Accuracy

In [30]:
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
print("Accuracy: {:.3f} ({:.3f})".format(results.mean(), results.std()))

Accuracy: 0.770 (0.048)


## Logarithmic Loss

In [31]:
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring='neg_log_loss')
print("LogLoss: {:.3f} ({:.3f})".format(results.mean(), results.std()))

LogLoss: -0.493 (0.047)


## Area Under ROC Curve

In [32]:
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring='roc_auc')
print("AUC: {:.3f} ({:.3f})".format(results.mean(), results.std()))

AUC: 0.824 (0.041)


## Confusion Matrix

In [35]:
test_size = 0.33
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=test_size, random_state=seed)
model = LogisticRegression()
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
print('healthy', 'sick')
print(matrix)
# diagonal - correct predictions

healthy sick
[[141  21]
 [ 41  51]]


## Classification Report

In [36]:
model = LogisticRegression()
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
report = classification_report(Y_test, predicted)
print(report)

             precision    recall  f1-score   support

        0.0       0.77      0.87      0.82       162
        1.0       0.71      0.55      0.62        92

avg / total       0.75      0.76      0.75       254

