Standard methodology using a basic model on the Iris dataset to learn all the perks of the scikit-learn package

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, roc_curve
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#sphx-glr-auto-examples-model-selection-plot-roc-py
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split

# Step by Step

In [3]:
# Load the Iris bunch and pull out the feature matrix and the target vector.
iris = load_iris()
X = iris.data
y = iris.target

In [13]:
# Print the column names, then let the cell display the class names.
print(iris.feature_names)
iris.target_names

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [28]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Fit the model on the training split only.
# Note: this is a regressor fitted on the integer class codes; its continuous
# output is rounded to a class label in a later cell.
clf = LinearRegression().fit(X_train, y_train)

In [30]:
# LinearRegression returns continuous scores, so convert them to class labels:
#   1. round to the nearest integer,
#   2. clip to the label range seen in training -- a score below -0.5 or above
#      2.5 would otherwise round to a class that does not exist and be silently
#      left out of confusion_matrix(..., labels=[0, 1, 2]),
#   3. cast to int so the predictions have the same dtype as y.
label_min, label_max = y_train.min(), y_train.max()
y_pred_train = np.clip(clf.predict(X_train).round(0), label_min, label_max).astype(int)
y_pred_test = np.clip(clf.predict(X_test).round(0), label_min, label_max).astype(int)

In [31]:
# Confusion matrices for the train split, then the test split, in class order 0, 1, 2.
class_labels = [0, 1, 2]
cm_train = confusion_matrix(y_train, y_pred_train, labels=class_labels)
cm_test = confusion_matrix(y_test, y_pred_test, labels=class_labels)
print(cm_train, '------------', cm_test, sep='\n')

[[40  0  0]
 [ 0 39  2]
 [ 0  1 38]]
------------
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


In [32]:
# For a LinearRegression, .score() is the R^2 of the continuous output,
# not the classification accuracy of the rounded labels.
r2_train = clf.score(X_train, y_train)
r2_test = clf.score(X_test, y_test)
r2_train, r2_test

(0.9254199044989623, 0.9468960016420045)

In [34]:
# Per-class precision / recall / F1 for the train split, then the test split.
report_train = classification_report(y_train, y_pred_train)
report_test = classification_report(y_test, y_pred_test)
print(
    report_train,
    '------------------------------------------------------',
    report_test,
    sep='\n',
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       0.97      0.95      0.96        41
           2       0.95      0.97      0.96        39

    accuracy                           0.97       120
   macro avg       0.97      0.98      0.97       120
weighted avg       0.98      0.97      0.98       120

------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



# Pipeline

In [35]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [46]:
# Reload the data and redo the same seeded 80/20 split so this section
# can be run on its own.
iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [43]:
# Fresh, unfitted pipeline steps: a min-max feature scaler and the linear model.
scaler, clf = MinMaxScaler(), LinearRegression()

In [54]:
# Chain the steps: features are scaled first, then passed to the model.
pipeline_steps = [
    ('scaler', scaler),
    ('clf', clf),
]
pipeline = Pipeline(pipeline_steps, verbose=False)

In [55]:
# Fit every step on the training split; the fitted pipeline is the cell output.
pipeline.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('clf',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)

In [58]:
# Raw (unrounded) pipeline predictions for the train and test splits.
yp_train, yp_test = (pipeline.predict(split) for split in (X_train, X_test))

In [60]:
# Pipeline.score() delegates to the final step, so these are R^2 values:
# the train score is printed, the test score is the cell output.
train_score = pipeline.score(X_train, y_train)
print(train_score)
pipeline.score(X_test, y_test)

0.9254199044989623


0.9468960016420045