# Scikit-Learn Tutorial 

In this notebook, we will go through a simple classification example. 
Let's go!

Creator: Eugene Yang   <eugene@ir.cs.georgetown.edu>

In [1]:
# import essential packages
import numpy as np
import pandas as pd

In [2]:
# load dataset
from sklearn import datasets
boston_dataset = datasets.load_boston()

In [3]:
print(boston_dataset.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [4]:
# make the dataset into a pandas DataFrame
df = pd.DataFrame(boston_dataset.data, columns = boston_dataset.feature_names).assign(TARGET=boston_dataset.target)

In [5]:
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,TARGET
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
5,0.02985,0.0,2.18,0.0,0.458,6.430,58.7,6.0622,3.0,222.0,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.60,12.43,22.9
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.90,19.15,27.1
8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.10,18.9


In [6]:
# Let's make it into a classification problem
# Suppose we are trying to classify them into two buckets: <30 and >=30
Y = df.TARGET >= 30
X = df.drop("TARGET", axis=1) # we don't want the target variable to be in our X

In [7]:
Y

0      False
1      False
2       True
3       True
4       True
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
476    False
477    False
478    False
479    False
480    False
481    False
482    False
483    False
484    False
485    False
486    False
487    False
488    False
489    False
490    False
491    False
492    False
493    False
494    False
495    False
496    False
497    False
498    False
499    False
500    False
501    False
502    False
503    False
504    False
505    False
Name: TARGET, Length: 506, dtype: bool

## Training-Testing Split

In [8]:
# Let's do a training/testing split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.2 )

In [9]:
X_train

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
372,8.26725,0.0,18.10,1.0,0.6680,5.875,89.6,1.1296,24.0,666.0,20.2,347.88,8.88
12,0.09378,12.5,7.87,0.0,0.5240,5.889,39.0,5.4509,5.0,311.0,15.2,390.50,15.71
85,0.05735,0.0,4.49,0.0,0.4490,6.630,56.1,4.4377,3.0,247.0,18.5,392.30,6.53
266,0.78570,20.0,3.97,0.0,0.6470,7.014,84.6,2.1329,5.0,264.0,13.0,384.07,14.79
139,0.54452,0.0,21.89,0.0,0.6240,6.151,97.9,1.6687,4.0,437.0,21.2,396.90,18.46
504,0.10959,0.0,11.93,0.0,0.5730,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48
365,4.55587,0.0,18.10,0.0,0.7180,3.561,87.9,1.6132,24.0,666.0,20.2,354.70,7.12
216,0.04560,0.0,13.89,1.0,0.5500,5.888,56.0,3.1121,5.0,276.0,16.4,392.80,13.51
455,4.75237,0.0,18.10,0.0,0.7130,6.525,86.5,2.4358,24.0,666.0,20.2,50.92,18.13
442,5.66637,0.0,18.10,0.0,0.7400,6.219,100.0,2.0048,24.0,666.0,20.2,395.69,16.59


In [10]:
# Let's create a model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(penalty="l2", C=10)

In [11]:
# Fit it!
model.fit( X_train, Y_train )

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
# Let's do a prediction on the testing data
y_pred = model.predict( X_test )

In [13]:
# Or we can also look at the probability of each classes
model.predict_proba( X_test )

array([[9.99998492e-01, 1.50756579e-06],
       [6.66171235e-01, 3.33828765e-01],
       [9.99704796e-01, 2.95204180e-04],
       [9.99127705e-01, 8.72295164e-04],
       [3.88517383e-01, 6.11482617e-01],
       [9.97983709e-01, 2.01629091e-03],
       [9.95872596e-01, 4.12740366e-03],
       [9.99794556e-01, 2.05444420e-04],
       [9.99902354e-01, 9.76461739e-05],
       [9.99943810e-01, 5.61901860e-05],
       [9.80206282e-01, 1.97937181e-02],
       [9.99739131e-01, 2.60869258e-04],
       [9.99994831e-01, 5.16871662e-06],
       [8.57231641e-01, 1.42768359e-01],
       [9.75554749e-01, 2.44452514e-02],
       [9.99583772e-01, 4.16227665e-04],
       [9.99999655e-01, 3.45357154e-07],
       [9.69836900e-01, 3.01630997e-02],
       [9.67779491e-01, 3.22205086e-02],
       [9.99857590e-01, 1.42409725e-04],
       [9.99983613e-01, 1.63874365e-05],
       [9.99998406e-01, 1.59377722e-06],
       [1.79604910e-01, 8.20395090e-01],
       [9.92919368e-01, 7.08063221e-03],
       [9.999999

In [14]:
# Let's look at some metrics
from sklearn.metrics import roc_auc_score
roc_auc_score( Y_test, y_pred )

0.8705882352941177

You can find more metrics at https://scikit-learn.org/stable/modules/model_evaluation.html#model-evaluation

## 10-Fold Cross Validation

In [15]:
# There is a good CV wrapper in sklearn
from sklearn.model_selection import cross_val_score

In [16]:
# We need to make the target metric into a scorer
from sklearn.metrics import make_scorer
scorer = make_scorer( roc_auc_score, greater_is_better=True ) # if we are using loss/cost/penalty, lower would be better

In [17]:
cv_results = cross_val_score(model, X, Y, cv=10, scoring=scorer)

In [18]:
cv_results

array([0.76614987, 0.92118863, 0.77777778, 0.73015873, 0.90178571,
       0.91666667, 0.92559524, 0.70238095, 0.9375    , 0.625     ])

In [19]:
# mean of the cv results?
cv_results.mean()

0.8204203580657069

In [20]:
# standard deviation of the results?
cv_results.std()

0.10762925733242418