## Loading and training with sklearn

* Loading the dataset
* Training a classification model
* Making predictions

In [41]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier

In [3]:
diabetes = datasets.load_diabetes()

In [6]:
# What do we have?

dir(diabetes)

['DESCR', 'data', 'feature_names', 'target']

In [7]:
diabetes.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [5]:
diabetes.data

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286377, -0.02593034],
       ..., 
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04687948,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452837, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00421986,  0.00306441]])

In [14]:
x = diabetes.data
y = diabetes.target

## Assign to Train / Test

In [25]:
x.shape, y.shape


((442, 10), (442,))

In [19]:
# Assign to train and test sets

In [21]:
from sklearn.model_selection import train_test_split

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)

In [24]:
x_train.shape, x_test.shape

((265, 10), (177, 10))

### Train the model

In [26]:
model = KNeighborsClassifier(n_neighbors=1)

In [27]:
model.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

## Make predictions

In [34]:
# Get the first array from test
first = x_test[0]
first

array([ 0.01991321,  0.05068012,  0.10480869,  0.07007254, -0.03596778,
       -0.0266789 , -0.02499266, -0.00259226,  0.00371174,  0.04034337])

In [31]:
# The real target is

y_test[0]

321.0

In [45]:
model.predict(x_test[0].reshape(1,-1)) # Making a prediction for only one record

array([ 336.])

In [49]:
model.predict(first.reshape(1,-1)) - y_test[0]

array([ 15.])

In [65]:
total_count = 0
total_sum = 0
for count,value in enumerate(x_test):
    total_count += 1
    real = y_test[count]
    predicted = model.predict(x_test[count].reshape(1,-1))
    diff = real-predicted
    print("diff: "+str(diff[0]))
print("Total count: "+str(total_count))



diff: -15.0
diff: -5.0
diff: -98.0
diff: -29.0
diff: -118.0
diff: 147.0
diff: 83.0
diff: -43.0
diff: 47.0
diff: -211.0
diff: 121.0
diff: 52.0
diff: -49.0
diff: 11.0
diff: -68.0
diff: 6.0
diff: -58.0
diff: 12.0
diff: 2.0
diff: -113.0
diff: -82.0
diff: 188.0
diff: -115.0
diff: -74.0
diff: -161.0
diff: -134.0
diff: -67.0
diff: 45.0
diff: 25.0
diff: 74.0
diff: 32.0
diff: 139.0
diff: 24.0
diff: -6.0
diff: 152.0
diff: 16.0
diff: -63.0
diff: 180.0
diff: -39.0
diff: 89.0
diff: -64.0
diff: 72.0
diff: -167.0
diff: -67.0
diff: 122.0
diff: 24.0
diff: 66.0
diff: 69.0
diff: -27.0
diff: -26.0
diff: -69.0
diff: 66.0
diff: -103.0
diff: 20.0
diff: -85.0
diff: -10.0
diff: 1.0
diff: -89.0
diff: 163.0
diff: -30.0
diff: -258.0
diff: 96.0
diff: 4.0
diff: -4.0
diff: -133.0
diff: 189.0
diff: -12.0
diff: 49.0
diff: -92.0
diff: -15.0
diff: 14.0
diff: -32.0
diff: 35.0
diff: -51.0
diff: -120.0
diff: 56.0
diff: -12.0
diff: -86.0
diff: -152.0
diff: 216.0
diff: 74.0
diff: -14.0
diff: -5.0
diff: -14.0
diff: -12.0
diff

In [39]:
test_predicts = model.predict(x_test) # Making a prediction for all records in test

In [40]:
test_predicts - y_test


array([  15.,    5.,   98.,   29.,  118., -147.,  -83.,   43.,  -47.,
        211., -121.,  -52.,   49.,  -11.,   68.,   -6.,   58.,  -12.,
         -2.,  113.,   82., -188.,  115.,   74.,  161.,  134.,   67.,
        -45.,  -25.,  -74.,  -32., -139.,  -24.,    6., -152.,  -16.,
         63., -180.,   39.,  -89.,   64.,  -72.,  167.,   67., -122.,
        -24.,  -66.,  -69.,   27.,   26.,   69.,  -66.,  103.,  -20.,
         85.,   10.,   -1.,   89., -163.,   30.,  258.,  -96.,   -4.,
          4.,  133., -189.,   12.,  -49.,   92.,   15.,  -14.,   32.,
        -35.,   51.,  120.,  -56.,   12.,   86.,  152., -216.,  -74.,
         14.,    5.,   14.,   12.,  104.,  -43.,  -16.,   35.,  -20.,
        -52.,   -6.,    1., -100.,  -56.,   76.,   24., -128.,  -62.,
        -13., -119.,   46.,   44.,  -85.,  112.,   17.,  -28.,    9.,
         76., -107.,  -49.,   -9.,   43.,   22.,  -59.,  -10.,    0.,
         28., -130.,  114.,   32.,  128.,  -69.,   -7.,  -47.,   60.,
        -97.,   98.,

In [42]:
np.std(test_predicts - y_test)

81.777309178442238