## Imports
- sklearn.datasets - loaders for popular datasets, plus artificial data generators
- sklearn.model_selection - tools for splitting data, cross-validation, and hyper-parameter tuning
- sklearn.preprocessing - change raw features into a suitable representation
- sklearn.neighbors - neighbors-based learning methods

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split    # a splitter in model_selection
from sklearn.preprocessing import StandardScaler        # standardize features by removing the mean and scaling to unit variance.
from sklearn.neighbors import KNeighborsClassifier

### Loading and examining datasets
- sklearn.datasets -> load_breast_cancer

In [18]:
# load_breast_cancer(as_frame=True) returns a Bunch (a dict-like object);
# its "frame" entry is a pandas DataFrame holding the feature columns plus `target`.
data = load_breast_cancer(as_frame=True)["frame"]
print(type(data))
data

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [21]:
# return_X_y=True makes the loader hand back a (data, target) tuple,
# which is unpacked here into the feature matrix X and the label vector y.
X, y = load_breast_cancer(return_X_y=True)
# One print call, one item per line: type of X, row counts, then the arrays.
print(type(X), len(X), len(y), X, y, sep="\n")

<class 'numpy.ndarray'>
569
569
[[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]
 ...
 [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]
 [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]
 [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 

### Splitting dataset
- sklearn.model_selection -> train_test_split

In [23]:
# Fix the seed so the 80/20 split (and every result derived from it) is
# reproducible under Restart Kernel -> Run All; without random_state the
# rows are shuffled differently on each run.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)
X_test

array([[1.361e+01, 2.469e+01, 8.776e+01, ..., 1.329e-01, 3.470e-01,
        7.900e-02],
       [1.566e+01, 2.320e+01, 1.102e+02, ..., 2.462e-01, 3.277e-01,
        1.019e-01],
       [1.495e+01, 1.757e+01, 9.685e+01, ..., 1.667e-01, 3.414e-01,
        7.147e-02],
       ...,
       [1.239e+01, 1.748e+01, 8.064e+01, ..., 9.804e-02, 2.819e-01,
        1.118e-01],
       [1.194e+01, 2.076e+01, 7.787e+01, ..., 1.155e-01, 2.465e-01,
        9.981e-02],
       [1.294e+01, 1.617e+01, 8.318e+01, ..., 8.388e-02, 3.297e-01,
        7.834e-02]], shape=(114, 30))

### Scaling dataset
- sklearn.preprocessing -> StandardScaler

In [25]:
# Build a fresh scaler, learn its statistics from the training split only,
# and then standardize that same split.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
# Show the raw and the standardized training features one after the other.
print(X_train, X_train_scaled, sep="\n")

[[2.018e+01 1.954e+01 1.338e+02 ... 2.173e-01 3.032e-01 8.075e-02]
 [1.051e+01 2.309e+01 6.685e+01 ... 3.125e-02 2.227e-01 6.777e-02]
 [1.785e+01 1.323e+01 1.146e+02 ... 8.341e-02 1.783e-01 5.871e-02]
 ...
 [1.846e+01 1.852e+01 1.211e+02 ... 1.642e-01 3.695e-01 8.579e-02]
 [1.532e+01 1.727e+01 1.032e+02 ... 2.229e-01 3.258e-01 1.191e-01]
 [1.328e+01 1.372e+01 8.579e+01 ... 9.173e-02 2.736e-01 7.320e-02]]
[[ 1.682258    0.03737057  1.6870217  ...  1.56576051  0.20974713
  -0.17180979]
 [-1.02162223  0.87042074 -1.02826555 ... -1.26549382 -1.07244541
  -0.89552964]
 [ 1.03075428 -1.44334677  0.90832842 ... -0.47173824 -1.77964229
  -1.40068386]
 ...
 [ 1.20131963 -0.20198469  1.17194854 ...  0.75770029  1.2657641
   0.10920316]
 [ 0.3233275  -0.49531221  0.44597928 ...  1.65097967  0.56971671
   1.96645341]
 [-0.24708777 -1.32836238 -0.26011707 ... -0.34512692 -0.26171746
  -0.59277164]]


In [26]:
# Only transform() is called here (no fit), so the test split is scaled with
# whatever statistics `scaler` already holds rather than with its own.
X_test_scaled = scaler.transform(X_test)
# Show the raw and the standardized test features one after the other.
print(X_test, X_test_scaled, sep="\n")

[[1.361e+01 2.469e+01 8.776e+01 ... 1.329e-01 3.470e-01 7.900e-02]
 [1.566e+01 2.320e+01 1.102e+02 ... 2.462e-01 3.277e-01 1.019e-01]
 [1.495e+01 1.757e+01 9.685e+01 ... 1.667e-01 3.414e-01 7.147e-02]
 ...
 [1.239e+01 1.748e+01 8.064e+01 ... 9.804e-02 2.819e-01 1.118e-01]
 [1.194e+01 2.076e+01 7.787e+01 ... 1.155e-01 2.465e-01 9.981e-02]
 [1.294e+01 1.617e+01 8.318e+01 ... 8.388e-02 3.297e-01 7.834e-02]]
[[-0.15481471  1.24587997 -0.1802199  ...  0.28138607  0.9073873
  -0.26938373]
 [ 0.41839671  0.89623356  0.72987787 ...  2.00555223  0.59997964
   1.00744097]
 [ 0.21986983 -0.42491361  0.1884427  ...  0.79574455  0.81819129
  -0.68923045]
 ...
 [-0.49594541 -0.44603319 -0.46898532 ... -0.24910319 -0.12951624
   1.55943069]
 [-0.62177231  0.32365824 -0.58132805 ...  0.01659797 -0.6933624
   0.89090981]
 [-0.34215698 -0.75344044 -0.36597069 ... -0.46458592  0.63183536
  -0.30618304]]
