In [7]:
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import pickle

# Breast cancer dataset

In [21]:
data = datasets.load_breast_cancer(as_frame=True)
print(data['DESCR'])

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [29]:
data['data']

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [31]:
# split the dataset
X = data['data']
y = data['frame'].target
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2)

In [None]:
# uczymt na uczacym
# testujemy na uczacym i testowym - ok

In [49]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

### SVM for area, smoothness

In [71]:
X_train_area_smooth = X_train[['mean area', 'mean smoothness']]
X_test_area_smooth = X_test[['mean area', 'mean smoothness']]

#### without scaling

In [72]:
svm_clf_without_scaling = Pipeline([("linear_svc", LinearSVC(C=1, loss="hinge"))])

In [92]:
# train the model
svm_clf_without_scaling.fit(X_train_area_smooth, y_train)



Pipeline(steps=[('linear_svc', LinearSVC(C=1, loss='hinge'))])

In [93]:
y_test_no_scaling_pred = svm_clf_without_scaling.predict(X_test_area_smooth)
y_train_no_scaling_pred = svm_clf_without_scaling.predict(X_train_area_smooth)

In [94]:
test_no_scaling_acc = accuracy_score(y_test, y_test_no_scaling_pred)
train_no_scaling_acc = accuracy_score(y_train, y_train_no_scaling_pred)
print('test_no_scaling_acc', test_no_scaling_acc)
print('train_no_scaling_acc', train_no_scaling_acc)

test_no_scaling_acc 0.6578947368421053
train_no_scaling_acc 0.6197802197802198


### with scaling

In [95]:
svm_clf_with_scaling = Pipeline([("scaler", StandardScaler()),
                                    ("linear_svc", LinearSVC(C=1,
                                                             loss="hinge"))])

In [96]:
# train the model
svm_clf_with_scaling.fit(X_train_area_smooth, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('linear_svc', LinearSVC(C=1, loss='hinge'))])

In [97]:
y_test_scaling_pred = svm_clf_with_scaling.predict(X_test_area_smooth)
y_train_scaling_pred = svm_clf_with_scaling.predict(X_train_area_smooth)

In [98]:
test_scaling_acc = accuracy_score(y_test, y_test_scaling_pred)
train_scaling_acc = accuracy_score(y_train, y_train_scaling_pred)
print('test_scaling_acc', test_scaling_acc)
print('train_scaling_acc', train_scaling_acc)

test_scaling_acc 0.9210526315789473
train_scaling_acc 0.9032967032967033


### save data in the pickle

# Iris dataset

In [110]:
data2 = datasets.load_iris(as_frame=True)
print(data2['DESCR'])

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [111]:
# split the dataset
X2 = data['data']
y2 = data['frame'].target
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2)

#### SVM for sepal length, width - gatunek VIRGINICA (dla tego jednego gatunku predykcje robimy) !!!

In [112]:
X2

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [113]:
X2_train_len_wid = X2_train[['petal length (cm)', 'petal width (cm)']]
X2_test_len_wid = X2_test[['petal length (cm)', 'petal width (cm)']]

#### without scaling

In [114]:
svm_clf2_without_scaling = Pipeline([("linear_svc", LinearSVC(C=1, loss="hinge"))])

In [115]:
# train the model
svm_clf2_without_scaling.fit(X2_train_len_wid, y2_train)



Pipeline(steps=[('linear_svc', LinearSVC(C=1, loss='hinge'))])

In [116]:
y_test2_no_scaling_pred = svm_clf2_without_scaling.predict(X2_test_len_wid)
y_train2_no_scaling_pred = svm_clf2_without_scaling.predict(X2_train_len_wid)

In [119]:
test2_no_scaling_acc = accuracy_score(y2_test, y_test2_no_scaling_pred)
train2_no_scaling_acc = accuracy_score(y2_train, y_train2_no_scaling_pred)
print('test2_no_scaling_acc', test2_no_scaling_acc)
print('y_train2_no_scaling_pred', train2_no_scaling_acc)

test2_no_scaling_acc 0.7
y_train2_no_scaling_pred 0.7583333333333333


### with scaling

In [95]:
svm_clf2_with_scaling = Pipeline([("scaler", StandardScaler()),
                                    ("linear_svc", LinearSVC(C=1,
                                                             loss="hinge"))])

In [96]:
# train the model
svm_clf2_with_scaling.fit(X2_train_area_smooth, y2_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('linear_svc', LinearSVC(C=1, loss='hinge'))])

In [97]:
y_test_scaling_pred = svm_clf_with_scaling.predict(X_test_area_smooth)
y_train_scaling_pred = svm_clf_with_scaling.predict(X_train_area_smooth)

In [98]:
test2_scaling_acc = accuracy_score(y2_test, y_test_scaling_pred)
train2_scaling_acc = accuracy_score(y2_train, y_train_scaling_pred)
print('test_scaling_acc', test2_scaling_acc)
print('train_scaling_acc', train2_scaling_acc)

test_scaling_acc 0.9210526315789473
train_scaling_acc 0.9032967032967033


### save data in the pickle