In [1]:
import matplotlib.pyplot as plt
from sklearn import datasets, svm
import pandas as pd
import numpy as np

In [2]:
cancer=datasets.load_breast_cancer()

In [3]:
cancer.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [4]:
cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

In [6]:
cancer.data[:1]

array([[1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
        3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
        8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
        3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
        1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01]])

In [7]:
cancer.target[:5]

array([0, 0, 0, 0, 0])

In [8]:
x=cancer.data
y=cancer.target

In [9]:
y[y==0].size

212

In [10]:
y[y==1].size

357

In [12]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.3)

In [13]:
svm_=svm.SVC(kernel='linear')
svm_.fit(x_train, y_train)

SVC(kernel='linear')

In [15]:
y_pred=svm_.predict(x_test)

In [16]:
from sklearn import metrics
metrics.accuracy_score(y_pred, y_test)

0.9590643274853801

In [17]:
print(metrics.confusion_matrix(y_pred, y_test))

[[ 57   2]
 [  5 107]]


In [18]:
print(metrics.classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.92      0.97      0.94        59
           1       0.98      0.96      0.97       112

    accuracy                           0.96       171
   macro avg       0.95      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171



In [19]:
svm_.score(x_train, y_train)

0.9698492462311558

In [20]:
svm_.score(x_test, y_test)

0.9590643274853801

In [21]:
#  good model overall
# can we do better?

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [27]:
from sklearn.model_selection import cross_val_score

In [30]:
models=[LogisticRegression(solver='liblinear'), RandomForestClassifier(n_estimators=200), SVC(kernel='linear'), GaussianNB()]
CV=5
#cv_df=pd.DataFrame(index=range(CV*len(models)))
entries=[]
for m in models:
    name=m.__class__.__name__
    acc=cross_val_score(m,x,y, scoring='accuracy', cv=CV)
    print(acc)
    entries.append([name,acc.mean()])

[0.92982456 0.93859649 0.97368421 0.94736842 0.96460177]
[0.92982456 0.93859649 0.99122807 0.97368421 0.97345133]
[0.94736842 0.92982456 0.97368421 0.92105263 0.95575221]
[0.92105263 0.92105263 0.94736842 0.94736842 0.95575221]


In [32]:
cv_df=pd.DataFrame(entries, columns=['names','acc'])
cv_df

Unnamed: 0,names,acc
0,LogisticRegression,0.950815
1,RandomForestClassifier,0.961357
2,SVC,0.945536
3,GaussianNB,0.938519


In [33]:
#select Logisticregression

In [34]:
LR=LogisticRegression(solver='liblinear')
LR.fit(x_train, y_train)
y_pred=LR.predict(x_test)

In [36]:
metrics.accuracy_score(y_pred, y_test)

0.9473684210526315