<a href="https://colab.research.google.com/github/a-Imantha/simple-ml/blob/main/SVM%20with%20Hyperparameter%20tuning%20for%20Breast%20Cancer%20Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SVM with Hyperparameter Tuning for the Wisconsin Breast Cancer Dataset (WBCD)



## Importing the initial libraries and doing configurations

In [None]:
import sys

import numpy as np
import pandas as pd

# Raise NumPy's summarisation threshold to the largest possible value so that
# printed arrays are always shown in full instead of being abbreviated.
np.set_printoptions(threshold=sys.maxsize)

## Importing the dataset
Dataset Download link: [click here](https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data)

Dataset Download Page: [click here](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Original%29)

In [None]:
# The UCI file is a bare CSV with no header line. Without header=None pandas
# would consume the first patient record as column names and silently drop
# one sample from every experiment below.
dataset_original = pd.read_csv('/content/breast-cancer-wisconsin.data', header=None)

## Trying the Suggested Approaches

### Approach 01: By Removing the rows with missing values

In [None]:
# Missing readings are encoded as the literal string '?'. Turn them into NaN
# (np.nan is the generic missing-value marker; pd.NaT is meant for datetimes)
# and discard every row that contains one.
dataset = dataset_original.replace('?', np.nan).dropna()

# Features: every column except the first and the last, cast to int.
# NOTE(review): the first column is presumably the sample id - confirm against
# the dataset description.
X = dataset.iloc[:, 1:-1].values.astype(int)
# Target: the last column (the class labels, 2 and 4, used as class_weight keys later).
y = dataset.iloc[:, -1].values

#### Splitting the dataset into the Training set(65%) and Test set(35%)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 35% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=0,
)

#### Evaluating the Best Hyperparameters for SVM Model through a Grid Search

In [None]:
# First-pass grid: compare kernel families over a coarse range of C.
parameters = dict(
    kernel=('linear', 'rbf', 'poly'),
    C=[0.1, 1, 10],
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Base estimator: errors on class 4 are weighted three times as heavily as class 2.
svc = SVC(class_weight={2: 1, 4: 3})

# Exhaustive search over `parameters`, scored by accuracy with 10-fold CV.
clf = GridSearchCV(estimator=svc, param_grid=parameters, scoring='accuracy', cv=10)
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight={2: 1, 4: 3}, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10],
                         'kernel': ('linear', 'rbf', 'poly')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
# Report the best mean CV score, the winning parameter combination and the
# refitted estimator, one per line.
print(clf.best_score_, clf.best_params_, clf.best_estimator_, sep='\n')

0.9684343434343434
{'C': 1, 'kernel': 'rbf'}
SVC(C=1, break_ties=False, cache_size=200, class_weight={2: 1, 4: 3}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


Since the previous grid search selected the rbf kernel, we continue with rbf and additionally tune `C` and `gamma`.

In [None]:
# Second-pass grid: kernel fixed to rbf, wider range of C and a sweep over gamma.
parameters = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001, 'scale', 'auto'],
    kernel=['rbf'],
)

In [None]:
# NOTE(review): class 4 is weighted 2 here, whereas the kernel search above
# used a weight of 3 - confirm that this change is intentional.
svc = SVC(class_weight={2: 1, 4: 2})

# Exhaustive search over `parameters`, scored by accuracy with 10-fold CV.
clf = GridSearchCV(estimator=svc, param_grid=parameters, scoring='accuracy', cv=10)
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight={2: 1, 4: 2}, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 'scale',
                                   'auto'],
                         'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
# Report the best mean CV score, the winning parameter combination and the
# refitted estimator, one per line.
print(clf.best_score_, clf.best_params_, clf.best_estimator_, sep='\n')

0.9729292929292928
{'C': 0.1, 'gamma': 0.0001, 'kernel': 'rbf'}
SVC(C=0.1, break_ties=False, cache_size=200, class_weight={2: 1, 4: 2},
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.0001,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)


#### Training the Model with the best params 

In [None]:

# Build a fresh, unfitted SVC from the configuration the grid search selected
# rather than pasting the printed repr: the model then stays in sync with the
# search whenever the data, the grid or the class weights change.
classifier = SVC(**clf.best_estimator_.get_params())
# Train on the whole training split.
classifier.fit(X_train, y_train)

SVC(C=0.1, break_ties=False, cache_size=200, class_weight={2: 1, 4: 2},
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.0001,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

#### Printing the Prediction against the trained model for the Test set for comparison

In [None]:
# Predict class labels for the held-out test rows.
y_pred = classifier.predict(X_test)
# Debug aid (disabled): prints predicted and true labels side by side as two columns.
#print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

#### Confusion Matrix on the test results

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of test rows classified correctly (shown as the cell's output).
accuracy_score(y_true=y_test, y_pred=y_pred)

[[153   2]
 [  2  82]]


0.9832635983263598

In [None]:
# Labelled confusion table with row/column totals.
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

Predicted,2,4,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,153,2,155
4,2,82,84
All,155,84,239


### Approach 02: By replacing the missing values with 1

In [None]:
from sklearn.impute import SimpleImputer

# Approach 02 keeps every row of the original frame.
dataset = dataset_original

# Target: last column. Features: everything between the first and last column.
y = dataset.iloc[:, -1].values
X = dataset.iloc[:, 1:-1].values

# Replace each '?' placeholder with the constant 1, then cast the features to int.
imputer = SimpleImputer(missing_values='?', strategy='constant', fill_value=1)
X = imputer.fit_transform(X).astype(int)

#### Splitting the dataset into the Training set(65%) and Test set(35%)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 35% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=0,
)

#### Evaluating the best Hyperparameters for SVM Model through a Grid Search

In [None]:
# First-pass grid: compare kernel families over a coarse range of C.
parameters = dict(
    kernel=('linear', 'rbf', 'poly'),
    C=[0.1, 1, 10],
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Base estimator: errors on class 4 are weighted three times as heavily as class 2.
svc = SVC(class_weight={2: 1, 4: 3})

# Exhaustive search over `parameters`, scored by accuracy with 10-fold CV.
clf = GridSearchCV(estimator=svc, param_grid=parameters, scoring='accuracy', cv=10)
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight={2: 1, 4: 3}, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10],
                         'kernel': ('linear', 'rbf', 'poly')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
# Report the best mean CV score, the winning parameter combination and the
# refitted estimator, one per line.
print(clf.best_score_, clf.best_params_, clf.best_estimator_, sep='\n')

0.971304347826087
{'C': 0.1, 'kernel': 'poly'}
SVC(C=0.1, break_ties=False, cache_size=200, class_weight={2: 1, 4: 3},
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma='scale',
    kernel='poly', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)


Since the previous grid search selected the poly kernel, we continue with poly and additionally tune `C` and `gamma`.

In [None]:
# Second-pass grid: kernel fixed to poly, wider range of C and a sweep over gamma.
parameters = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001, 'scale', 'auto'],
    kernel=['poly'],
)

In [None]:
# Base estimator: errors on class 4 are weighted three times as heavily as class 2.
svc = SVC(class_weight={2: 1, 4: 3})

# Exhaustive search over `parameters`, scored by accuracy with 10-fold CV.
clf = GridSearchCV(estimator=svc, param_grid=parameters, scoring='accuracy', cv=10)
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight={2: 1, 4: 3}, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 'scale',
                                   'auto'],
                         'kernel': ['poly']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
# Report the best mean CV score, the winning parameter combination and the
# refitted estimator, one per line.
print(clf.best_score_, clf.best_params_, clf.best_estimator_, sep='\n')

0.9735265700483092
{'C': 0.1, 'gamma': 0.01, 'kernel': 'poly'}
SVC(C=0.1, break_ties=False, cache_size=200, class_weight={2: 1, 4: 3},
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.01,
    kernel='poly', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)


#### Training the model with best_params

In [None]:
# Build a fresh, unfitted SVC from the configuration the grid search selected
# rather than pasting the printed repr: the model then stays in sync with the
# search whenever the data, the grid or the class weights change.
classifier = SVC(**clf.best_estimator_.get_params())

# Train on the whole training split.
classifier.fit(X_train, y_train)

SVC(C=0.1, break_ties=False, cache_size=200, class_weight={2: 1, 4: 3},
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.01,
    kernel='poly', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

#### Printing the Prediction against the trained model for the Test set for comparison

In [None]:
# Predict class labels for the held-out test rows.
y_pred = classifier.predict(X_test)
# Debug aid (disabled): prints predicted and true labels side by side as two columns.
#print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

#### Confusion Matrix on the test results

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of test rows classified correctly (shown as the cell's output).
accuracy_score(y_true=y_test, y_pred=y_pred)

[[150   6]
 [  2  87]]


0.9673469387755103

In [None]:
# Labelled confusion table with row/column totals.
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

Predicted,2,4,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,150,6,156
4,2,87,89
All,152,93,245
