# Exercise - Grid Search (Classification) 
- Answer the questions

In [1]:
# importing libraris
import pandas as pd 
import numpy as np 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report


In [2]:
# loading dataset
df = pd.read_csv('data/breast_cancer.csv')

In [3]:
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [4]:
df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')

In [5]:
df = df.drop(['Unnamed: 32','id'], axis= 1)

In [6]:
label_encoder = LabelEncoder()

In [7]:
# encoding and split prediction and predictor
X = df.drop(['diagnosis'], axis= 1)
y = df['diagnosis']
y_true = label_encoder.fit_transform(y)

In [8]:
y_true[:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

## Baseline Accuracy 

In [9]:
## Baseline accuracy for breast cancer data
total_count = len(y_true)
B_count = np.count_nonzero(y_true == 0)
print('Number of Benign Tumor', B_count)
print( 'Baseline Accuracy',B_count/total_count)

Number of Benign Tumor 357
Baseline Accuracy 0.6274165202108963


### Standardize the data 

In [16]:
#import library 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split  



In [17]:
scaler = StandardScaler()


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=10)


X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)


### Run Grid Search on KNeighborsClassfier
- Use at least 5 different n_neighbours 
- use weights 'distance' and 'uniform'
- use metrics 'manhattan', 'euclidean', 'minkowski'
- use f1 score as the scoring.
- also try using score='roc_auc' 

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
classifier = KNeighborsClassifier(n_neighbors =5)
classifier.fit(X_train_s, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [21]:
pred_train = classifier.predict(X_train_s)
acc = accuracy_score(y_train, pred_train)
print('accuracy', acc)

accuracy 0.9673366834170855


In [24]:
param_grid = {
    'n_neighbors' : [1, 2, 3, 4,5,6,7,8,10],
    'weights' : [ 'distance','uniform'],
    'metric' : ['manhattan', 'euclidean', 'minkowski','chebyshev']   
}

In [25]:
knn_1 = KNeighborsClassifier()

gs = GridSearchCV(knn_1, 
                  param_grid, 
                  cv=5,
                  scoring='accuracy'
                  )

In [26]:
gs.fit(X_train_s, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'metric': ['manhattan', 'euclidean', 'minkowski',
                                    'chebyshev'],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 10],
                         'weights': ['distance', 'uniform']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [27]:
gs.best_estimator_


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='distance')

In [28]:
gs.best_params_

{'metric': 'manhattan', 'n_neighbors': 4, 'weights': 'distance'}

In [29]:
gs.score(X_test_s, y_test)

0.9883040935672515

### Run Grid Search on Logistic Regression
- test "penalty": l1 and l2 
- test 5 different values of C
- use f1 score as the scoring.
- also try using score='roc_auc' 

In [31]:
from sklearn.linear_model import LogisticRegression

logreg=LogisticRegression()

In [32]:
grid_log={
    "C":np.logspace(-3,3,7), 
    "penalty":["l1","l2"]
    }


In [34]:
logreg_cv=GridSearchCV(logreg,grid_log,cv=5)
logreg_cv.fit(X_train_s,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [36]:
logreg_cv.best_estimator_


LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
logreg_cv.best_params_

{'C': 0.1, 'penalty': 'l2'}

In [38]:
logreg_cv.score(X_test_s, y_test)

0.9824561403508771

## What model performs best ? What are the parameters ?
