<a href="https://colab.research.google.com/github/bademirci/machine_learning/blob/main/classifiers_and_cross_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Batuhan Demirci


Use cross-validation to select the best method and the best set of parameters.
Try Regularized Logistic Regression (both L1 and L2 penalties and different C
values), KNN classifier (different numbers of neighbors you believe to be
reasonable), random forests (different numbers of trees and different numbers of
features to select at each split of your selection) and gradient boosting classifier
(different numbers of trees and learning rates of your selection).

In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import scale 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Read the homework data set from the working directory and show it as the
# cell's output.
df = pd.read_csv(filepath_or_buffer='HW3DataSet.csv')
df

In [None]:
# Column "y" is the target; every remaining column is used as a feature.
y = df["y"]
x = df.drop(columns="y")

## (a) Split the data set into a training set and a test set (80% Training, 20% Test)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. The split is stratified on y and
# uses a fixed seed so that it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)


## (b) Standardize your features.

In [None]:
from sklearn import preprocessing

# Learn the scaling parameters from the training split only, then apply that
# same fitted scaler to both the training and the test features.
scaler = preprocessing.StandardScaler().fit(x_train)
x_trainstandard, x_teststandard = (scaler.transform(part) for part in (x_train, x_test))

## (b) Use cross-validation to select the best method and the best set of parameters.

### Regularized Logistic Regression

In [None]:
#Finding the best set of parameters.

# The liblinear solver is used so that both the L1 and the L2 penalty can be
# searched with the same estimator.
regressor=LogisticRegression(solver='liblinear')
C_param_range = [0.001,0.01,0.1,1,10,100,1000,10000]
penalties=['l1','l2']
# create grid: 8 values of C x 2 penalties = 16 candidates
params = {
 'Regressor__C': C_param_range,
 'Regressor__penalty': penalties,
 }
# The scaler is a pipeline step, so it is re-fitted on the training part of
# every CV fold.
pipe = Pipeline([('scaler',preprocessing.StandardScaler()),('Regressor', regressor)])

clf_grid = GridSearchCV(estimator = pipe, param_grid = params,
                                cv = 5, verbose=2, scoring='accuracy',n_jobs = -1)
# Fit the model on the raw training features: the pipeline standardizes them
# itself, so passing the already-standardized matrix would scale the data twice
# and differ from how the other pipelines in this notebook are fitted.
clf_grid.fit(x_train, y_train)
# print results
print(clf_grid.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
{'Regressor__C': 1, 'Regressor__penalty': 'l1'}


In [None]:
#Finding cross v. scores.

# Score the logistic regression with C=1 and the L1 penalty using shuffled,
# seeded 10-fold cross-validation on the training split. Scaling happens inside
# the pipeline, so the raw training features are passed in.
logreg = LogisticRegression(solver='liblinear', penalty='l1', C=1, max_iter=1000)
pipe = make_pipeline(preprocessing.StandardScaler(), logreg)
scoreCV = cross_val_score(
    estimator=pipe,
    X=x_train,
    y=y_train,
    cv=KFold(n_splits=10, shuffle=True, random_state=1),
    scoring='accuracy',
)
# Mean accuracy over the 10 folds is the cell's displayed result.
scoreCV.mean()

0.8341666666666667

### KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [None]:
# Shuffled, seeded 10-fold splitter. The same splitter object is passed to
# every candidate so that all neighbour counts are scored on identical folds.
cv = KFold(n_splits=10, random_state=1, shuffle=True)

# Collect [mean validation accuracy, n_neighbors] for k = 1..25.
CVAccuracy=[]
for j in range(1,26):
    knn = KNeighborsClassifier(n_neighbors = j)
    # Pass the splitter defined above (previously a bare cv=10 was used and
    # the seeded KFold was never applied).
    scores = cross_val_score(knn, x_trainstandard, y_train, cv=cv, scoring='accuracy')
    CVAccuracy.append([scores.mean(),j])

# Stored under its own name so the loaded data set `df` is not overwritten.
knn_results = pd.DataFrame(CVAccuracy, columns=['Validation Accuracy','NeighbourSize'])

print(knn_results.sort_values(by="Validation Accuracy",ascending=False))

# Row label of the highest mean accuracy (idxmax returns the first one on ties).
max_index = knn_results["Validation Accuracy"].idxmax()
print("-------------------------------------------")
print("Best neighboursize is",knn_results.loc[max_index,'NeighbourSize'],"with",knn_results.loc[max_index,'Validation Accuracy'],"score.")

    Validation Accuracy  NeighbourSize
10             0.842833             11
6              0.838833              7
8              0.838667              9
7              0.834833              8
9              0.834500             10
12             0.834333             13
5              0.826500              6
11             0.826333             12
13             0.826167             14
23             0.822000             24
20             0.822000             21
18             0.822000             19
14             0.822000             15
15             0.818000             16
19             0.818000             20
24             0.818000             25
2              0.818000              3
21             0.817667             22
16             0.814000             17
17             0.814000             18
22             0.813833             23
4              0.805833              5
3              0.805667              4
1              0.801500              2
0              0.764500  

### Random Forests

In [None]:
#Finding the best set of parameters.

# Fixed seed so the forests built for each candidate are repeatable.
clf=RandomForestClassifier(random_state=0)

# Candidate numbers of trees and numbers of features tried at each split
# (5 x 10 = 50 candidates).
n_estimators = [100,200,300,400,500]

max_features = [3,4,5,6,7,8,9,10,11,12]
params = {
 'Classifier__n_estimators': n_estimators,
 'Classifier__max_features': max_features,
 }
#Pipeline approach.
pipe = Pipeline([('scaler',preprocessing.StandardScaler()),('Classifier', clf)])

# Candidates are ranked by accuracy, the same metric used by the other grid
# searches in this notebook; a regression metric (negative MSE) was used here
# before. The previously assigned but unused KFold object has been removed;
# cv=5 is what the search actually uses.
clf_grid = GridSearchCV(estimator = pipe, param_grid = params,
                                cv = 5, verbose=2, scoring='accuracy',n_jobs = -1)

clf_grid.fit(x_train, y_train)

print(clf_grid.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
{'Classifier__max_features': 3, 'Classifier__n_estimators': 500}


In [None]:
#Finding cv scores.

# x_standard (the whole data set passed through the training-fitted scaler) is
# still defined here for any later cell that reads it. It is deliberately NOT
# used for the folds below because it also contains the held-out test rows.
x_standard = scaler.transform(x)

print(clf_grid.best_params_)
cv = KFold(n_splits=5, random_state=1, shuffle=True)
# Despite the name, this list collects the accuracy of each validation fold.
CVErrors=[]
for train_index, validation_index in cv.split(x_trainstandard):
    # The fold indices are positions within the training split, so they must
    # index the training matrix and training labels (the full data set was
    # indexed here before, which mixed in rows outside the training split).
    X_train, X_val = x_trainstandard[train_index], x_trainstandard[validation_index]
    Y_train, y_val = y_train.iloc[train_index], y_train.iloc[validation_index]
    # Hyper-parameters are hard-coded to the values reported by the grid search.
    clf=RandomForestClassifier(random_state=0,n_estimators=500,max_features=3)
    clf.fit(X_train,Y_train)
    y_pred=clf.predict(X_val)
    CVErrors.append(accuracy_score(y_val, y_pred))
print(np.mean(CVErrors), " is the Valudation score.")

{'Classifier__max_features': 3, 'Classifier__n_estimators': 500}
0.8353551912568307  is the Valudation score.


### Gradient boosting classifier

In [None]:
#Finding best parameters.

# Boosted decision stumps (max_depth=1); the grid covers 4 ensemble sizes and
# 4 learning rates, i.e. 16 candidates.
model = GradientBoostingClassifier(max_depth=1)
n_estimators = [100, 500, 1000, 5000]
learning_rate = [1, 0.1, 0.01, 0.001]
params = {
    'Classifier__n_estimators': n_estimators,
    'Classifier__learning_rate': learning_rate,
}

#Pipeline approach: scaling is a pipeline step, so it is re-fitted per CV fold.
pipe = Pipeline(steps=[
    ('scaler', preprocessing.StandardScaler()),
    ('Classifier', model),
])

# 5-fold search ranked by accuracy, using all available cores.
boost_grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
).fit(x_train, y_train)
print(boost_grid.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
{'Classifier__learning_rate': 0.01, 'Classifier__n_estimators': 500}


In [None]:
#Finding cv scores.

# Cross-validate on the standardized TRAINING split only. The full data set
# was used here before, which let the held-out test rows influence model
# selection and also relied on a variable created in an earlier cell.
cv = KFold(n_splits=5, random_state=1, shuffle=True)
# Despite the name, this list collects the accuracy of each validation fold.
CVErrors=[]
for train_index, validation_index in cv.split(x_trainstandard):
    X_train, X_val = x_trainstandard[train_index], x_trainstandard[validation_index]
    Y_train, y_val = y_train.iloc[train_index], y_train.iloc[validation_index]
    # Hyper-parameters are hard-coded to the values reported by the grid search.
    model = GradientBoostingClassifier(max_depth=1, n_estimators=500, learning_rate=0.01)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_val)
    # accuracy_score takes (y_true, y_pred) in that order.
    CV = accuracy_score(y_val, y_pred)
    CVErrors.append(CV)
print("Gradient Boosting CV Score: ", np.mean(CVErrors))

Gradient Boosting CV Score:  0.8453005464480874


## (c) and (d) Once you decide on the final method and the set of best parameters, refit your model on the standardized training set and evaluate the performance (accuracy) on the standardized test set. Provide the confusion matrix.

In [None]:
# Final method: gradient boosting with learning_rate=0.01 and n_estimators=500.
# It is refitted on the whole standardized training split and then used to
# predict the standardized test split.
final_method = GradientBoostingClassifier(
    learning_rate=0.01,
    n_estimators=500,
    max_depth=1,
).fit(x_trainstandard, y_train)
y_pred = final_method.predict(x_teststandard)

In [None]:
# Compare the test-set predictions with the true test labels.
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Heading and matrix go on separate lines, followed by the accuracy.
print("Confusion Matrix: ", cnf_matrix, sep="\n")
print('Accuracy:', test_accuracy)

Confusion Matrix: 
[[24  4]
 [ 6 27]]
Accuracy: 0.8360655737704918


In [None]:
# Display the predicted labels for the test set as the cell's output.
y_pred

array([1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0])