# Model Selection & Evaluation

### Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Apply seaborn's default theme to all subsequent plots.
sns.set()

==========

# Model Selection

In [None]:
from sklearn.datasets import load_iris

In [None]:
# Load the iris dataset object that ships with scikit-learn.
iris = load_iris()

In [None]:
# List the fields available on the loaded dataset object.
print(iris.keys())

In [None]:
# Print the dataset's built-in description text.
print(iris.DESCR)

In [None]:
# Feature matrix taken straight from the loaded dataset; displayed for inspection.
X = iris.data
X

In [None]:
# Target labels from the same dataset; displayed for inspection.
y = iris.target
y

In [None]:
# Wrap the feature matrix in a DataFrame, using the dataset's feature names as column labels.
iris_df = pd.DataFrame(X, columns=iris.feature_names)
iris_df

In [None]:
# Append the labels as an extra 'target' column so features and labels sit in one table.
iris_df['target'] = iris.target

In [None]:
# Show the frame again, now including the 'target' column.
iris_df

In [None]:
# Rebuild the feature array from the frame: every column except 'target'.
X = iris_df.drop('target', axis=1).values
X

In [None]:
# Rebuild the label array from the frame's 'target' column.
y = iris_df['target'].values
y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Model 1: support vector classifier (cubic polynomial kernel, C=10)
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC

clf = SVC(C=10, kernel='poly', degree=3)
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Report: hold-out accuracy, a blank line, then the confusion matrix.
print(accuracy_score(y_test, y_pred))
print()
print(confusion_matrix(y_test, y_pred))

In [None]:
# Model 2: k-nearest neighbours with the library's default settings
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# A single print call emits the accuracy, an empty line, and the confusion matrix.
print(accuracy_score(y_test, y_pred), confusion_matrix(y_test, y_pred), sep='\n\n')

In [None]:
# Model 3: logistic regression, with a generous iteration cap for the solver
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# fit() returns the fitted estimator, so clf is bound to the trained model.
clf = LogisticRegression(max_iter=10000).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Accuracy followed by a blank line, then the confusion matrix.
print(accuracy_score(y_test, y_pred), end='\n\n')
print(confusion_matrix(y_test, y_pred))

In [None]:
# Random Forests -> Model # 4
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# random_state is pinned (matching the seed used for the train/test split) so the
# forest's bootstrap samples and feature draws, and therefore the scores printed
# below, are the same on every run.
# With at most 5 leaves a tree can never be deeper than 4 levels, so
# max_leaf_nodes is the binding size limit; max_depth=10 is kept as written.
clf = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    max_leaf_nodes=5,
    random_state=42,
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Evaluation: hold-out accuracy, a blank line, then the confusion matrix.
print(accuracy_score(y_test, y_pred))
print()
print(confusion_matrix(y_test, y_pred))

==========

# Model Evaluation & Improvement

## Splitter (Cross-Validation)

##### Importing Dataset

In [None]:
from sklearn.datasets import load_breast_cancer

In [None]:
# Load the breast-cancer dataset object used throughout the evaluation sections below.
cancer = load_breast_cancer()

##### Selecting Model

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with default hyperparameters; the validation sections below reuse it.
clf = SVC()

### Cross-Validation with `train_test_split`

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
# Features and labels of the breast-cancer data, unpacked in one statement.
X, y = cancer.data, cancer.target

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Simpler two-way alternative (70% train / 30% test), kept for reference only:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Step 1: keep 80% of the rows for training and set the remaining 20% aside.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Step 2: halve the set-aside rows into validation and test sets,
# which gives roughly an 80/10/10 train/validation/test split overall.
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
# Shape of the full feature matrix, as the baseline for the split sizes below.
X.shape

In [None]:
# Training portion.
X_train.shape

In [None]:
# Validation portion.
X_valid.shape

In [None]:
# Test portion.
X_test.shape

In [None]:
# Training (the "class" stage of the class / midterm / final analogy):
# the model is fitted on the training split only.
clf.fit(X_train, y_train)

##### Evaluating Model

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Validation (the "midterm"): predict labels for the validation split.
y_pred_valid = clf.predict(X_valid)
y_pred_valid

In [None]:
# True validation labels, shown for side-by-side comparison with the predictions above.
y_valid

In [None]:
# Fraction of validation rows whose predicted label matches the true label.
print(accuracy_score(y_valid, y_pred_valid))

In [None]:
# Testing (the "final"): predict labels for the test split.
y_pred_test = clf.predict(X_test)
y_pred_test

In [None]:
# True test labels, shown for side-by-side comparison with the predictions above.
y_test

In [None]:
# Fraction of test rows whose predicted label matches the true label.
print(accuracy_score(y_test, y_pred_test))

### Cross-Validation with `cross_val_score`

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Fresh split: 90% of the rows for cross-validation, 10% held out.
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.1, random_state=42)

In [None]:
# Cross-validate on the training portion only, using the library's default number of folds.
# The result holds one score per fold.
scores = cross_val_score(clf, X_train, y_train)
scores

In [None]:
# Same estimator, now cross-validated on the whole dataset with 3 folds.
# This overwrites the previous `scores`.
scores = cross_val_score(clf, cancer.data, cancer.target, cv=3)
scores

In [None]:
# Average of the per-fold scores from the 3-fold run above.
scores.mean()

In [None]:
# Optional final check on the held-out rows. Note that score() takes (X, y)
# and predicts internally, and that clf must first be refitted on this
# section's X_train:
# clf.fit(X_train, y_train).score(X_test, y_test)

### Cross-Validation with `cross_validate`

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html

In [None]:
from sklearn.model_selection import cross_validate

Scoring Options: https://scikit-learn.org/stable/modules/model_evaluation.html

In [None]:
# 10-fold cross-validation that records two metrics per fold.
scores = cross_validate(clf, X, y, scoring=['accuracy','average_precision'],cv=10)
scores

In [None]:
# One row per fold. With a list of metrics the score columns are named
# test_<metric> (there is no single 'test_score' column).
pd.DataFrame(scores)

In [None]:
# Mean accuracy across the 10 folds.
pd.DataFrame(scores)['test_accuracy'].mean()

In [None]:
# Mean average-precision across the 10 folds.
pd.DataFrame(scores)['test_average_precision'].mean()

==========

## Hyperparameter Tuning (Grid Search)

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate values for the search: both SVC hyperparameters sweep the same
# powers of ten, from 0.001 up to 100. Each key gets its own list.
param_grid = {
    name: [0.001, 0.01, 0.1, 1, 10, 100]
    for name in ('C', 'gamma')
}
param_grid

In [None]:
from sklearn.svm import SVC

In [None]:
# Exhaustive search over every C/gamma combination in param_grid,
# each candidate evaluated with 5-fold cross-validation.
grid_search = GridSearchCV(SVC(), param_grid, cv=5)

In [None]:
# Hold out a test set (library-default size) that the search never sees.
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=42)

In [None]:
# Run the search on the training portion only.
grid_search.fit(X_train, y_train)

In [None]:
# Score the search's selected model on the held-out test set.
grid_search.score(X_test, y_test)

In [None]:
# Hyperparameter combination that achieved the best cross-validated score.
grid_search.best_params_

In [None]:
# The best mean cross-validated score itself.
grid_search.best_score_

In [None]:
# The estimator configured with the winning parameters.
grid_search.best_estimator_

In [None]:
# Full per-candidate results table (one row per parameter combination).
pd.DataFrame(grid_search.cv_results_)

==========

# Algorithm Chains & Pipelines

### Traditional Pipeline

In [None]:
# Loading data: reload the breast-cancer dataset so this section is self-contained.
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [None]:
# Splitting data: library-default train/test proportions, fixed seed for repeatability.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

In [None]:
# Scaling step: learn the scaling parameters from the training rows only,
# then rescale those same rows.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
# Building model: a support vector classifier with default hyperparameters.
from sklearn.svm import SVC
svm = SVC()

In [None]:
# Learn an SVM on the scaled training data.
svm.fit(X_train_scaled, y_train)

In [None]:
# Scale the test data with the scaler that was fitted on the training data
# (the scaler is not refitted here).
X_test_scaled = scaler.transform(X_test)

In [None]:
# Score the SVM on the scaled test data.
svm.score(X_test_scaled, y_test)

### Building Pipeline

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Chain the scaler and the classifier so they are fitted and applied as one estimator.
pipe = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        ("svm", SVC()),
    ]
)

In [None]:
# Fit the whole chain on the raw training data; the scaling step runs inside the pipeline.
pipe.fit(X_train, y_train)

In [None]:
# Score on the raw test data; the pipeline applies its fitted scaler before predicting.
pipe.score(X_test, y_test)

### Using Pipelines in Grid Searches

In [None]:
# Same C/gamma sweep as the earlier grid. The "svm__" prefix routes each value
# to the pipeline step named "svm". Each key gets its own list.
param_grid = {
    key: [0.001, 0.01, 0.1, 1, 10, 100]
    for key in ('svm__C', 'svm__gamma')
}

In [None]:
# Grid search over the whole pipeline, so the scaler is part of what gets
# fitted for each cross-validation candidate; 5 folds per candidate.
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)

In [None]:
# Run the search on the raw (unscaled) training data.
grid.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score found by the search.
grid.best_score_

In [None]:
# Score of the selected pipeline on the held-out test set.
grid.score(X_test, y_test)

In [None]:
# Winning hyperparameters, keyed by their "svm__"-prefixed names.
grid.best_params_

==========

# THANK YOU!