<a href="https://colab.research.google.com/github/al025/Machine-Learning-Study-Notes/blob/master/c1_chap4_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preprocessing
- encode categorical features numerically
- replace missing values with a reasonable estimated value
- scale features so that no single feature has undue influence, which can improve 
model performance
- use a pipeline to chain preprocessing and training together

In [0]:
# Colab-only: mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Path to the diabetes CSV under the Google Drive mount point (the file itself is read in the next cell)
filename = '/content/drive/My Drive/MachineLearning_DatacampCareerTrack/c1_sklearn_supervised/diabetes.csv'

In [3]:
import pandas as pd
import numpy as np

# Read the CSV at `filename` into a DataFrame.
df = pd.read_csv(filename)
# Last expression of the cell, so the per-column summary statistics are displayed as its output.
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [0]:
# In these five columns, 0 represents a missing value, so recode it as NaN.
# The result is assigned back to `df` instead of calling
# `df.<column>.replace(..., inplace=True)`: that form mutates an intermediate
# Series, which emits a FutureWarning in recent pandas and leaves `df` unchanged
# once Copy-on-Write is enabled.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)

# Feature matrix (every column except the target) and target vector, as NumPy arrays.
X, y = df.drop('Outcome', axis=1).values, df.Outcome.values

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Hold out 40% of the samples for the final evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Hyperparameter grid. The 'SVM__' prefix routes each parameter to the pipeline step named 'SVM'.
parameters = {'SVM__C': [1, 10, 100],
              'SVM__gamma': [0.1, 0.01]}

# Impute NaNs with the column mean, standardize the features, then fit the SVM.
# SimpleImputer's parameters are passed by keyword: current scikit-learn releases
# reject positional constructor arguments for estimators.
steps = [('Imputation', SimpleImputer(missing_values=np.nan, strategy='mean')),
         ('scaler', StandardScaler()),
         ('SVM', SVC())]
pipeline = Pipeline(steps)

# 5-fold cross-validated grid search over the whole pipeline, so imputation and
# scaling are re-fitted inside each training fold.
cv = GridSearchCV(pipeline, parameters, cv=5)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

print("Accuracy: {}".format(cv.score(X_test, y_test)))
print('Classification Report:\n {}'.format(classification_report(y_test, y_pred)))
print('Tuned parameters: {}'.format(cv.best_params_))

Accuracy: 0.7662337662337663
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.85      0.83       206
           1       0.66      0.60      0.63       102

    accuracy                           0.77       308
   macro avg       0.74      0.72      0.73       308
weighted avg       0.76      0.77      0.76       308

Tuned parameters: {'SVM__C': 10, 'SVM__gamma': 0.01}


In [10]:
# Baseline for comparison: the same imputation and grid search, but without the
# StandardScaler step. Reuses `parameters` and the train/test split defined earlier.
# SimpleImputer's parameters are passed by keyword: current scikit-learn releases
# reject positional constructor arguments for estimators.
steps = [('Imputation', SimpleImputer(missing_values=np.nan, strategy='mean')),
         ('SVM', SVC())]
pipeline = Pipeline(steps)
cv = GridSearchCV(pipeline, parameters, cv=5)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

print("Accuracy: {}".format(cv.score(X_test, y_test)))
print('Classification Report:\n {}'.format(classification_report(y_test, y_pred)))
print('Tuned parameters: {}'.format(cv.best_params_))

Accuracy: 0.6883116883116883
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.95      0.80       206
           1       0.61      0.17      0.26       102

    accuracy                           0.69       308
   macro avg       0.65      0.56      0.53       308
weighted avg       0.67      0.69      0.62       308

Tuned parameters: {'SVM__C': 1, 'SVM__gamma': 0.01}
