<a href="https://colab.research.google.com/github/cagBRT/Machine-Learning/blob/master/Pipelines_for_Ensembles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using Pipelines for Ensemble Learning

In [None]:
# Clone the entire repo.
# The repository is cloned into ./cloned-repo and the kernel's working
# directory is then moved into it -- presumably so that relative paths used
# later (e.g. 'train.csv') resolve against the repo contents; confirm the
# data file actually lives at the repo root.
# NOTE(review): both commands are relative to the current directory, so
# running this cell a second time in the same session starts from inside
# cloned-repo rather than from the original location.
!git clone -s https://github.com/cagBRT/Machine-Learning.git cloned-repo
%cd cloned-repo

# Import the libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import VotingClassifier

from sklearn.compose import ColumnTransformer

# Get the data

In [None]:
# Read the passenger data and keep only the label plus the five features
# used in this notebook.
columns_used = ['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
df = pd.read_csv('train.csv')[columns_used]

# The first kept column is the target; everything after it is model input.
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

# 70/30 split, stratified on the label, with a fixed seed.
split = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)
X_train, X_test, y_train, y_test = split

# Create custom transformers

In [None]:
# custom transformer to select specific columns
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the requested columns of its input.

    Parameters
    ----------
    feature_names : column label or list of column labels
        Used as the key in ``X[feature_names]`` during ``transform``.
    """

    def __init__(self, feature_names):
        # Stored under the same name as the constructor argument:
        # BaseEstimator.get_params() / sklearn.base.clone() read each
        # __init__ parameter back via getattr(self, "<parameter name>").
        self.feature_names = feature_names

    @property
    def _feature_names(self):
        """Read-only alias for code that used the former private attribute."""
        return self.feature_names

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured column labels."""
        return X[self.feature_names]

# Numeric columns ('Age', 'Fare'):
# fill missing values with the median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns ('Sex', 'Embarked'):
# fill missing values with the most frequent value, then one-hot encode.
# Categories unseen during fit are ignored at transform time.
categorical1_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer1 = Pipeline(steps=categorical1_steps)

# Categorical column ('Pclass'):
# only imputed with the most frequent value; no encoding step.
categorical2_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
]
categorical_transformer2 = Pipeline(steps=categorical2_steps)


# Use the column transformer to transform all the required columns

In [None]:
# Route each group of columns to its transformer:
# (step name, transformer, columns it receives).
column_steps = [
    ('numeric', numeric_transformer, ['Age', 'Fare']),
    ('categorical1', categorical_transformer1, ['Sex', 'Embarked']),
    ('categorical2', categorical_transformer2, ['Pclass']),
]

# remainder='passthrough': columns not named above are kept as they are.
features_preprocessor = ColumnTransformer(
    transformers=column_steps,
    remainder='passthrough',
)

# Find the best estimator using GridSearchCV

In [None]:
# Candidate models. A fixed random_state is passed where the estimator
# takes one, for reproducibility.
logistic_model = LogisticRegression(random_state=0)
knn_model = KNeighborsClassifier()
forest_model = RandomForestClassifier(random_state=0)

# Order matters: position i here is paired with parameters[i] further down.
classifiers = [logistic_model, knn_model, forest_model]

In [None]:
# parameter grids for the various classifiers
# Keys carry the 'classifier__' prefix so they address the pipeline step
# named 'classifier'.

# LogisticRegression: pair each solver only with penalties it supports.
# 'newton-cg' and 'lbfgs' support l2 only; 'liblinear' supports l1 and l2.
# A single crossed grid would include l1 + newton-cg / lbfgs, whose fits
# fail and get scored as NaN. GridSearchCV accepts a list of grids instead.
_logregress_C = np.logspace(-3, 3, 7)
logregress_parameters = [
    {
        'classifier__penalty' : ['l1', 'l2'],
        'classifier__C'       : _logregress_C,
        'classifier__solver'  : ['liblinear'],
    },
    {
        'classifier__penalty' : ['l2'],
        'classifier__C'       : _logregress_C,
        'classifier__solver'  : ['newton-cg', 'lbfgs'],
    },
]

# KNN: odd neighbour counts 1, 3, ..., 23.
knn_parameters = {
    'classifier__n_neighbors': np.arange(1, 25, 2)
}

# Random forest: number of trees.
randomforest_parameters = {
    'classifier__n_estimators': [50, 100, 200, 300]
}


In [None]:
# All parameter grids in one list, in the same order as `classifiers`
# (logistic regression, KNN, random forest).
parameters = [logregress_parameters, knn_parameters, randomforest_parameters]

In [None]:
# Collects (name, tuned estimator) pairs for the voting ensemble.
estimators = list()

In [None]:
# iterate through each classifier and use GridSearchCV

# Start from an empty list every time this cell runs. Otherwise a re-run
# appends a second set of pairs with the same names, and VotingClassifier
# does not accept duplicate estimator names.
estimators = []

# classifiers[i] is tuned with parameters[i]; fail loudly if they drift apart.
assert len(classifiers) == len(parameters), \
    "classifiers and parameters must have the same length"

for classifier, param_grid in zip(classifiers, parameters):
    # preprocessing + model in one pipeline, so the preprocessing is
    # re-fitted inside every cross-validation fold
    pipe = Pipeline(steps=[
        ('preprocessor', features_preprocessor),
        ('classifier', classifier)
    ])
    clf = GridSearchCV(pipe,                    # model
                       param_grid=param_grid,   # hyperparameters
                       scoring='accuracy',      # metric for scoring
                       cv=10)                   # number of folds
    # Tune on the training split only, so the held-out test rows take no
    # part in model selection. This is also the data the ensemble is
    # fitted on later.
    clf.fit(X_train, y_train)
    print("Tuned Hyperparameters :", clf.best_params_)
    print("Accuracy :", clf.best_score_)    # mean cross-validated accuracy
    # add the clf to the estimators list, keyed by the classifier's class name
    estimators.append((classifier.__class__.__name__, clf))

In [None]:
# Majority-vote ensemble over the tuned estimators.
ensemble = VotingClassifier(estimators=estimators, voting='hard')

In [None]:
#ensemble = VotingClassifier(estimators, voting='hard')  # default is
                                                        # 'hard'


In [None]:
#ensemble = VotingClassifier(estimators, voting='soft')


In [None]:
#ensemble = VotingClassifier(estimators,
#                            voting='soft',
#                            weights=[1,1,1])  # one weight per estimator

In [None]:
# Fit the voting ensemble on the training split.
ensemble.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted ensemble on the held-out test split.
test_score = ensemble.score(X_test, y_test)
test_score

In [None]:
# test data for 2 passengers (one list entry per passenger in every column)
test_data = {
    'Pclass': [2, 1],
    'Sex': ['male', 'female'],
    'Age': [35, 15],
    'Fare': [90, 20],
    'Embarked': ['S', 'Q'],
}

# Build a two-row frame with the same column names the ensemble was trained on.
test_passengers = pd.DataFrame(test_data)
ensemble.predict(test_passengers)
# array([0, 1])

**Assignment:**
1. Try the other voting methods
2. Select two passengers from the test.csv file. Use the ensemble to predict their survival. Compare the predictions to the actual values in test.csv