#  Pipelines w/ Scikit-Learn

In [None]:
#install needed libraries
#!pip3 install catboost
#!pip3 install sklearn_pandas

In [None]:
#import needed libraries
import numpy as np
import pandas as pd
import random
#library to make dummy classification data
from sklearn.datasets import make_classification
#library to impute missing numerical data
from sklearn.impute import SimpleImputer
#library to reduce dimensions and speed up machine learning algorithm training
from sklearn.decomposition import PCA
#libraries for algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
#roc and auc scoring 
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
#pipeline concatenates multiple processes to run sequentially (ex. preprocessing, modeling, evaluation)
from sklearn.pipeline import Pipeline
#for saving and loading pipelines
from joblib import dump,load
#scaling and categorical encoding
from sklearn.preprocessing import StandardScaler, OneHotEncoder
#library to map columns to transformations before passing them into models for training
from sklearn_pandas import DataFrameMapper

## Dummy Dataset Creation

In [None]:
categorical_features = ['feat_5', 'feat_6', 'feat_7', 'feat_8']
numerical_features = ['feat_1', 'feat_2', 'feat_3', 'feat_4']

#one seeded generator drives every random step below, so the whole dummy dataset
#(not only the make_classification part) is identical from run to run
rng = np.random.RandomState(42)

#make_classification creates normally distributed data with a std of 1
X, y = make_classification(n_samples=10000,
                           n_features=4, #numeric
                           n_redundant=0, #no redundant features
                           random_state=42,
                           weights=[0.5]) #50/50 split of positive and negative values

#add categorical columns (one per name in categorical_features)
for _ in range(len(categorical_features)):
    #number of classes for this column: random integer from 2 to 9 (randint's upper bound is exclusive)
    num_classes = rng.randint(2, 10)
    #entire column of random class ids in [0, num_classes), shaped as a column vector
    cat_col = rng.randint(num_classes, size=(X.shape[0], 1))
    X = np.hstack((X, cat_col)) #appends each categorical feature column to the dataset

#to dataframe
columns = [f'feat_{i+1}' for i in range(X.shape[1])]
X = pd.DataFrame(X, columns=columns)
y = pd.DataFrame(y, columns=['label'])

#scale regressors (shift the mean and std of values to reflect real dataset values)
for col in numerical_features:
    mean = rng.randint(10, 1000)
    std = rng.randint(1, 100)
    #vectorized on the whole column; astype(int) truncates toward zero
    X[col] = (mean + std * X[col]).astype(int)

#create string value categorical features
#(values are floats here because hstack promoted them, so labels look like 'str_3.0')
for col in categorical_features:
    X[col] = X[col].apply(lambda x: f'str_{x}' if not np.isnan(x) else x)

#create nans in dataset (30% for each feature):
#sample keeps 70% of the rows and index alignment on assignment leaves the rest as NaN;
#passing the shared rng gives every column its own, reproducible mask
for col in categorical_features + numerical_features:
    X[col] = X[col].sample(frac=0.7, random_state=rng)

#create final dataframe
df = X.merge(y, left_index=True, right_index=True)

In [None]:
df.sample(5)

In [None]:
#hold out the last 10% of rows as the test set (shuffle=False keeps the original row order)
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.1)

#features and label for the training rows
X_train = train_df[categorical_features + numerical_features]
y_train = train_df['label']

#features and label for the held-out rows
X_test = test_df[categorical_features + numerical_features]
y_test = test_df['label']

## Preprocessing and Training

In [None]:
#map transformations to columns
#categoricals: replace nans with an explicit 'UNK' level, then one-hot encode.
#handle_unknown='ignore' makes a category that was not seen during fit encode as all zeros
#instead of raising, which matters once the saved pipelines are applied to other data
cat = [([c], [SimpleImputer(strategy='constant', fill_value='UNK'),
              OneHotEncoder(handle_unknown='ignore')]) for c in categorical_features]
#numericals: impute the column mean (SimpleImputer default) to replace nans, then standardize
num = [([n], [SimpleImputer(), StandardScaler()]) for n in numerical_features]
mapper = DataFrameMapper(num + cat, df_out=True) #df_out=True returns a dataframe
lr_classifier = LogisticRegression()
dt_classifier = DecisionTreeClassifier()
rf_classifier = RandomForestClassifier()

#each pipeline = shared preprocessing step + one classifier.
#note: the same mapper instance is the first step of all three pipelines,
#so it is (re)fitted whenever any one of them is fitted
pipeline_lr = Pipeline([
    ('preprocess', mapper),
    ('lr_classifier', lr_classifier)
])

pipeline_dt = Pipeline([
    ('preprocess', mapper),
    ('dt_classifier', dt_classifier)
])

pipeline_rf = Pipeline([
    ('preprocess', mapper),
    ('rf_classifier', rf_classifier)
])

In [None]:
#all pipelines in a fixed order: logistic regression, decision tree, random forest
pipelines = [
    pipeline_lr,
    pipeline_dt,
    pipeline_rf,
]

In [None]:
#performance evaluation: placeholders for the best score, its index in the pipelines list, and the pipeline itself
best_accuracy, best_classifier, best_pipeline = 0.0, 0, ""

In [None]:
#position in the pipelines list -> human-readable classifier name
pipe_dict = dict(enumerate(['Logistic Regression', 'Decision Tree', 'RandomForest']))

#train every pipeline (preprocessing + classifier) on the training split
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [None]:
#apply the mapper on its own to see the transformed test features
#(the mapper is the 'preprocess' step of the pipelines, so it was fitted when they were fitted)
preprocessed_X_test = mapper.transform(X_test)

In [None]:
#first rows of the raw test features, transposed so each feature is a row
X_test[numerical_features + categorical_features].head().transpose()

In [None]:
#first rows of the mapper output for the test features, transposed so each output column is a row
preprocessed_X_test.head().transpose()

In [None]:
#create a function to evaluate the auc
def evaluation(pipeline, X, y):
    """Score a fitted pipeline with ROC AUC.

    pipeline: fitted estimator exposing predict_proba
    X: features to score
    y: true labels for X

    Returns a dict with a single key, 'auc'.
    """
    #second column of predict_proba is used as the score for roc_auc_score
    positive_scores = pipeline.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, positive_scores)
    return {'auc': auc}

In [None]:
#preview the logistic regression pipeline's predicted probabilities (second predict_proba column)
#NOTE(review): X is the full feature frame built above (training and test rows together),
#not X_test -- confirm this is intended
y_predict_proba = pipeline_lr.predict_proba(X)[:, 1]
pred = pd.DataFrame(y_predict_proba)
pred.head()

In [None]:
#auc of each pipeline on the training split (lr, dt, rf order)
for fitted_pipeline in (pipeline_lr, pipeline_dt, pipeline_rf):
    print(evaluation(fitted_pipeline, X_train, y_train))

In [None]:
#auc of each pipeline on the held-out test split (lr, dt, rf order)
for fitted_pipeline in (pipeline_lr, pipeline_dt, pipeline_rf):
    print(evaluation(fitted_pipeline, X_test, y_test))

In [None]:
#test accuracy of each classifier (this is the value printed as "Test Accuracy", not R-squared)
for i, model in enumerate(pipelines):
    test_accuracy = model.score(X_test, y_test)
    print("{} Test Accuracy: {}".format(pipe_dict[i], test_accuracy))

In [None]:
#best classifier: keep the pipeline with the highest test accuracy
#reset the trackers here so re-running this cell always starts from a clean state
best_accuracy = 0.0
best_classifier = 0
best_pipeline = ""
for i, model in enumerate(pipelines):
    accuracy = model.score(X_test, y_test)
    #only replace the current best when this pipeline scores strictly higher
    #(without this comparison the last pipeline in the list would always win)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_pipeline = model
        best_classifier = i
print('Classifier with best accuracy: {}'.format(pipe_dict[best_classifier]))

In [None]:
#persist every fitted pipeline so other notebooks can load it
for fitted_pipeline, target_path in [
    (pipeline_lr, '../pipelines/pipeline_lr.joblib'),
    (pipeline_dt, '../pipelines/pipeline_dt.joblib'),
    (pipeline_rf, '../pipelines/pipeline_rf.joblib'),
]:
    dump(fitted_pipeline, target_path)

#also keep the dummy test dataframe next to the pipelines for later usage (if needed)
test_df.to_csv('../pipelines/test_df.csv')

In [None]:
#load the logistic regression pipeline back from the file written by dump above
#NOTE: joblib files are pickle-based, so only load files from a trusted source
lr = load('../pipelines/pipeline_lr.joblib')

In [None]:
#test the loaded pipeline by scoring it on the held-out test split
lr.score(X_test, y_test)