# Learn Python Programming Meetup : Parallel Processing

### Usecase: Building Machine Learning Model in Parallel

In [None]:
! pip3 install pandas sklearn

In [1]:
import time
from multiprocessing import Pool

import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn.metrics import roc_curve
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              ExtraTreesClassifier, GradientBoostingClassifier)

In [2]:
def process_data_for_modeling(filepath):
    df = pd.read_csv(filepath, low_memory=False)
    gender_list = df.SEX.sample(10**4).map({'M':1, 'F':0})
    df = df.rename(columns={"FRACE6": "Fathers_Race", "MRAVE6": "Mothers_Race"})
    df['DBWT'] = df.DBWT.apply(lambda x: 0 if x > 2500 else 1)
    df_ml = df.sample(2*10**4)
    df_ml = df_ml.drop(columns=['FRACE15', 'Fathers_Race', 'MRACE15', 'Mothers_Race'])
    df_ml = pd.get_dummies(df_ml, columns=['ATTEND', 'BFACIL', 'DMAR', 'FEDUC', 'FHISPX', 
                               'FRACE31', 'IMP_SEX', 'IP_GON', 'LD_INDL', 
                               'MAGE_IMPFLG', 'MAR_IMP', 'MBSTATE_REC', 'MEDUC', 
                               'MHISPX', 'MM_AICU', 'MRACE31', 'MRACEIMP', 
                               'MTRAN', 'NO_INFEC', 'NO_MMORB', 'NO_RISKS', 
                               'PAY', 'PAY_REC', 'PRECARE', 'RDMETH_REC', 
                               'RESTATUS', 'RF_CESAR', 'RF_CESARN', 'SEX'], 
                   drop_first=True)
    return df_ml

In [3]:
def find_logloss(model):
    print("Starting Compute: {:32}".format(model.__name__))
    clf = model()
    clf.fit(X_train, y_train)
    y_out = clf.predict(X_test)
    error = log_loss(y_test, y_out)
    print("Computed Ended for model {:32} & LogLoss Error Score: {:.2f}"
          .format(model.__name__, error))

In [4]:
def compute_in_parallel(models):
    pool = Pool(4)
    pool.map(find_logloss, models)
    pool.close()
    pool.join()

In [5]:
def compute_in_sequence(models):
    for model in models:
        find_logloss(model)

In [6]:
# Download dataset and place in same directory where you start Jupyter Notebook
# dataset link: https://www.kaggle.com/des137/us-births-2018
dataset_filepath='US_births(2018).csv'

In [7]:
models = [RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, 
          GradientBoostingClassifier]
df = process_data_for_modeling(dataset_filepath)

X = df.drop(columns='DBWT')
y = df.DBWT
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [8]:
start = time.time()
compute_in_parallel(models)
end = time.time()
print(end-start)

Starting Compute: RandomForestClassifier          
Starting Compute: AdaBoostClassifier              
Starting Compute: GradientBoostingClassifier      
Starting Compute: ExtraTreesClassifier            
Computed Ended for model AdaBoostClassifier               & LogLoss Error Score: 2.68
Computed Ended for model RandomForestClassifier           & LogLoss Error Score: 2.73
Computed Ended for model ExtraTreesClassifier             & LogLoss Error Score: 2.80
Computed Ended for model GradientBoostingClassifier       & LogLoss Error Score: 2.66
5.53488302230835


In [9]:
start = time.time()
compute_in_sequence(models)
end = time.time()
print(end-start)

Starting Compute: RandomForestClassifier          
Computed Ended for model RandomForestClassifier           & LogLoss Error Score: 2.73
Starting Compute: AdaBoostClassifier              
Computed Ended for model AdaBoostClassifier               & LogLoss Error Score: 2.68
Starting Compute: ExtraTreesClassifier            
Computed Ended for model ExtraTreesClassifier             & LogLoss Error Score: 2.82
Starting Compute: GradientBoostingClassifier      
Computed Ended for model GradientBoostingClassifier       & LogLoss Error Score: 2.66
10.400805950164795


### Ref:
+ https://www.kaggle.com/des137/us-births-2018
+ https://www.kaggle.com/des137/usbirths