# sklearn and cuml model

In [1]:
import pandas as pd
import numpy as np
import cudf
from cuml.ensemble import RandomForestClassifier as curfc
from sklearn.ensemble import RandomForestClassifier as skrfc
from cuml.metrics import accuracy_score

In [2]:
# Load the loan-approval data from a CSV in the current working directory.
# NOTE: later cells refer to columns such as ' loan_status' with a leading
# space, so the header names are used exactly as they appear in the file.
dataset = pd.read_csv('loan_approval_dataset.csv')

In [3]:
# Preview the first five rows of the raw frame.
dataset.head(n=5)

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


## data preparation

In [4]:
# Target column name, including the leading space used in this notebook.
target_cols = ' loan_status'
# Features are every column except the first (loan_id in the preview above)
# and the last (the target).
input_cols = dataset.columns[1:-1].tolist()

In [5]:
# Take independent copies so later in-place edits to X / y never touch `dataset`.
X = dataset.loc[:, input_cols].copy()
y = dataset.loc[:, target_cols].copy()
# Sanity check: feature matrix and target must have the same number of rows.
print(X.shape, y.shape)

(4269, 11) (4269,)


In [6]:
# Partition the feature names by dtype: numeric columns get scaled,
# object (string) columns get label-encoded in later cells.
num_cols = list(X.select_dtypes(include=[np.number]).columns)
cat_cols = list(X.select_dtypes(include=['object']).columns)

In [7]:
# Show the numeric and categorical column lists, one per line.
print(num_cols, cat_cols, sep="\n")

[' no_of_dependents', ' income_annum', ' loan_amount', ' loan_term', ' cibil_score', ' residential_assets_value', ' commercial_assets_value', ' luxury_assets_value', ' bank_asset_value']
[' education', ' self_employed']


In [8]:
from sklearn.preprocessing import MinMaxScaler

# Learn per-column min/max from the numeric features.
# NOTE(review): the scaler is fit on every row of `dataset`, i.e. before the
# train/test split performed further down, so test rows contribute to the
# scaling statistics. Consider fitting on the training split only.
scaler = MinMaxScaler().fit(dataset.loc[:, num_cols])

In [9]:
# Replace the numeric feature columns with their min-max scaled values,
# stored as float32.
# NOTE: this overwrites X in place, so re-running the cell applies the
# transform to already-scaled values; re-create X first if you need to re-run.
X[num_cols] = scaler.transform(X[num_cols]).astype(np.float32)

In [10]:
# Preview the first five rows after scaling the numeric columns.
X.head(n=5)

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,0.4,Graduate,No,0.969072,0.755102,0.555556,0.796667,0.085616,0.907216,0.575835,0.544218
1,0.0,Not Graduate,Yes,0.402062,0.303571,0.333333,0.195,0.09589,0.113402,0.218509,0.22449
2,0.6,Graduate,No,0.917526,0.75,1.0,0.343333,0.246575,0.231959,0.848329,0.870748
3,0.6,Graduate,No,0.824742,0.77551,0.333333,0.278333,0.626712,0.170103,0.59126,0.537415
4,1.0,Not Graduate,Yes,0.989691,0.609694,1.0,0.136667,0.428082,0.42268,0.748072,0.340136


In [11]:
# Check column dtypes after scaling: the numeric columns were cast to float32
# above; the two categorical columns have not been encoded yet at this point.
X.dtypes

no_of_dependents            float32
education                    object
self_employed                object
income_annum                float32
loan_amount                 float32
loan_term                   float32
cibil_score                 float32
residential_assets_value    float32
commercial_assets_value     float32
luxury_assets_value         float32
bank_asset_value            float32
dtype: object

In [12]:
from sklearn.preprocessing import LabelEncoder

# Encode every categorical feature with its OWN LabelEncoder and keep the
# fitted encoders, keyed by column name. Re-fitting one shared encoder (as the
# cell did before) discards each column's mapping as soon as the next fit
# runs, so the integer codes could not be mapped back to labels or re-applied
# to new data.
feature_encoders = {}
for col in cat_cols:
    feature_encoders[col] = LabelEncoder()
    X[col] = feature_encoders[col].fit_transform(X[col]).astype(np.int8)

# `encoder` remains bound to the encoder fitted on the target, exactly as
# before, so any later use of `encoder` keeps working.
encoder = LabelEncoder()
y = encoder.fit_transform(y).astype(np.int8)

In [13]:
# Display the label-encoded target (an int8 array produced by the cell above).
y

array([0, 1, 1, ..., 1, 0, 0], dtype=int8)

In [14]:
# Display the fully prepared feature frame (scaled numerics, encoded categoricals).
X

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,0.4,0,0,0.969072,0.755102,0.555556,0.796667,0.085616,0.907216,0.575835,0.544218
1,0.0,1,1,0.402062,0.303571,0.333333,0.195000,0.095890,0.113402,0.218509,0.224490
2,0.6,0,0,0.917526,0.750000,1.000000,0.343333,0.246575,0.231959,0.848329,0.870748
3,0.6,0,0,0.824742,0.775510,0.333333,0.278333,0.626712,0.170103,0.591260,0.537415
4,1.0,1,1,0.989691,0.609694,1.000000,0.136667,0.428082,0.422680,0.748072,0.340136
...,...,...,...,...,...,...,...,...,...,...,...
4264,1.0,0,1,0.082474,0.051020,0.555556,0.028333,0.099315,0.025773,0.077121,0.054422
4265,0.0,1,1,0.319588,0.280612,1.000000,0.431667,0.147260,0.149485,0.275064,0.129252
4266,0.4,1,0,0.649485,0.602041,0.888889,0.261667,0.044521,0.639175,0.457584,0.496599
4267,0.2,1,0,0.402062,0.318878,0.333333,0.800000,0.284247,0.036082,0.354756,0.394558


## data_split

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Report the shape of each split, one per line.
print(
    f"Shape of X_train: {X_train.shape}",
    f"Shape of X_test: {X_test.shape}",
    f"Shape of y_train: {y_train.shape}",
    f"Shape of y_test: {y_test.shape}",
    sep="\n",
)

Shape of X_train: (3415, 11)
Shape of X_test: (854, 11)
Shape of y_train: (3415,)
Shape of y_test: (854,)


In [17]:
# Copy both pandas feature splits into cuDF DataFrames for the cuML model.
X_cudf_train, X_cudf_test = (
    cudf.DataFrame.from_pandas(split) for split in (X_train, X_test)
)

In [18]:
# Wrap the training labels in a cuDF Series for cuML's fit().
# Only the training labels are converted; y_test is passed to accuracy_score
# below in its original form.
y_cudf_train = cudf.Series(y_train)

## training models

### sklearn

In [19]:
%%time
sk_model = skrfc(n_jobs=-1, 
                 random_state=42, 
                 n_estimators=260,
                 max_depth=16,
                 min_impurity_decrease=1e-5)

sk_model.fit(X_train, y_train)

CPU times: user 1.32 s, sys: 463 ms, total: 1.78 s
Wall time: 882 ms


In [20]:
%%time
# Predict on the held-out split with the scikit-learn model and score it.
# `accuracy_score` here is the one imported from cuml.metrics at the top of
# the notebook, so both models are scored by the same function.
sk_predict = sk_model.predict(X_test)
sk_acc = accuracy_score(y_test, sk_predict)

CPU times: user 89 ms, sys: 75.6 ms, total: 165 ms
Wall time: 134 ms


### cuml

In [21]:
%%time
cuml_model = curfc(
                   random_state=42, 
                   n_streams=1,
                   max_depth=15,
                   min_impurity_decrease=1e-6,
)

cuml_model.fit(X_cudf_train, y_cudf_train)

CPU times: user 495 ms, sys: 281 ms, total: 776 ms
Wall time: 882 ms


In [22]:
%%time
# Predict on the cuDF test features with the cuML model.
fil_preds_orig = cuml_model.predict(X_cudf_test)

# Score against y_test, the labels produced by train_test_split (never
# converted to cuDF), using cuml.metrics.accuracy_score.
fil_acc_orig = accuracy_score(y_test, fil_preds_orig)

CPU times: user 6.23 s, sys: 316 ms, total: 6.55 s
Wall time: 441 ms


### accuracy comparison

In [23]:
# Side-by-side accuracy of the two models on the same held-out split.
# NOTE(review): the "before pickling" wording suggests a later pickling step;
# none is visible in this part of the notebook. Confirm the label still applies.
print(
    "SKL accuracy: %s" % sk_acc,
    "CUML accuracy before pickling: %s" % fil_acc_orig,
    sep="\n",
)

SKL accuracy: 0.9800936579704285
CUML accuracy before pickling: 0.9777517318725586
