In [14]:
# todo
# make sure random forest is giving consistent results
# add svm
# add gradient boosted machine (or adaboost)
# add a neural network
# clean up the rapids code
# add the explanations

## contents
* [introduction](#introduction)
* [imports, load and model tester](#imports)
* [naive bayes](#naive_bayes)
* [logistic regression](#logistic_regression)
* [random forest](#random_forest)
* [support vector machine](#svm)
* [gradient boosting](#gradient_boosting)
* [neural network](#)
* [model selection and conclusion](#conclusion)

## imports, load and model tester<a id='imports'></a>

In [25]:
import numpy as np
import pandas as pd

from cuml import LogisticRegression
from cuml import SVC
from cuml.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB


In [16]:
# Directory holding the interim parquet splits, and the flag column that marks
# the "independent" subset. The flag is only used for filtering and is removed
# from every frame before modelling.
DATA_DIR = '../data/interim'
FLAG_COLUMN = 'is_independent'


def _independent_only(frame):
    """Return the rows of ``frame`` whose flag column equals 1, minus the flag column."""
    return frame[frame[FLAG_COLUMN] == 1].drop(columns=FLAG_COLUMN)


# Raw splits exactly as stored on disk (flag column still present).
X_train_raw = pd.read_parquet(f'{DATA_DIR}/X_train.parquet')
X_test_raw = pd.read_parquet(f'{DATA_DIR}/X_test.parquet')
y_train_raw = pd.read_parquet(f'{DATA_DIR}/y_train.parquet')
y_test_raw = pd.read_parquet(f'{DATA_DIR}/y_test.parquet')

# Independent-only subsets. Each y frame is filtered by its own flag column;
# np.ravel flattens what is left into a 1-D label array
# (assumes a single label column remains once the flag is dropped -- TODO confirm).
X_train_ind = _independent_only(X_train_raw)
X_test_ind = _independent_only(X_test_raw)
y_train_ind = np.ravel(_independent_only(y_train_raw))
y_test_ind = np.ravel(_independent_only(y_test_raw))

# Full splits, derived without mutating the raw frames so the cell can be
# re-read top to bottom without tracking in-place edits.
X_train = X_train_raw.drop(columns=FLAG_COLUMN)
X_test = X_test_raw.drop(columns=FLAG_COLUMN)
y_train = np.ravel(y_train_raw.drop(columns=FLAG_COLUMN))
y_test = np.ravel(y_test_raw.drop(columns=FLAG_COLUMN))

# Shared, seeded splitter so every grid search sees the same five folds.
cross_validation = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

In [17]:
def sk_model(param_grid, model, cv, independent=False):
    """Grid-search ``model`` and print a classification report on the test split.

    The train/test data are taken from the notebook-level variables
    (``X_train``/``y_train``/``X_test``/``y_test`` or their ``*_ind``
    counterparts), not passed in.

    Parameters
    ----------
    param_grid : dict
        Hyperparameter grid forwarded to ``GridSearchCV``.
    model : estimator
        Unfitted estimator forwarded to ``GridSearchCV``.
    cv : int or cross-validation splitter
        Forwarded to ``GridSearchCV`` as ``cv``.
    independent : bool, default False
        When truthy, fit and evaluate on the ``*_ind`` subsets instead of the
        full splits.

    Returns
    -------
    None
        The best parameters and the report are printed, not returned.
    """
    # Choose which notebook-level split to work on.
    if independent:
        fit_features, fit_labels = X_train_ind, y_train_ind
        eval_features, eval_labels = X_test_ind, y_test_ind
    else:
        fit_features, fit_labels = X_train, y_train
        eval_features, eval_labels = X_test, y_test

    search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv)
    search.fit(fit_features, fit_labels)

    # Score the winning estimator on the test split.
    predictions = search.best_estimator_.predict(eval_features)
    summary = classification_report(eval_labels, predictions)

    print(f'best params: {search.best_params_}')
    print(summary)

## naive bayes<a id='naive_bayes'></a>

In [18]:
# GaussianNB is run with its defaults, so the grid is empty and the search
# reduces to a single fit.
param_grid = {}

model = GaussianNB()
# Use the shared seeded splitter (not a bare `cv=5`) so this model is
# cross-validated on the same folds as the other models in the notebook.
sk_model(param_grid=param_grid,
         model=model,
         cv=cross_validation,
         independent=True)

best params: {}
              precision    recall  f1-score   support

       False       0.72      0.08      0.14      6118
        True       0.39      0.95      0.55      3772

    accuracy                           0.41      9890
   macro avg       0.56      0.51      0.34      9890
weighted avg       0.60      0.41      0.30      9890



## logistic regression<a id='logistic_regression'></a>

In [19]:
# Regularisation strengths and penalty types searched for logistic regression.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
}

# Use the class imported at the top of the notebook. The bare module name
# `cuml` is never imported, so `cuml.LogisticRegression` raises NameError on a
# fresh kernel.
model = LogisticRegression(solver='qn')
sk_model(param_grid=param_grid,
         model=model,
         cv=cross_validation,
         independent=True)

best params: {'C': 0.01, 'penalty': 'l2'}
              precision    recall  f1-score   support

       False       0.66      0.90      0.76      6118
        True       0.59      0.24      0.34      3772

    accuracy                           0.65      9890
   macro avg       0.62      0.57      0.55      9890
weighted avg       0.63      0.65      0.60      9890



## random forest<a id='random_forest'></a>

In [20]:
# Forest size and tree-shape hyperparameters searched for the random forest.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Seed the forest and build it on a single stream so repeated runs can be
# compared. cuML documents that results may vary with n_streams > 1 even when
# random_state is set. RandomForestClassifier is imported from cuml.ensemble
# in the imports cell.
model = RandomForestClassifier(random_state=42, n_streams=1)
sk_model(param_grid=param_grid,
         model=model,
         cv=cross_validation,
         independent=True)

  ret = func(*args, **kwargs)


best params: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
              precision    recall  f1-score   support

       False       0.65      0.93      0.76      6118
        True       0.62      0.20      0.30      3772

    accuracy                           0.65      9890
   macro avg       0.64      0.56      0.53      9890
weighted avg       0.64      0.65      0.59      9890



## support vector machine<a id='svm'></a>

In [27]:
# Instantiate a default cuML SVC only to list the hyperparameter names it
# exposes; this informs the grid used in the next cell.
svc = SVC()

svc.get_param_names()

['handle',
 'verbose',
 'output_type',
 'C',
 'kernel',
 'degree',
 'gamma',
 'coef0',
 'tol',
 'cache_size',
 'max_iter',
 'nochange_steps',
 'probability',
 'random_state',
 'class_weight',
 'multiclass_strategy']

In [29]:
# Penalty strength, kernel coefficient and kernel type searched for the SVC
# (5 x 5 x 2 = 50 candidates, each fitted once per CV fold).
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'poly'],
}

model = SVC()

# Evaluate on the independent subset like every other model in this notebook,
# so the reports are comparable in the model-selection section.
# NOTE(review): the original call used independent=False -- confirm that was
# not intentional.
sk_model(param_grid=param_grid,
         model=model,
         cv=cross_validation,
         independent=True)


In [None]:
## gradient boosting<a id='gradient_boosting'></a>

In [None]:
# cuml.ensemble does not provide a GradientBoostingClassifier, so the
# scikit-learn implementation (imported in the imports cell) is used instead.
# The seed matches the one used for the cross-validation splitter.
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train_ind, y_train_ind)