In [76]:
import pandas as pd
import numpy as np
import cupy as cp
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.base import BaseEstimator
from cuml.ensemble import RandomForestClassifier
import cudf
import cuml
from cuml.metrics import accuracy_score

cp.random.seed(42)
np.random.seed(42)


In [75]:
X_train = pd.read_parquet('../data/interim/X_train.parquet')
X_test = pd.read_parquet('../data/interim/X_test.parquet')
y_train = pd.read_parquet('../data/interim/y_train.parquet')
y_test = pd.read_parquet('../data/interim/y_test.parquet')

X_train_ind = X_train[X_train['is_independent'] == 1].copy()
X_test_ind = X_test[X_test['is_independent'] == 1].copy()
y_train_ind = y_train[y_train['is_independent'] == 1].copy()
y_test_ind = y_test[y_test['is_independent'] == 1].copy()

dataframes = [X_train, X_test, y_train, y_test, X_train_ind, X_test_ind, y_train_ind, y_test_ind]
for df in dataframes:
    df.drop('is_independent', axis=1, inplace=True)

y_train = np.ravel(y_train)
y_test = np.ravel(y_test)
y_train_ind = np.ravel(y_train_ind)
y_test_ind = np.ravel(y_test_ind)

cross_validation = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

In [48]:
def sk_model(param_grid, model, cv, independent=False):

    X_train_local = X_train_ind if independent else X_train
    y_train_local = y_train_ind if independent else y_train
    X_test_local = X_test_ind if independent else X_test
    y_test_local = y_test_ind if independent else y_test
    
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv)
    grid_search.fit(X_train_local, y_train_local)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test_local)
    report = classification_report(y_test_local, y_pred)
    print(f'best params: {grid_search.best_params_}')
    print(report)

## naive bayes<a id='naive bayes'></a>

In [7]:
param_grid = {}

model = GaussianNB()
sk_model(param_grid=param_grid,
        model=model,
        cv=5,
        independent=True
)

best params: {}
              precision    recall  f1-score   support

       False       0.72      0.08      0.14      6118
        True       0.39      0.95      0.55      3772

    accuracy                           0.41      9890
   macro avg       0.56      0.51      0.34      9890
weighted avg       0.60      0.41      0.30      9890



## logistic regression<a id='logistic_regression'></a>

In [72]:
X_train_cudf = cudf.DataFrame.from_pandas(X_train_ind)
y_train_cudf = cudf.Series(y_train_ind)
X_train_cp = cp.asarray(X_train_cudf.to_pandas().values)
y_train_cp = cp.asarray(y_train_cudf.to_pandas().values)
    
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1','l2']
}

cuml_model = cuml.LogisticRegression(solver='qn')
grid_search = GridSearchCV(cuml_model, param_grid, cv=cross_validation)
grid_search.fit(X_train_cp.get(), y_train_cp.get())

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_ind)
report = classification_report(y_test_ind, y_pred)
print(f'best params: {grid_search.best_params_}')
print(report)


best params: {'C': 0.01, 'penalty': 'l2'}
              precision    recall  f1-score   support

       False       0.66      0.90      0.76      6118
        True       0.59      0.24      0.34      3772

    accuracy                           0.65      9890
   macro avg       0.62      0.57      0.55      9890
weighted avg       0.63      0.65      0.60      9890



## random forest<a id='random_forest'></a>

In [79]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

model = RandomForestClassifier()
grid_search = GridSearchCV(model, param_grid, cv=cross_validation)
grid_search.fit(X_train_cp.get(), y_train_cp.get())

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_ind)
report = classification_report(y_test_ind, y_pred)
print(f'best params: {grid_search.best_params_}')
print(report)

  ret = func(*args, **kwargs)


TypeError: an integer is required