In [1]:
import math
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder

# Root directory that holds one sub-folder per dataset (see how `main` builds
# its CSV paths). The default is the original author's location; set the
# KARUNA_DATA_DIR environment variable to run the notebook against a copy
# stored elsewhere without editing the code.
DATA_DIR = Path(os.environ.get("KARUNA_DATA_DIR", "/home/ancarey/kennedy/karuna_data/original"))

# Installs a catch-all "ignore" filter, so no warnings are shown by later cells.
# NOTE(review): this also hides deprecation and data-quality warnings from
# pandas / scikit-learn; narrow it to specific categories if those matter.
warnings.filterwarnings("ignore")

In [4]:
def _load_labeled_csv(csv_path, dataset):
    """Read one CSV, drop rows with missing values and normalise ``label``.

    For the ``'car'`` dataset the labels 1, 2 and 3 are collapsed into 1
    (0 stays 0); for ``'diabetes'`` and ``'heart'`` the label column is cast
    to bool. Other datasets keep their label column as read.
    """
    frame = pd.read_csv(csv_path).dropna(axis=0)
    if dataset == 'car':
        frame['label'] = frame['label'].map({0: 0, 1: 1, 2: 1, 3: 1})
    if dataset in ('diabetes', 'heart'):
        frame['label'] = frame['label'].astype('bool')
    return frame


def main(dataset, rs):
    """Train and score a random forest on one dataset for a given seed.

    Reads ``<dataset>_original.csv`` and ``<dataset>_original_test.csv`` from
    ``DATA_DIR / dataset``, pools them, and draws a fresh 80/20 train/test
    split. Object-dtype columns are one-hot encoded and all other columns are
    passed through unchanged before fitting a ``RandomForestClassifier``.

    Parameters
    ----------
    dataset : str
        Name of the dataset folder / file prefix under ``DATA_DIR``.
    rs : int
        Seed used for both the train/test split and the random forest, so the
        reported metrics are reproducible for a given value.

    Returns
    -------
    tuple of (float, float)
        Accuracy and F1 score on the held-out split. The same values, rounded
        to three decimals, are also printed as ``RF <acc> <f1>``.
    """
    dataset_dir = DATA_DIR / dataset
    train_df = _load_labeled_csv(dataset_dir / f'{dataset}_original.csv', dataset)
    test_df = _load_labeled_csv(dataset_dir / f'{dataset}_original_test.csv', dataset)

    # The provided train/test files are pooled and re-split, so the shipped
    # partition is not used as-is; `rs` decides which rows are held out.
    full_data = pd.concat([train_df, test_df], axis=0, ignore_index=True)
    target = full_data['label']
    data = full_data.drop(columns=['label'])
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=.2, random_state=rs
    )

    categorical_columns = selector(dtype_include=object)(X_train)
    RF_preprocessor = ColumnTransformer(
        [('one-hot-encoder', OneHotEncoder(handle_unknown="ignore"), categorical_columns)],
        remainder="passthrough",
    )

    # Seed the forest as well as the split: an unseeded forest gives different
    # metrics on every run even when the split is fixed.
    RF = make_pipeline(RF_preprocessor, RandomForestClassifier(random_state=rs))
    RF.fit(X_train, y_train)
    predictions = RF.predict(X_test)

    acc = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    print('RF', round(acc, 3), round(f1, 3))
    return acc, f1

In [12]:
# Seed sweep: evaluate the 'jungle' dataset once per seed, echoing the seed
# before the metrics line that `main` prints so each result can be matched to it.
rs = [42, 456, 8543, 12345, 99999]
for r in rs:
    print(r)
    main(dataset='jungle', rs=r)

42
RF 0.829 0.83
456
RF 0.818 0.82
8543
RF 0.826 0.826
12345
RF 0.828 0.833
99999
RF 0.825 0.827
