In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
np.random.seed(2049)

## Load data

In [None]:
# Load the training features, the test features and the training labels
# from the .npy files under data/ (loaded in this order).
X, X_test, y = (
    np.load(npy_path)
    for npy_path in (
        "data/X_processed.npy",
        "data/X_processed_test.npy",
        "data/y.npy",
    )
)

## Split

In [None]:
# Hold out 20% of the labelled data for validation.
# random_state pins the split so that re-running this cell on its own reproduces it
# (otherwise the result depends on how much of the global NumPy RNG was already consumed).
# stratify=y keeps the class proportions the same in the train and validation parts,
# which matters here because the notebook treats the classes as imbalanced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=2049, stratify=y
)

## Classify

In [None]:
# Check the class imbalance: np.unique returns the sorted unique labels and,
# with return_counts=True, how many times each of them occurs in y.
weights = np.unique(y, return_counts=True)
class_labels, class_counts = weights

# Counts follow the sorted label order, so index 0 belongs to the smaller label
# and index 1 to the larger one.
# NOTE(review): assumes binary labels with 0 = inactive and 1 = active — confirm.
inactive = class_counts[0]
active = class_counts[1]
total = y.shape[0]

# Kept as the last expression so the notebook actually displays the
# (labels, counts) pair; a bare expression in the middle of a cell shows nothing.
weights

In [None]:
# Per-class weights: inverse class frequency scaled by total / 2, so the rarer
# class receives the larger weight.
inverse_count_0 = 1 / inactive
inverse_count_1 = 1 / active
weight_for_0 = inverse_count_0 * total / 2.0
weight_for_1 = inverse_count_1 * total / 2.0

# Mapping from class label to weight, shown as the cell output.
class_weight = dict(zip((0, 1), (weight_for_0, weight_for_1)))
class_weight

#### Random forests (RF)

In [None]:
# params = {'n_estimators': [50, 100, 500],
#           'max_depth': [4, 5, 7]
#          }

# rf = RandomForestClassifier(random_state=2049, class_weight=class_weight)
# gcv = GridSearchCV(rf, param_grid=params, n_jobs=1, verbose = 15, scoring='f1')
# gcv.fit(X_train, y_train)

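# rf_best = RandomForestClassifier(**gcv.best_params_, random_state=2049, class_weight=class_weight)
# NOTE(review): the next line fits `rf` (the untuned forest), so `rf_best` is never trained or used.
# If this experiment is revived, fit and predict with `rf_best` (or use `gcv.best_estimator_`).
# rf.fit(X_train, y_train)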


In [None]:
# preds_train = rf.predict(X_train)
# preds_val = rf.predict(X_val)
# preds_test = rf.predict(X_test)

In [None]:
# print(f"Train F1: {f1_score(y_train, preds_train)}")
# print(f"Val F1: {f1_score(y_val, preds_val)}")

#### XGB

In [None]:
# Class imbalance: weight applied to the positive class, taken as the square
# root of the inactive-to-active count ratio.
imbalance_ratio = inactive / active
scale_pos_weight = np.sqrt(imbalance_ratio)
scale_pos_weight

In [None]:
# xgb.set_config(verbosity=2)
# xgb_model = xgb.XGBClassifier()
# parameters = {'objective':['binary:logistic'],
#               'learning_rate': [0.1, 0.5, 0.7],
#               'max_depth': [4, 7, 10],
#               'n_estimators': [500, 1000, 3000], 
#               'seed': [2049]}


# clf = GridSearchCV(xgb_model, parameters, n_jobs=1, 
#                    cv=5, 
#                    scoring='f1',
#                    verbose=5)

# clf.fit(X_train, y_train)

In [None]:
# clf.best_params_

In [None]:
# Hyperparameters for the final XGBoost classifier.
# NOTE(review): these values also appear in the commented-out grid search —
# presumably its best parameters; confirm.
xgb_params = dict(
    learning_rate=0.7,
    max_depth=7,
    n_estimators=1000,
    objective='binary:logistic',
    scale_pos_weight=scale_pos_weight,  # up-weights the positive class
    seed=2049,
)

xgb_model = xgb.XGBClassifier(**xgb_params)
# Train on the training part of the split only; the validation part stays unseen.
xgb_model.fit(X_train, y_train)

In [None]:
# Predict class labels for the training, validation and test features, in that order.
preds_train, preds_val, preds_test = (
    xgb_model.predict(features)
    for features in (X_train, X_val, X_test)
)

## Performance

In [None]:
# F1 on the data the model was fitted on versus the held-out validation data;
# a large gap between the two indicates overfitting.
f1_train = f1_score(y_train, preds_train)
f1_val = f1_score(y_val, preds_val)

print(f"Train F1: {f1_train}")
print(f"Val F1: {f1_val}")

## Saving results

In [None]:
# Put the test-set predictions into a single int32 column named 'y' and write
# them to CSV with neither a header row nor an index column.
column_dtypes = {'y': 'int32'}
df = pd.DataFrame(preds_test, columns=['y']).astype(column_dtypes)
df.to_csv('data/test_preds.csv', header=False, index=False)