In [None]:
import numpy as np
import pandas as pd
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
# Seed NumPy's global random number generator so NumPy-level randomness is repeatable.
np.random.seed(2049)

## Load data

In [None]:
# Column 0 of each processed feature file is split off as an id column
# (pids / test_idx); the remaining columns are kept as the feature matrix.
train_raw = np.load("data/X_train_processed.npy")
pids = train_raw[:, 0]
X = train_raw[:, 1:]

test_raw = np.load("data/X_test_processed.npy")
test_idx = test_raw[:, 0]
X_test = test_raw[:, 1:]

# Target array; its columns are sliced per subtask further down.
y = np.load("data/y.npy")

## Subtask 1

In [None]:
# Subtask 1 predicts label columns 1 through 10 (inclusive).
predcols1 = np.arange(1, 11)
y_1 = y[:, predcols1]
print(y_1.shape)

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_val, y1_train, y1_val = train_test_split(
    X,
    y_1,
    test_size=0.2,
    random_state=2049,
)

In [None]:
# Base gradient-boosting classifier, wrapped so one copy is fitted per target column.
r = GradientBoostingClassifier(
    random_state=2049,
    max_depth=5,
    n_estimators=50,
)
# fit() returns the fitted wrapper itself, so chaining yields the same object.
clf = MultiOutputClassifier(r, n_jobs=-1).fit(X_train, y1_train)
clf

In [None]:
def _second_class_proba(model, features):
    """Return an (n_samples, n_outputs) array holding column 1 of each output's predict_proba."""
    # Stacking the per-target probability arrays gives a 3-D array indexed as
    # (output, sample, class); keep class index 1 and put samples on the rows.
    stacked = np.array(model.predict_proba(features))
    return stacked[:, :, 1].T


train_preds = _second_class_proba(clf, X_train)
val_preds = _second_class_proba(clf, X_val)
test_preds1 = _second_class_proba(clf, X_test)

In [None]:
# Validation ROC AUC for each target column, followed by their average.
rocs = []
for col_id, y_true_col in enumerate(y1_val.T):
    roc = roc_auc_score(y_true_col, val_preds[:, col_id])
    rocs.append(roc)
    print(roc)
print(f"Mean: {np.mean(rocs)}")

## Subtask 2

In [None]:
# Subtask 2 has a single target: label column 11.
predcols2: int = 11
y_2 = y[:, predcols2]

In [None]:
# Same 80/20 split settings as the other subtasks, applied to the subtask 2 target.
X_train, X_val, y2_train, y2_val = train_test_split(
    X,
    y_2,
    test_size=0.2,
    random_state=2049,
)

In [None]:
# Single-target gradient-boosting classifier; fit() returns the fitted estimator.
clf = GradientBoostingClassifier(
    random_state=2049,
    max_depth=4,
    n_estimators=100,
).fit(X_train, y2_train)
clf

In [None]:
# Keep column 1 of predict_proba for every split; the result is 1-D, one value per row.
train_preds = clf.predict_proba(X_train)[:, 1]
val_preds = clf.predict_proba(X_val)[:, 1]
test_preds2 = clf.predict_proba(X_test)[:, 1]

In [None]:
# Report ROC AUC on the training split first, then on the validation split.
for split_name, y_true, y_score in (
    ("train", y2_train, train_preds),
    ("val", y2_val, val_preds),
):
    roc = roc_auc_score(y_true, y_score)
    print(f"AUCROC {split_name}: {roc}")

## Subtask 3

In [None]:
# Subtask 3 predicts label columns 12 through 15 (inclusive).
predcols3 = np.arange(12, 16)
y_3 = y[:, predcols3]

In [None]:
# Same 80/20 split settings as the other subtasks, applied to the subtask 3 targets.
X_train, X_val, y3_train, y3_val = train_test_split(
    X,
    y_3,
    test_size=0.2,
    random_state=2049,
)

In [None]:
# Base gradient-boosting regressor, wrapped so one copy is fitted per target column.
r = GradientBoostingRegressor(
    random_state=2049,
    max_depth=3,
    n_estimators=100,
)
# fit() returns the fitted wrapper itself, so chaining yields the same object.
clf = MultiOutputRegressor(r, n_jobs=-1).fit(X_train, y3_train)
clf

In [None]:
# Predict on the train, validation and test features, in that order.
train_preds, val_preds, test_preds3 = (
    clf.predict(features) for features in (X_train, X_val, X_test)
)

In [None]:
# Validation R^2 per target column, each reported as 0.5 + 0.5 * R^2,
# followed by the same transform applied to the mean R^2.
rs = []
for col_id, y_true_col in enumerate(y3_val.T):
    r = r2_score(y_true_col, val_preds[:, col_id])
    rs.append(r)
    print(0.5+0.5*r)
print(f"Mean: {0.5 + 0.5*np.mean(rs)}")

## Saving results

In [None]:
# Output matrix: one row per test sample and as many columns as the label array.
n_test_rows, n_label_cols = X_test.shape[0], y.shape[1]
test_preds = np.zeros((n_test_rows, n_label_cols))

# Column 0 carries the test ids; every subtask fills its own label columns.
test_preds[:, 0] = test_idx.astype(int)
for target_cols, block_preds in (
    (predcols1, test_preds1),
    (predcols2, test_preds2),
    (predcols3, test_preds3),
):
    test_preds[:, target_cols] = block_preds

In [None]:
# Only the header of the labels file is needed (its column names are reused
# for the submission), so nrows=0 skips parsing the data rows entirely.
temp_df = pd.read_csv('data/train_labels.csv', delimiter=',', nrows=0)
colnames = temp_df.columns

In [None]:
# Build the submission frame, store 'pid' as integers, and write the other
# (float) columns with three decimals.
df = pd.DataFrame(test_preds, columns=colnames).astype({'pid': int})
df.to_csv('prediction.csv', index=False, float_format='%.3f')

In [None]:
# Compress the CSV into prediction.zip. The context manager closes the
# archive even if write() raises, so the file handle is never leaked.
with zipfile.ZipFile('prediction.zip', 'w', zipfile.ZIP_DEFLATED) as zipper:
    zipper.write('prediction.csv')