In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import all_estimators

from src.cross_validation import compare_models
from src.data import preprocess_annotations

## Data loading

In [None]:
# Read the raw annotation tables first, then pass each one through the
# project's `preprocess_annotations` helper.
raw_train_annotations = pd.read_csv("data/trainset/trainset_true.csv")
raw_test_annotations = pd.read_csv("data/testset/testset_data.csv")

train_annotations = preprocess_annotations(raw_train_annotations)
test_annotations = preprocess_annotations(raw_test_annotations)

In [None]:
train_class = pd.read_csv("data/train_class.csv")
test_class = pd.read_csv("data/test_class.csv")


def _predictions_by_id(class_df):
    """Collect the ``prediction`` values of ``class_df`` into one list per ``id``.

    A single grouped aggregation replaces a row-by-row ``iterrows`` loop: it is
    faster, and it keeps the dtype of the ``prediction`` column (``iterrows``
    builds one Series per row and may upcast integers to floats, which would
    make the values unusable as array indices later on).

    Parameters
    ----------
    class_df : pandas.DataFrame
        Table with at least an ``id`` and a ``prediction`` column.

    Returns
    -------
    dict
        Maps each ``id`` to the list of its predictions, in file order.
    """
    # sort=False keeps ids in order of first appearance, like the dict that
    # an insertion loop would build.
    return class_df.groupby("id", sort=False)["prediction"].agg(list).to_dict()


train_dict = _predictions_by_id(train_class)
test_dict = _predictions_by_id(test_class)

# IDs without any prediction row are mapped to NaN by `Series.map`.
train_annotations["CLASS"] = train_annotations["ID"].map(train_dict)
test_annotations["CLASS"] = test_annotations["ID"].map(test_dict)

In [None]:
# Names of the optional statistics appended after the per-class proportions.
# `class_list_to_feature` recognises "entropy", "variety", "first_to_second"
# and "total".
FEATURES = ["entropy", "variety"]

# Length of the per-class part of the feature vector: largest class index seen
# in the training predictions, plus one.
# NOTE(review): test predictions are not consulted here - confirm they never
# contain a larger class index.
NB_CLASS = int(train_class["prediction"].max() + 1)


def class_list_to_feature(class_list, add_features, nb_class=None):
    """Turn a list of predicted class indices into a fixed-length feature vector.

    The first ``nb_class`` entries are the proportion of items assigned to each
    class. The optional statistics requested in ``add_features`` are appended
    afterwards, always in the order listed here (the order inside
    ``add_features`` does not matter):

    * ``"entropy"``: ``-sum(p * log(p + 1e-3))`` over the class proportions.
    * ``"variety"``: number of distinct classes present.
    * ``"first_to_second"``: count of the most frequent class divided by the
      count of the second most frequent class. Two classes tied for first
      place give 1.0; when no second class is present the divisor is 1, so
      the value stays finite.
    * ``"total"``: number of items in ``class_list``.

    Parameters
    ----------
    class_list : sequence of int
        Class index of every item. May be empty, in which case all
        proportions are 0.
    add_features : container of str
        Names of the optional statistics to append.
    nb_class : int, optional
        Number of classes, i.e. length of the proportion part. Defaults to the
        module-level ``NB_CLASS``.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``nb_class`` plus the number of requested
        statistics.

    Raises
    ------
    IndexError
        If a class index lies outside ``[0, nb_class)``.
    """
    if nb_class is None:
        nb_class = NB_CLASS

    labels = np.asarray(class_list, dtype=int).reshape(-1)
    if labels.size and (labels.min() < 0 or labels.max() >= nb_class):
        raise IndexError(
            f"class indices must lie in [0, {nb_class}), "
            f"got values between {labels.min()} and {labels.max()}"
        )

    # One bin per class, including classes that never occur.
    counts = np.bincount(labels, minlength=nb_class)
    total = labels.size
    # max(total, 1) keeps an empty list from producing 0/0 = NaN proportions.
    proportions = counts / max(total, 1)

    extras = []
    if "entropy" in add_features:
        # The 1e-3 offset keeps log() finite for classes with zero proportion.
        extras.append(-np.sum(proportions * np.log(proportions + 1e-3)))
    if "variety" in add_features:
        extras.append(np.count_nonzero(counts))
    if "first_to_second" in add_features:
        ranked = np.sort(counts)[::-1]
        first = ranked[0] if ranked.size > 0 else 0
        second = ranked[1] if ranked.size > 1 else 0
        # Divisor floor of 1: no division by zero when only one class occurs.
        extras.append(first / max(second, 1))
    if "total" in add_features:
        extras.append(total)

    return np.concatenate([proportions, extras])


def _expand_class_features(annotations, add_features):
    """Replace the ``CLASS`` column by one numeric column per feature.

    Each entry of ``annotations["CLASS"]`` is converted with
    ``class_list_to_feature`` and the resulting vectors are appended as new
    columns (named 0, 1, 2, ...) after the remaining original columns.

    Parameters
    ----------
    annotations : pandas.DataFrame
        Table with a ``CLASS`` column holding one list of class indices per row.
    add_features : container of str
        Optional statistics forwarded to ``class_list_to_feature``.

    Returns
    -------
    pandas.DataFrame
        New frame without ``CLASS`` and with the feature columns appended.
    """
    class_features = pd.DataFrame(
        [
            class_list_to_feature(class_list, add_features)
            for class_list in annotations["CLASS"]
        ],
        # Reuse the caller's index: `pd.concat(axis=1)` aligns on index labels,
        # so a fresh 0..n-1 index would misalign rows (and pad with NaN)
        # whenever `annotations` is not indexed 0..n-1.
        index=annotations.index,
    )
    return pd.concat([annotations.drop(columns="CLASS"), class_features], axis=1)


train_annotations = _expand_class_features(train_annotations, FEATURES)
test_annotations = _expand_class_features(test_annotations, FEATURES)

In [None]:
# "ID" and "LABEL" are removed from both tables; every remaining column is
# used as a model input.
non_feature_columns = ["ID", "LABEL"]

x_train = train_annotations.drop(columns=non_feature_columns).to_numpy()
x_test = test_annotations.drop(columns=non_feature_columns).to_numpy()
y_train = train_annotations["LABEL"].to_numpy().astype(int)

print(f"Training samples {len(x_train)}, Test samples {len(x_test)}")

In [None]:
def add_outliers_feature(x, ref=None, percentile=90):
    """Append a column counting how many features of each row are unusually high.

    A value counts as high when it is strictly greater than the given
    percentile of the same column in ``ref``.

    Parameters
    ----------
    x : numpy.ndarray
        2-D feature matrix (rows are samples, columns are features).
    ref : numpy.ndarray, optional
        2-D matrix the per-column thresholds are computed from. Defaults to
        ``x`` itself. Only its first ``x.shape[1]`` columns are used, so a
        matrix that already carries extra trailing columns is accepted.
    percentile : float, optional
        Percentile (0-100) used as the threshold. Defaults to 90.

    Returns
    -------
    numpy.ndarray
        ``x`` with one extra trailing column holding the per-row count.

    Raises
    ------
    IndexError
        If ``ref`` has fewer columns than ``x``.
    """
    if ref is None:
        ref = x

    nb_features = x.shape[1]
    if ref.shape[1] < nb_features:
        raise IndexError(
            f"ref has {ref.shape[1]} columns but x has {nb_features}"
        )

    # One threshold per feature column, computed in a single vectorised call.
    thresholds = np.percentile(ref[:, :nb_features], percentile, axis=0)
    # dtype=float keeps the appended column (and thus the result) floating
    # point even when `x` holds integers.
    outlier_counts = (x > thresholds).sum(axis=1, dtype=float)

    return np.concatenate([x, outlier_counts.reshape(-1, 1)], axis=1)


# Thresholds for both splits come from the training features as they are
# before the outlier column is appended.
x_train_reference = x_train
x_train = add_outliers_feature(x_train_reference)
x_test = add_outliers_feature(x_test, ref=x_train_reference)

## Visualisation

In [None]:
# Display name of every column of `x_train`, in column order.
# NOTE(review): the first three names assume the annotation columns left after
# dropping ID/LABEL are gender, lymph count and age, in that order - confirm.
features = (
    ["gender", "lymph_count", "age"]
    + [f"class {i+1}" for i in range(NB_CLASS)]
    + FEATURES
    + ["outliers"]
)

nb_features = len(features)
nb_cols = 4
# Ceiling division: exactly enough rows, without a spare empty row when
# nb_features is a multiple of nb_cols.
nb_rows = (nb_features + nb_cols - 1) // nb_cols

# squeeze=False keeps `axs` two-dimensional even when there is a single row.
figs, axs = plt.subplots(
    nb_rows, nb_cols, figsize=(20, 4 * nb_rows), squeeze=False
)
flat_axs = axs.ravel()

# Split the samples by label once instead of once per feature.
x_label_0 = x_train[y_train == 0]
x_label_1 = x_train[y_train == 1]

for i, col in enumerate(features):
    ax = flat_axs[i]
    ax.boxplot(
        [x_label_0[:, i], x_label_1[:, i]],
        meanline=True,
        showmeans=True,
    )
    # Tick labels are set on the axes rather than through the `labels`
    # keyword of boxplot, which recent Matplotlib releases deprecate.
    ax.set_xticklabels(["0", "1"])
    ax.set_title(col)

# Hide the panels of the last row that have no feature to show.
for ax in flat_axs[nb_features:]:
    ax.set_visible(False)

plt.show()

## Model selection

In [None]:
# The scaler is fitted on the training features only; the same fitted
# transformation is then applied to both splits.
features_scaler = MinMaxScaler().fit(x_train)
x_train = features_scaler.transform(x_train)
x_test = features_scaler.transform(x_test)

In [None]:
# Classifiers excluded by hand.
manually_filtered_models = ["CategoricalNB", "ComplementNB", "MultinomialNB"]

# Complete exclusion list: the entries below plus the hand-picked ones above.
filtered_models = [
    "ClassifierChain",
    "MultiOutputClassifier",
    "OneVsOneClassifier",
    "OneVsRestClassifier",
    "OutputCodeClassifier",
    "StackingClassifier",
    "VotingClassifier",
] + manually_filtered_models

# One default-constructed instance of every remaining scikit-learn classifier,
# stored as (instance, name) pairs.
all_models = [
    (model_class(), model_name)
    for model_name, model_class in all_estimators(type_filter="classifier")
    if model_name not in filtered_models
]

In [None]:
# Evaluate every candidate classifier with the project's `compare_models`
# helper on the scaled training data.
# NOTE(review): presumably `k_fold=5` requests 5-fold cross-validation and the
# two returned values are the per-model scores and their labels - confirm in
# src.cross_validation.
default_models_scores, default_labels = compare_models(
    all_models, x_train, y_train, k_fold=5
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from src.config import CONFIG
from sklearn.naive_bayes import GaussianNB


# Stratified 80/20 split of the training data: part "a" is used for fitting,
# part "b" is held out for validation.
x_a, x_b, y_a, y_b = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=CONFIG.SEED,
)

model = GaussianNB().fit(x_a, y_a)

# Report balanced accuracy on the fitting part and on the held-out part.
for score_label, x_part, y_part in (
    ("train score", x_a, y_a),
    ("val score", x_b, y_b),
):
    print(score_label, balanced_accuracy_score(y_part, model.predict(x_part)))

In [None]:
from src.utils import save

# Refit on the complete training set before predicting the test set.
model = GaussianNB().fit(x_train, y_train)

# `score` is the estimator's default metric evaluated on the training data.
train_score = model.score(x_train, y_train)
print(train_score)

# Predict the test set and hand predictions plus their IDs to the project's
# `save` helper.
test_ids = test_annotations["ID"].to_list()
y_pred = model.predict(x_test)

save(y_pred, test_ids)