In [1]:
import sklearn
sklearn.__version__

'1.1.3'

In [2]:
import warnings
# Installs a blanket "ignore" filter: every warning category is suppressed from
# this point on in the kernel process.
# NOTE(review): this also hides warnings that may matter for this analysis
# (e.g. pandas or scikit-learn diagnostics) - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import StratifiedGroupKFold, GridSearchCV
from sklearn.model_selection import train_test_split

In [4]:
%%time

DATA_DIR = "../project_dataset"
TRAIN_DIR = f"{DATA_DIR}/partial_dataset_train"

# Number of leading rows to use from each CSV; any value <= 0 means "use all rows".
# first_n = -1
# first_n = 30_000
first_n = 250_000

# Let pandas stop parsing after `first_n` rows instead of reading the complete
# files and slicing them afterwards (nrows=None reads everything).
_nrows = first_n if first_n > 0 else None

X_ = pd.read_csv(f"{TRAIN_DIR}/features.csv", index_col=0, nrows=_nrows)
y_ = pd.read_csv(f"{TRAIN_DIR}/labels.csv", index_col=0, nrows=_nrows)
cl_df_ = pd.read_csv(f"{TRAIN_DIR}/complete_labels.csv", index_col=0, nrows=_nrows)

# The underscore-suffixed names are kept because a later cell deletes them.
X, y, cl_df = X_, y_, cl_df_

# Features, labels and complete labels are combined positionally further down,
# so a row-count mismatch must be caught here rather than silently misaligning.
assert len(X) == len(y) == len(cl_df), (
    f"row count mismatch: features={len(X)}, labels={len(y)}, complete_labels={len(cl_df)}"
)

CPU times: total: 1min 35s
Wall time: 1min 35s


In [5]:
# Remove a "group_id" column left over from a previous run of this cell.
# `DataFrame.drop` returns a new frame, so the result has to be assigned back.
cl_df = cl_df.drop(columns="group_id", errors="ignore")

# Rows that share the same boolean pattern across all columns get the same
# 1-based group id.
cl_df["group_id"] = cl_df.astype(bool).groupby(cl_df.columns.tolist(), sort=False).ngroup() + 1
min_ = cl_df["group_id"].min()
max_ = cl_df["group_id"].max()

# Rows whose "label" equals False are assigned a random group id instead.
# The draw is vectorised (one call instead of a per-row `apply`), seeded so the
# grouping is reproducible, and uses an inclusive upper bound so `max_` can be
# drawn and `min_ == max_` does not raise.
rng = np.random.default_rng(42)
random_ids = rng.integers(min_, max_, size=len(cl_df), endpoint=True)
keep_own_group = cl_df["label"] != False  # noqa: E712 - element-wise comparison is intended
group_ids = cl_df["group_id"].where(keep_own_group, random_ids)

In [6]:
# Switch from pandas objects to plain NumPy arrays for the estimators below.
# (Down-casting to float32 / uint8 / uint16 was considered but is not applied.)
label_encoder = LabelEncoder()

X = X.to_numpy()
y = label_encoder.fit_transform(np.squeeze(y.to_numpy()))
groups = group_ids.values

# Release the pandas frames that are no longer needed.
del X_
del y_
del cl_df
del cl_df_

## Undersampling

In [7]:
from imblearn.under_sampling import *
from collections import Counter

In [8]:
# undersample = EditedNearestNeighbours(sampling_strategy="majority",
#                                       n_neighbors=11,
#                                       kind_sel="all",
#                                       n_jobs=-1) # 7 < k < 11

# Under-sample only the majority class. `random_state` is pinned so that the
# resampled dataset (and everything trained on it) is reproducible across runs.
undersample = OneSidedSelection(
    sampling_strategy="majority",
    n_neighbors=3,  # 3 < k < 11
    n_seeds_S=100,
    random_state=42,
    n_jobs=-1,
)

In [9]:
%%time
# undersample = RepeatedEditedNearestNeighbours(sampling_strategy="majority", n_jobs=-1) #not good enough
# undersample = NeighbourhoodCleaningRule(sampling_strategy="majority", n_jobs=-1, threshold_cleaning=0.2)
# undersample = TomekLinks(sampling_strategy="majority", n_jobs=-1) # not much reduction

# undersample = NearMiss(sampling_strategy=1/35, n_jobs=-1) # ratio of 1/25 to 1/50 works well on valid set
# Class distribution before resampling.
counts_before = Counter(y)

X_res, y_res = undersample.fit_resample(X, y)

# Class distribution after resampling.
counts_after = Counter(y_res)

print('Original dataset shape %s' % counts_before)
print('Resampled dataset shape %s' % counts_after)

Original dataset shape Counter({0: 246238, 1: 3762})
Resampled dataset shape Counter({0: 239656, 1: 3762})
CPU times: total: 2h 22min 1s
Wall time: 8min 2s


In [10]:
# Keep the group ids aligned with the rows the undersampler retained.
groups = groups[undersample.sample_indices_]

# From here on, X / y refer to the resampled data.
X, y = X_res, y_res

del X_res
del y_res

## Getting groups for each protein in the dataset for KFold

## Define K fold

In [11]:
# n_splits = 5
# cv = StratifiedGroupKFold(n_splits=n_splits, shuffle=True)

## Building pipes for each ML algorithm
We'll be using grid search. The following code was just a trial of how to use pipelines in scikit-learn; it is kept here for reference only.

```python
# Trial: evaluate three pipelines (scaling -> PCA -> estimator) fold by fold.
# Number of principal components kept by every pipeline below.
n_components = 10

# NOTE(review): LinearRegression is a regressor, so `pipe.score` for this entry
# is presumably R^2 rather than classification accuracy - confirm.
pipe_lr = Pipeline(steps=[
        ("lr_scaler", StandardScaler()),
        ("lr_dim_reduce", PCA(n_components=n_components)),
        ("lr_clf", LinearRegression(n_jobs=-1))]
)

pipe_rf = Pipeline(steps=[
        ("rf_scaler", StandardScaler()),
        ("rf_dim_reduce", PCA(n_components=n_components)),
        ("rf_clf", RandomForestClassifier(n_jobs=-1))]
)

pipe_svm = Pipeline(steps=[
        ("svm_scaler", StandardScaler()),
        ("svm_dim_reduce", PCA(n_components=n_components)),
        ("svm_clf", SVC())]
)

# Display name -> pipeline; the names become the columns of the score table.
pipelines = {
    "Linear Regression": pipe_lr,
    "Random Forest": pipe_rf,
    "Support Vector Machine": pipe_svm,   
}

# One list of per-fold scores per pipeline.
scores = {key: [] for key in pipelines.keys()}

# NOTE(review): `cv` is not defined inside this snippet; presumably a group-aware
# splitter such as StratifiedGroupKFold is expected to exist already - confirm.
for train_id, test_id in cv.split(X, y, groups):

    X_train = X[train_id]
    y_train = y[train_id]
    X_test = X[test_id]
    y_test = y[test_id]

    for clf_name, pipe in pipelines.items():

        # Fit on the training fold, score on the held-out fold (rounded to 3 decimals).
        pipe.fit(X_train, y_train)
        s = pipe.score(X_test, y_test)
        scores[clf_name].append(round(s, 3))

    # Progress marker: one "#" per completed fold.
    print("#", end="")
print("\n")

# One row per fold, one column per pipeline.
pd.DataFrame(scores)
```

## Combining Cross validation and all the pipes in GridSearchCV
This takes time. A LOT OF TIME! So only include the algorithms and parameters that are actually relevant to our problem.

In [12]:
def get_metrics(y_true, y_pred, y_score=None):
    """Compute the evaluation metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted (hard) labels.
    y_score : array-like, optional
        Continuous scores for the positive class (e.g. the output of
        ``predict_proba(...)[:, 1]`` or ``decision_function``). When given, the
        ROC AUC is computed from these scores. When omitted, the ROC AUC falls
        back to ``y_pred`` as before, i.e. it is evaluated on hard labels only.

    Returns
    -------
    dict
        Keys ``"b_acc"``, ``"prec"``, ``"f1"`` and ``"roc"``, each rounded to
        two decimals.
    """
    # Imported lazily so the function is self-contained within this cell.
    from sklearn.metrics import balanced_accuracy_score, precision_score, roc_auc_score, f1_score

    # ROC AUC is a ranking metric: prefer real scores whenever the caller has them.
    roc_input = y_pred if y_score is None else y_score

    return {
        "b_acc": round(balanced_accuracy_score(y_true, y_pred), 2),
        "prec": round(precision_score(y_true, y_pred), 2),
        "f1": round(f1_score(y_true, y_pred), 2),
        "roc": round(roc_auc_score(y_true, roc_input), 2)
    }

In [13]:
    # # Logistic Regression
    # {
    #     "dim_reduce__n_components": np.arange(5, 16, 5),
    #     "clf": [LogisticRegression()],
    #     "clf__penalty": ["l2"],
    #     "clf__C": np.logspace(0, 4, 5),
    #     "clf__solver": ["newton-cg", "saga", "sag", "liblinear"]
    # },
    # # Random Forests
    # {
    #     "scaler": [RobustScaler()],
    #     "scaler__unit_variance": [True, False],
    #     "dim_reduce": [SelectPercentile()],
    #     "dim_reduce__percentile": np.arange(10, 51, 10),
    #     "clf": [RandomForestClassifier()],
    #     "clf__n_estimators": np.arange(100, 201, 50),
    #     "clf__criterion": ["gini", "entropy", "log_loss"],
    #     "clf__max_features": ["sqrt", "log2"],
    #     "clf__class_weight": ["balanced", "balanced_subsample"]
    # },
    # # Support Vector Machine
    # {
    #     "pca__n_components": [2, 20],
    #     "clf": [SVC()],
    #     "clf__C": np.logspace(0, 4, 3),
    #     "clf__kernel": ["poly"],
    #     "clf__degree": [3],
    #     "clf__class_weight": [None, "balanced"],
    # }
       # "clf__base_estimator": [#LogisticRegression(class_weight="balanced",
        #                                            #max_iter=1500, C=0.18, solver="saga")],
                                # SVC(class_weight="balanced", kernel="poly")],
        # "clf__n_estimators": [30],
        # "clf__learning_rate": [0.1, 1],
        # "clf__algorithm": ["SAMME", "SAMME.R"]

# Baseline pipeline: power transform -> 20-component PCA -> class-weighted
# logistic regression.
baseline_steps = [
    ("scaler", PowerTransformer()),
    ("dim_reduce", PCA(n_components=20)),
    (
        "clf",
        LogisticRegression(
            solver="saga",
            C=0.18,
            max_iter=2500,
            class_weight="balanced",
        ),
    ),
]
default_pipe = Pipeline(steps=baseline_steps)

# Single-candidate grid: the "clf" step is replaced by the estimator below.
# NOTE(review): this candidate uses max_iter=1500 while the pipeline's own
# classifier uses max_iter=2500 - confirm which value is intended.
logreg_candidate = LogisticRegression(
    solver="saga",
    C=0.18,
    max_iter=1500,
    class_weight="balanced",
)
param_grid = [{"clf": [logreg_candidate]}]



In [14]:
%%time
# True  -> evaluate `param_grid` with GridSearchCV.
# False -> evaluate `default_pipe` fold by fold with a manual loop.
run_gridcv = True
n_splits = 5
# Group-aware stratified folds; the seed keeps fold assignment reproducible.
cv = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)


if run_gridcv:
    # Hand the splitter itself to GridSearchCV and pass `groups` through `fit`,
    # rather than a one-shot `cv.split(...)` generator.
    grid_search = GridSearchCV(estimator=default_pipe,
                               param_grid=param_grid,
                               cv=cv,
                               scoring=["balanced_accuracy", "precision", "f1", "roc_auc"],
                               refit="roc_auc",
                               error_score="raise",
                               n_jobs=-1,
                               verbose=4,
                    )
    grid_clf = grid_search.fit(X, y, groups=groups)
else:
    from copy import deepcopy

    best_bacc = 0
    # Start below any attainable score so the first valid fold is always taken.
    best_roc = float("-inf")
    best_model = None
    # Running sums of the per-fold metrics; divided by n_splits when reported.
    mean = {"b_acc": 0, "prec": 0, "f1": 0, "roc": 0}

    for i, (train_id, test_id) in enumerate(cv.split(X, y, groups), start=1):

        X_train, X_test = X[train_id], X[test_id]
        y_train, y_test = y[train_id], y[test_id]

        # Fit an independent copy so folds never share fitted state.
        model = deepcopy(default_pipe)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        fold_metrics = get_metrics(y_test, y_pred)

        # One line per fold: fold number followed by tab-separated metrics.
        print(i, end="")
        for k, v in fold_metrics.items():
            mean[k] += v
            print(f"\t{k}: {round(v, 3)}", end="")
        print()

        # Remember the model from the fold with the highest ROC score.
        if best_roc < fold_metrics["roc"]:
            best_bacc = fold_metrics["b_acc"]
            best_roc = fold_metrics["roc"]
            best_model = model

    print("mean", end="")
    for k, v in mean.items():
        print(f"\t{k}: {round(v/n_splits, 3)}", end="")
    print()

    if best_model is None:
        raise RuntimeError("no cross-validation fold produced a usable ROC score")
    # Refit the best model on the entire dataset.
    best_model.fit(X, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
CPU times: total: 4min 27s
Wall time: 9min 9s


In [15]:
# Display the fitted pipeline from whichever evaluation branch ran;
# `grid_clf` only exists when the grid search was executed.
grid_clf.best_estimator_ if run_gridcv else best_model

In [16]:
if run_gridcv:
    # Grid-search results, ordered by balanced-accuracy rank.
    cv_results_df = pd.DataFrame(grid_clf.cv_results_)
    cv_summary = cv_results_df[
        ["rank_test_balanced_accuracy", "mean_test_balanced_accuracy", "mean_test_f1", "mean_test_roc_auc"]
    ].sort_values(["rank_test_balanced_accuracy"])
else:
    # Manual branch: there is no cv_results_ table, so show the fold averages
    # accumulated in `mean` instead.
    cv_summary = pd.DataFrame({k: [v / n_splits] for k, v in mean.items()})

cv_summary

Unnamed: 0,rank_test_balanced_accuracy,mean_test_balanced_accuracy,mean_test_f1,mean_test_roc_auc
0,1,0.687942,0.069266,0.744352


In [17]:
import pickle
from pathlib import Path

if run_gridcv:
    # Report the scores of the candidate that was actually refit and saved,
    # not the maxima over all candidates (which may belong to different ones).
    best_idx = grid_clf.best_index_
    acc = round(grid_clf.cv_results_["mean_test_balanced_accuracy"][best_idx], 2)
    roc = round(grid_clf.cv_results_["mean_test_roc_auc"][best_idx], 2)
    model_to_save = grid_clf.best_estimator_
    usample_name = type(undersample).__name__
    # The classifier is the final step of the pipeline.
    algo_name = type(model_to_save.steps[-1][1]).__name__
    filename = f"../models/{usample_name}_best_model_{first_n}_{algo_name}_{acc}_{roc}.pkl"
else:
    acc = round(mean["b_acc"]/n_splits, 2)
    roc = round(mean["roc"]/n_splits, 2)
    model_to_save = best_model
    algo_name = type(model_to_save.steps[-1][1]).__name__
    filename = f"../models/best_model_{first_n}_{algo_name}_{acc}_{roc}.pkl"

# Make sure the target directory exists so a long training run is not lost to
# a missing folder, and close the file handle deterministically.
Path(filename).parent.mkdir(parents=True, exist_ok=True)
with open(filename, 'wb') as model_file:
    pickle.dump(model_to_save, model_file)