## Model Creation on Chunks

In [3]:
# Row counts for the full training set.
# NOTE(review): presumably one_len / zero_len are the numbers of rows with
# y == 1 / y == 0 -- confirm against the preprocessing step.
one_len = 1589906
zero_len = 293656924
train_len = one_len + zero_len  # 295246830

# Integer code assigned to each protein name.
protein_map = {
    'BRD4': 1,
    'HSA': 2,
    'sEH': 3,
}

# Per-character totals.
# NOTE(review): presumably occurrence counts of each character over the
# training strings -- confirm where these numbers were produced.
vocab = {
    'C': 6825082866,
    '#': 81527490,
    '@': 511451694,
    'H': 456489972,
    '=': 1406606874,
    'O': 2554179786,
    'N': 2469595230,
    'c': 12257477022,
    '-': 438483636,
    '.': 216945504,
    'l': 491088828,
    'B': 123330132,
    'r': 121915914,
    'n': 1997759694,
    'D': 295246830,
    'y': 295246830,
    'o': 67918650,
    's': 156618468,
    'S': 90662574,
    'F': 492710238,
    '+': 65206260,
    'i': 1414026,
    '/': 11547096,
    'I': 23972994,
}

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

In [4]:
# Only a single part file (part-00000) of the zero_features.parquet directory
# is read here, not the whole directory.
ZERO_FEATURES_PART = (
    'zero_features.parquet/'
    'part-00000-9b293261-abd9-4e5b-8250-966e1884fdf4-c000.snappy.parquet'
)
df00 = pd.read_parquet(ZERO_FEATURES_PART)
df00

Unnamed: 0,id,protein,a1,a2,a3,a4,a5,a6,a7,a8,...,d16,d17,d18,d19,d20,d21,d22,d23,d24,y
0,77492497,2,9,3,2,11,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,285394928,3,7,2,9,15,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,204662315,3,4,4,6,11,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,95945486,3,8,9,4,16,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,190725420,1,4,8,6,15,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595958,200187295,2,5,2,5,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1595959,257742281,3,7,3,2,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1595960,181920989,3,4,0,8,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1595961,258866218,2,7,3,2,7,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Feature table read from one_features.parquet (the rows displayed below all
# have y == 1).
ONE_FEATURES_PATH = 'one_features.parquet'
df1 = pd.read_parquet(ONE_FEATURES_PATH)
df1

Unnamed: 0,id,protein,a1,a2,a3,a4,a5,a6,a7,a8,...,d16,d17,d18,d19,d20,d21,d22,d23,d24,y
0,86000086,2,10,1,2,10,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,86000579,3,10,1,13,20,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,86000585,3,10,1,12,19,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,86002603,2,10,0,3,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,86002715,3,10,0,13,19,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1589901,154997740,2,5,1,1,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1589902,154997764,2,5,1,1,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1589903,154997782,2,5,1,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1589904,154997809,2,5,1,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
# Stack the two feature tables row-wise into a single array.
full_data = np.concatenate([df00.to_numpy(), df1.to_numpy()])

# Shuffle the rows in place with a dedicated, seeded generator. The global
# `np.random.shuffle` is unseeded, which made the row order -- and therefore
# the train/validation split and every metric derived from it -- differ
# between runs even though the split itself uses a fixed random_state.
np.random.default_rng(42).shuffle(full_data)

In [7]:
# Drop the first column and split off the last column as the label vector.
features_raw = full_data[:, 1:-1]
y = full_data[:, -1]

# One-hot encode the first remaining column (the protein code) and put the
# encoded columns in front of the untouched remaining feature columns.
protein_ohe = OneHotEncoder(sparse_output=False, dtype=np.float32)
onehot_protein = protein_ohe.fit_transform(features_raw[:, :1])
X = np.hstack((onehot_protein, features_raw[:, 1:]))

X.shape, y.shape

((3185869, 99), (3185869,))

In [8]:
# Hold out 30% of the rows for validation; stratifying on y keeps the label
# ratio the same in both parts, and the fixed seed makes the split repeatable
# for a given row order.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

tuple(part.shape for part in (X_train, X_val, y_train, y_val))

((2230108, 99), (955761, 99), (2230108,), (955761,))

//////////////////////////////////////////////////////////////////////////////////////////////

## Logistic Regression

In [None]:
# Plain logistic regression with a raised iteration cap and a fixed seed.
model = LogisticRegression(
    random_state=42,
    max_iter=1000,
    n_jobs=10,
)
model.fit(X_train, y_train)

In [None]:
# Column 1 of predict_proba is the probability of the second class in
# model.classes_ (label 1 for the 0/1 labels used here).
y_train_prob, y_val_prob = (
    model.predict_proba(part)[:, 1] for part in (X_train, X_val)
)

# Hard labels at the estimator's built-in decision rule.
y_train_pred, y_val_pred = (model.predict(part) for part in (X_train, X_val))

In [None]:
def find_best_threshold(pred_prob, y_true, search_space=np.linspace(0, 1, 100)):
    """Return the threshold whose hard labels score the highest average precision.

    For every candidate threshold, probabilities strictly greater than the
    threshold are labelled 1 and the rest 0; those labels are scored against
    ``y_true`` with ``average_precision_score``. The best score and its
    threshold are printed.

    Parameters
    ----------
    pred_prob : array-like of float
        Predicted probabilities, one per sample.
    y_true : array-like
        True binary labels, aligned with ``pred_prob``.
    search_space : iterable of float, optional
        Candidate thresholds, tried in order. Defaults to 100 evenly spaced
        values in [0, 1].

    Returns
    -------
    float
        The first threshold that achieves the highest score, or 0 if no
        candidate scores above 0.

    Notes
    -----
    NOTE(review): average precision is applied to thresholded labels here,
    not to the raw probabilities, so the "mAP" printed by this function is a
    threshold-dependent quantity.
    """
    # Convert once so every threshold is applied with a vectorised comparison
    # instead of a per-sample Python loop.
    probs = np.asarray(pred_prob)
    best_score = 0
    best_th = 0

    for th in search_space:
        hard_labels = (probs > th).astype(int)
        score = average_precision_score(y_true, hard_labels)
        # Strict comparison: on ties the earliest threshold is kept.
        if score > best_score:
            best_score = score
            best_th = th

    # The score is a fraction in [0, 1], so it is printed without a "%" sign.
    print(f"Best mAP: {best_score}, Threshold: {best_th}")
    return best_th


def _print_report(y_train, y_val, y_train_pred, y_val_pred,
                  train_map, val_map, train_auc, val_auc):
    """Print mAP, AUC and the per-class classification reports for both splits."""
    print("Train mAP:", train_map)
    print("Validation mAP:", val_map)
    print("Train AUC:", train_auc)
    print("Validation AUC:", val_auc)
    print("Train Classification Report:\n", classification_report(y_train, y_train_pred))
    print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))


def evaluate(y_train, y_val, y_train_prob, y_val_prob, y_train_pred, y_val_pred):
    """Print train/validation metrics before and after threshold tuning.

    Two reports are printed, separated by dashed lines and by the output of
    ``find_best_threshold``:

    1. metrics for the hard predictions that were passed in;
    2. metrics for hard predictions obtained by thresholding the
       probabilities at the value ``find_best_threshold`` picks on the
       validation split.

    ROC AUC and average precision are computed from the probabilities, not
    from 0/1 predictions: both are ranking metrics, and feeding them hard
    labels collapses the ranking to two levels and understates the score.
    They do not depend on the threshold, so the same values appear in both
    reports; only the classification reports change.

    Parameters
    ----------
    y_train, y_val : array-like
        True binary labels of the training / validation split.
    y_train_prob, y_val_prob : array-like of float
        Predicted probabilities, aligned with the labels.
    y_train_pred, y_val_pred : array-like
        Hard predictions used for the first classification reports.

    Returns
    -------
    None
    """
    # Threshold-independent ranking metrics, computed once from the scores.
    train_auc = roc_auc_score(y_train, y_train_prob)
    val_auc = roc_auc_score(y_val, y_val_prob)
    train_map = average_precision_score(y_train, y_train_prob)
    val_map = average_precision_score(y_val, y_val_prob)

    # Report for the predictions supplied by the caller.
    _print_report(y_train, y_val, y_train_pred, y_val_pred,
                  train_map, val_map, train_auc, val_auc)
    print('-'*50)

    # Threshold Finding (tuned on the validation split only)
    best_th = find_best_threshold(y_val_prob, y_val)
    print('-'*50)

    # Re-label both splits at the tuned threshold, vectorised.
    y_train_pred = (np.asarray(y_train_prob) > best_th).astype(int)
    y_val_pred = (np.asarray(y_val_prob) > best_th).astype(int)

    _print_report(y_train, y_val, y_train_pred, y_val_pred,
                  train_map, val_map, train_auc, val_auc)

In [None]:
# Report metrics for the plain logistic-regression model.
evaluate(
    y_train=y_train,
    y_val=y_val,
    y_train_prob=y_train_prob,
    y_val_prob=y_val_prob,
    y_train_pred=y_train_pred,
    y_val_pred=y_val_pred,
)

## Logistic Regression CV

In [None]:
# Cross-validated logistic regression: 10 folds, model selection by average
# precision, all available cores.
# NOTE(review): max_iter is left at its default here, whereas the plain
# LogisticRegression earlier in this notebook sets max_iter=1000 -- confirm
# this difference is intentional.
model = LogisticRegressionCV(
    cv=10,
    scoring='average_precision',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

In [None]:
# (train score, validation score) of the fitted CV model.
tuple(
    model.score(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [None]:
# Full class-probability matrices first, then keep column 1 (the second
# class in model.classes_, i.e. label 1 for the 0/1 labels used here).
train_proba = model.predict_proba(X_train)
y_train_prob = train_proba[:, 1]
val_proba = model.predict_proba(X_val)
y_val_prob = val_proba[:, 1]

# Hard labels at the estimator's built-in decision rule.
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)

In [None]:
# Report metrics for the cross-validated model.
evaluate(
    y_train=y_train,
    y_val=y_val,
    y_train_prob=y_train_prob,
    y_val_prob=y_val_prob,
    y_train_pred=y_train_pred,
    y_val_pred=y_val_pred,
)

# Making Test Inference

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Feature table for the rows to be predicted.
TEST_FEATURES_PATH = 'test_features.parquet'
test_df = pd.read_parquet(TEST_FEATURES_PATH)
test_df

In [None]:
# Same column slice as used for training: drop the first and the last column.
# NOTE(review): `1:-1` assumes test_features.parquet ends with a non-feature
# column, like the trailing `y` of the training tables -- confirm. If it does
# not, the last feature is being dropped and the width check below will fail.
test_raw = test_df.iloc[:, 1:-1].to_numpy()

# The model was fitted on [one-hot protein columns | remaining features], so
# the protein column has to go through the already-fitted encoder (transform,
# not fit_transform) before prediction. Without this step the test matrix
# does not have the columns the model was trained on.
X_test = np.concatenate(
    [protein_ohe.transform(test_raw[:, 0:1]), test_raw[:, 1:]],
    axis=1,
)

# Fail early with a readable message instead of inside predict_proba.
assert X_test.shape[1] == model.n_features_in_, (
    f"X_test has {X_test.shape[1]} columns, model expects {model.n_features_in_}"
)
X_test

In [None]:
# Keep column 1 of the probability matrix (second class in model.classes_).
test_proba = model.predict_proba(X_test)
test_prob = test_proba[:, 1]

# Hard labels at the estimator's built-in decision rule.
test_pred = model.predict(X_test)

In [None]:
# Display the predicted probabilities taken from column 1 of predict_proba.
test_prob

In [None]:
# Submission template; pandas reads the zipped CSV directly.
SAMPLE_SUBMISSION_PATH = 'sample_submission.csv.zip'
sub_df = pd.read_csv(SAMPLE_SUBMISSION_PATH)
sub_df

In [None]:
# Use item assignment rather than `sub_df.binds = ...`: attribute assignment
# only updates a column that already exists and otherwise just sets a Python
# attribute on the frame without creating the column, which would leave the
# submission without predictions.
sub_df["binds"] = test_prob
sub_df

In [None]:
import subprocess, os

# Where the submission CSV is written, and the message attached to the upload.
file_name = "submission_csv/_1_submission_lr.csv"
message = "LR"
os.makedirs("submission_csv", exist_ok=True)

# Write the submission, then read it back to show exactly what is on disk.
sub_df.to_csv(file_name, index=False)
display(pd.read_csv(file_name))

# Argument list for the kaggle CLI; passed as a list, so no shell is involved.
command = ["kaggle", "competitions", "submit"]
command += ["-c", "leash-BELKA"]
command += ["-f", file_name]
command += ["-m", message]

# The returned CompletedProcess (including its return code) is the cell output.
subprocess.run(command)