## Model Creation on Chunks

In [None]:
# Dataset size constants. The names suggest per-label row counts (label 1 /
# label 0) -- TODO confirm; numerically the total is exactly their sum.
one_len = 1_589_906
zero_len = 293_656_924
train_len = one_len + zero_len

# Integer code used for each protein name.
protein_map = dict(BRD4=1, HSA=2, sEH=3)

# Per-character totals -- presumably character frequencies over the training
# SMILES strings; TODO confirm where these numbers were produced.
vocab = {
    'C': 6825082866,
    '#': 81527490,
    '@': 511451694,
    'H': 456489972,
    '=': 1406606874,
    'O': 2554179786,
    'N': 2469595230,
    'c': 12257477022,
    '-': 438483636,
    '.': 216945504,
    'l': 491088828,
    'B': 123330132,
    'r': 121915914,
    'n': 1997759694,
    'D': 295246830,
    'y': 295246830,
    'o': 67918650,
    's': 156618468,
    'S': 90662574,
    'F': 492710238,
    '+': 65206260,
    'i': 1414026,
    '/': 11547096,
    'I': 23972994,
}

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.sql.types import LongType, IntegerType, StructType, StructField

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from xgboost.spark import SparkXGBClassifier

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

In [None]:
# for 256 Gb and 64 Cores
# NOTE(review): with master("local[64]") Spark runs in a single JVM, so the
# spark.executor.* settings below are believed to have no effect and
# spark.driver.memory is the heap that actually matters -- confirm before
# relying on the executor values.
spark = (
    SparkSession
    .builder
    .appName("leash belka3")
    .config("spark.driver.memory", "48g")  # Increased driver memory
    .config("spark.executor.memory", "48g")  # Increased executor memory
    .config("spark.executor.instances", "16")  # 16 executors
    .config("spark.executor.cores", "4")  # 4 cores per executor
    .config("spark.driver.maxResultSize", "4g")  # Driver result size limit
    .config("spark.local.dir", "temp")  # Specify a directory with enough space
    .config("spark.shuffle.file.buffer", "128k")  # Shuffle buffer size
    .config("spark.memory.fraction", "0.8")  # Memory fraction for tasks
    # "spark.shuffle.memoryFraction" was dropped: it belongs to the legacy
    # (pre-unified) memory manager and is superseded by spark.memory.fraction.
    # "spark.executor.javaOptions" was dropped: it is not a Spark property. The
    # real key is spark.executor.extraJavaOptions, which must not carry -Xmx;
    # heap size is controlled by the *.memory settings above.
    .master("local[64]")  # Use all 64 cores on the machine
    .getOrCreate()
)

spark

In [None]:
# Load the two pre-computed feature tables (one parquet dataset per label file).
df0_features = spark.read.format('parquet').load('zero_features.parquet')
df1_features = spark.read.format('parquet').load('one_features.parquet')

# union() pairs columns by position, so both files must share the same column
# order -- NOTE(review): confirm, or switch to unionByName().
# The shuffle is seeded so that re-running the notebook yields the same row
# order. Be aware that orderBy(rand) is a global sort of the whole dataset and
# is re-executed by every action that is not served from a cache.
full_df = df0_features.union(df1_features).orderBy(F.rand(seed=42))

# print(df0_features.rdd.getNumPartitions())
# print(full_df.count())
# df0_features.printSchema()

In [None]:
# Peek at the first two rows of the shuffled frame (this triggers a Spark job).
full_df.limit(2).collect()

In [None]:
# Small random sample (fraction 1e-5 of the rows) used for prototyping.
# seed  -> the same rows are drawn on every run.
# cache -> sample_df is consumed by several later actions (encoder fit,
#          transform, cross-validated training); without caching each of them
#          would re-read and re-shuffle the full dataset.
sample_df = full_df.sample(fraction=0.00001, seed=42).cache()
# print(sample_df.count())

In [None]:
# Imported under an alias so that it does not shadow sklearn's OneHotEncoder,
# which is imported under the same name at the top of the notebook.
from pyspark.ml.feature import OneHotEncoder as SparkOneHotEncoder

# Keep the unfitted estimator and the fitted model under separate names;
# `protein_ohe` is the fitted model that later cells call .transform() on.
protein_ohe_estimator = SparkOneHotEncoder(inputCol="protein", outputCol="protein_onehot")
protein_ohe = protein_ohe_estimator.fit(sample_df)

In [None]:
# Apply the fitted encoder to the sample; this appends the `protein_onehot`
# vector column produced from the `protein` column.
encoded_df = protein_ohe.transform(sample_df)

In [None]:
# Feature list, selected purely by position: the last column first, followed by
# every column from the third one up to (but excluding) the final two.
# NOTE(review): presumably this skips id/raw columns and the label -- confirm
# against encoded_df.printSchema().
all_cols = encoded_df.columns
features_cols = [*all_cols[-1:], *all_cols[2:-2]]
print(features_cols)

In [None]:
# Packs the selected feature columns into a single vector column named 'features'.
vectorAssembler = VectorAssembler(inputCols=features_cols, outputCol='features')

In [None]:
# Distributed XGBoost classifier with label column 'y'; the number of XGBoost
# workers is taken from the Spark context's default parallelism.
# NOTE(review): with master local[64] that is presumably 64 workers, which may
# be more than the small prototyping sample can usefully feed -- confirm.
model = SparkXGBClassifier(num_workers=spark.sparkContext.defaultParallelism, label_col='y')

In [None]:
# Hyper-parameter grid: 2 tree depths x 2 ensemble sizes = 4 candidates.
paramGrid = (
    ParamGridBuilder()
    .addGrid(model.max_depth, [3, 6])
    .addGrid(model.n_estimators, [100, 1000])
    .build()
)

# BinaryClassificationEvaluator only accepts "areaUnderROC" or "areaUnderPR";
# "weightedPrecision" belongs to MulticlassClassificationEvaluator and is
# rejected when the evaluator runs. Area under the PR curve is used here to
# match the average-precision metric used elsewhere in this notebook.
# The evaluator also needs continuous scores, so it reads the classifier's raw
# prediction column rather than the hard 0/1 prediction column.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderPR",
    labelCol=model.getLabelCol(),
    rawPredictionCol=model.getRawPredictionCol(),
)

# Seeded so the fold assignment is the same on every run.
cv = CrossValidator(
    estimator=model,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    seed=42,
)

In [None]:
from pyspark.ml import Pipeline

# Two stages, run in order: assemble the feature vector, then cross-validate
# the classifier over the parameter grid.
pipeline_stages = [vectorAssembler, cv]
pipe = Pipeline(stages=pipeline_stages)

In [None]:
# Fit the pipeline on the encoded sample: builds the feature vector and then
# runs the cross-validated search over the parameter grid.
pipemodel = pipe.fit(encoded_df)

//////////////////////////////////////////////////////////////////////////////////////////////

## Logistic Regression

In [None]:
# NOTE(review): X_train / y_train (and X_val / y_val used below) are not
# created by any cell visible in this notebook -- confirm where they come from,
# otherwise this section fails on a fresh kernel.
model = LogisticRegression(
    random_state=42,
    max_iter=1000,
    n_jobs=4,
).fit(X_train, y_train)
model

In [None]:
# Column 1 of predict_proba (the second class in model.classes_) for each split.
y_train_prob, y_val_prob = (model.predict_proba(X)[:, 1] for X in (X_train, X_val))

# Hard labels from the estimator's own predict().
y_train_pred, y_val_pred = (model.predict(X) for X in (X_train, X_val))

In [None]:
def find_best_threshold(pred_prob, y_true, search_space=np.linspace(0, 1, 100)):
    """Return the cut-off whose hard labels score the highest average precision.

    Each candidate threshold ``th`` binarises ``pred_prob`` with the strict rule
    ``prob > th``; the resulting 0/1 labels are scored against ``y_true`` with
    ``average_precision_score``. The first threshold that reaches the highest
    score is kept (ties never replace an earlier winner).

    Parameters
    ----------
    pred_prob : array-like of float
        Predicted scores/probabilities, one per sample.
    y_true : array-like
        Ground-truth labels aligned with ``pred_prob``.
    search_space : iterable of float, optional
        Candidate thresholds; defaults to 100 evenly spaced values in [0, 1].

    Returns
    -------
    The winning threshold, or 0 when no candidate scores above 0.
    """
    # Convert once so each threshold is applied with a vectorised comparison
    # instead of a per-element Python loop.
    probs = np.asarray(pred_prob)
    best_score = 0
    best_th = 0

    for th in search_space:
        labels = (probs > th).astype(int)
        score = average_precision_score(y_true, labels)
        if score > best_score:
            best_score = score
            best_th = th

    # The score is a fraction in [0, 1], not a percentage, so no "%" suffix.
    print(f"Best mAP: {best_score}, Threshold: {best_th}")
    return best_th


def _print_classification_reports(y_train, y_val, y_train_pred, y_val_pred):
    """Print the classification report of the train and validation hard labels."""
    print("Train Classification Report:\n", classification_report(y_train, y_train_pred))
    print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))


def evaluate(y_train, y_val, y_train_prob, y_val_prob, y_train_pred, y_val_pred):
    """Print ranking metrics and classification reports for train and validation.

    Average precision (mAP) and ROC-AUC are ranking metrics, so they are
    computed from the predicted probabilities. Feeding them hard 0/1 labels
    collapses the curve to a single operating point and misreports the model.
    Classification reports are printed twice: first for the supplied hard
    predictions, then for labels re-derived from the probabilities with the
    threshold chosen by ``find_best_threshold`` on the validation split.

    Parameters
    ----------
    y_train, y_val : array-like
        Ground-truth labels (assumed binary -- TODO confirm).
    y_train_prob, y_val_prob : array-like of float
        Predicted scores for the positive class.
    y_train_pred, y_val_pred : array-like
        Hard label predictions used for the first pair of reports.

    Returns
    -------
    None. Everything is printed.
    """
    # Threshold-independent metrics: computed once, from the probabilities.
    train_map = average_precision_score(y_train, y_train_prob)
    val_map = average_precision_score(y_val, y_val_prob)
    train_auc = roc_auc_score(y_train, y_train_prob)
    val_auc = roc_auc_score(y_val, y_val_prob)

    print("Train mAP:", train_map)
    print("Validation mAP:", val_map)
    print("Train AUC:", train_auc)
    print("Validation AUC:", val_auc)
    _print_classification_reports(y_train, y_val, y_train_pred, y_val_pred)
    print('-'*50)

    # Threshold Finding
    best_th = find_best_threshold(y_val_prob, y_val)
    print('-'*50)

    # Re-binarise with the tuned threshold. mAP/AUC are unaffected by it, so
    # only the reports are repeated. The threshold was tuned on the validation
    # split, so the validation report below is optimistic.
    tuned_train_pred = (np.asarray(y_train_prob) > best_th).astype(int)
    tuned_val_pred = (np.asarray(y_val_prob) > best_th).astype(int)
    _print_classification_reports(y_train, y_val, tuned_train_pred, tuned_val_pred)

In [None]:
# Print the metrics for the logistic-regression probabilities and labels computed above.
evaluate(y_train, y_val, y_train_prob, y_val_prob, y_train_pred, y_val_pred)

## Logistic Regression CV

In [None]:
# 10-fold cross-validated logistic regression; candidates are scored by
# average precision. Rebinds `model`, replacing the previous estimator.
model = LogisticRegressionCV(
    cv=10,
    scoring='average_precision',
    random_state=42,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)
model

In [None]:
# (train score, validation score) as reported by the estimator's own score();
# presumably average precision given the `scoring` argument above -- confirm.
train_score = model.score(X_train, y_train)
val_score = model.score(X_val, y_val)
train_score, val_score

In [None]:
# Column 1 of predict_proba (the second class in model.classes_) for each split,
# now taken from the cross-validated estimator.
y_train_prob, y_val_prob = (model.predict_proba(X)[:, 1] for X in (X_train, X_val))

# Hard labels from the estimator's own predict().
y_train_pred, y_val_pred = (model.predict(X) for X in (X_train, X_val))

In [None]:
# Print the metrics for the LogisticRegressionCV probabilities and labels computed above.
evaluate(y_train, y_val, y_train_prob, y_val_prob, y_train_pred, y_val_pred)

# Making Test Inference

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the pre-computed test feature table into pandas and display it.
test_df = pd.read_parquet('test_features.parquet')
test_df

In [None]:
# Feature matrix for inference: every column except the first and the last.
# NOTE(review): the selection is purely positional -- presumably it drops an id
# column and one trailing non-feature column; confirm the layout matches the
# columns the model was trained on.
X_test = test_df.iloc[:, 1:-1].to_numpy()
X_test

In [None]:
# Score the test matrix with the most recently fitted `model`: column 1 of
# predict_proba plus the hard labels.
# NOTE(review): test_pred is not used by any later cell visible here.
test_prob = model.predict_proba(X_test)[:,1]
test_pred = model.predict(X_test)

In [None]:
# Display the predicted test probabilities.
test_prob

In [None]:
# Read the zipped sample submission CSV; its `binds` column is overwritten
# with the predictions further down before the file is written out.
sub_df = pd.read_csv('sample_submission.csv.zip')
sub_df

In [None]:
# Bracket assignment always writes a DataFrame column. Attribute assignment
# (`sub_df.binds = ...`) only does so when the column already exists and
# otherwise silently sets a plain Python attribute, producing a submission
# without predictions.
# NOTE(review): assumes the rows of test_features.parquet are in the same
# order as the sample submission -- confirm, or align on the id column.
sub_df["binds"] = test_prob
sub_df

In [None]:
import os
import subprocess

# Plain strings: neither value has placeholders, so no f-prefix is needed.
file_name = "submission_csv/_1_submission_lr.csv"
message = "LR"
# Derive the directory from the target path so the two cannot drift apart.
os.makedirs(os.path.dirname(file_name), exist_ok=True)

sub_df.to_csv(file_name, index=False)
# Read the file back and show it to verify what was actually written.
display(pd.read_csv(file_name))

# Argument list (no shell) for the Kaggle CLI submission.
command = [
    "kaggle", "competitions", "submit",
    "-c", "leash-BELKA",
    "-f", file_name,
    "-m", message
]

# check=True raises CalledProcessError when the CLI exits non-zero, so a
# failed upload cannot pass unnoticed.
subprocess.run(command, check=True)