# Calculating SMILES Descriptors

In [1]:
# Row counts: total, label-1 and label-0 (one_len + zero_len == train_len).
train_len = 295_246_830
one_len = 1_589_906
zero_len = 293_656_924

# Integer code assigned to each protein target name.
protein_map = {
    'BRD4': 1,
    'HSA': 2,
    'sEH': 3,
}

# Per-character totals; presumably counted over the training SMILES strings
# (the 'D' and 'y' entries both equal train_len) — TODO confirm.
vocab = {
    'C': 6_825_082_866,
    '#': 81_527_490,
    '@': 511_451_694,
    'H': 456_489_972,
    '=': 1_406_606_874,
    'O': 2_554_179_786,
    'N': 2_469_595_230,
    'c': 12_257_477_022,
    '-': 438_483_636,
    '.': 216_945_504,
    'l': 491_088_828,
    'B': 123_330_132,
    'r': 121_915_914,
    'n': 1_997_759_694,
    'D': 295_246_830,
    'y': 295_246_830,
    'o': 67_918_650,
    's': 156_618_468,
    'S': 90_662_574,
    'F': 492_710_238,
    '+': 65_206_260,
    'i': 1_414_026,
    '/': 11_547_096,
    'I': 23_972_994,
}

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import LongType, IntegerType, StructType, StructField, ArrayType, DoubleType, StringType
from pyspark.ml.linalg import SparseVector

from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, StringIndexerModel, OneHotEncoderModel
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

import pandas as pd
import numpy as np
import joblib

from xgboost.spark import SparkXGBClassifier

from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

from concurrent.futures import ThreadPoolExecutor
from joblib import Parallel, delayed
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski, rdmolops, AllChem, rdchem, rdEHTTools, rdMolDescriptors
from tqdm.auto import tqdm
from padelpy import from_smiles
from IPython.display import display, HTML
# Inject CSS so <pre> output is never line-wrapped; wide DataFrame.show()
# tables then scroll horizontally instead of folding onto several lines.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
# # for 256 Gb and 64 Cores
# spark = (
#     SparkSession
#     .builder
#     .appName("leash belka3")
#     .config("spark.driver.memory", "48g")  # Increased driver memory
#     .config("spark.executor.memory", "48g")  # Increased executor memory
#     .config("spark.executor.instances", "16")  # 16 executors
#     .config("spark.executor.cores", "4")  # 4 cores per executor
#     .config("spark.driver.maxResultSize", "4g")  # Driver result size limit
#     .config("spark.local.dir", "temp")  # Specify a directory with enough space
#     # .config("spark.local.dir", "/scratch/23m1521/temp")  # Specify a directory with enough space
#     .config("spark.shuffle.file.buffer", "128k")  # Shuffle buffer size
#     .config("spark.memory.fraction", "0.8")  # Memory fraction for tasks
#     .config("spark.shuffle.memoryFraction", "0.6")  # Shuffle memory fraction
#     .config("spark.executor.javaOptions", "-Xmx48g")  # JVM heap size for executors
#     .master("local[64]")  # Use all 64 cores on the machine
#     .getOrCreate()
# )

# spark
# Build (or reuse) the notebook's SparkSession, running in local mode.
#
# Two settings from the earlier version were dropped because they have no
# effect:
#   * "spark.executor.javaOptions" is not a Spark property. The real key is
#     "spark.executor.extraJavaOptions", and Spark forbids passing -Xmx
#     through it; heap size is set with the *.memory options instead.
#   * "spark.shuffle.memoryFraction" belongs to the legacy (Spark 1.x) memory
#     manager; the unified manager is tuned through "spark.memory.fraction".
spark = (
    SparkSession
    .builder
    .appName("leash belka34")
    .config("spark.driver.memory", "64g")  # Driver JVM heap
    # NOTE(review): with a local[*] master everything runs inside the driver
    # JVM, so the three executor.* settings below probably do not size the
    # job — confirm before relying on them.
    .config("spark.executor.memory", "64g")
    .config("spark.executor.instances", "32")
    .config("spark.executor.cores", "2")
    .config("spark.driver.maxResultSize", "8g")  # Cap on results collected to the driver
    .config("spark.local.dir", "temp")  # Scratch directory for shuffle/spill files (relative path)
    .config("spark.shuffle.file.buffer", "1024k")  # In-memory buffer per shuffle output stream
    .config("spark.memory.fraction", "0.85")  # Share of heap for execution + storage
    .master("local[*]")  # One worker thread per logical core of this machine
    .getOrCreate()
)
spark

# SparkSession for 128 GB RAM and 64 cores
# spark = (
#     SparkSession
#     .builder
#     .appName("Optimized Spark for 128GB RAM and 64 Cores")
#     .config("spark.driver.memory", "64g")  # 64GB for driver memory
#     .config("spark.executor.memory", "64g")  # 64GB for executor memory
#     .config("spark.executor.instances", "16")  # 16 executors
#     .config("spark.executor.cores", "4")  # 4 cores per executor (total = 64 cores)
#     .config("spark.driver.maxResultSize", "8g")  # Driver result size limit
#     .config("spark.local.dir", "temp")  # Temp directory with enough space
#     .config("spark.shuffle.file.buffer", "512k")  # Increased shuffle buffer size
#     .config("spark.memory.fraction", "0.8")  # Memory fraction for tasks
#     .config("spark.shuffle.memoryFraction", "0.6")  # Shuffle memory fraction
#     .config("spark.executor.javaOptions", "-Xmx64g")  # JVM heap size for executors
#     .master("local[64]")  # Use all 64 cores on the machine
#     .getOrCreate()
# )

# spark

# SynapseML 
# spark = (
#     SparkSession
#     .builder
#     .appName("leash belka3")
#     .config("spark.driver.memory", "48g")  # Increased driver memory
#     .config("spark.executor.memory", "48g")  # Increased executor memory
#     .config("spark.executor.instances", "16")  # 16 executors
#     .config("spark.executor.cores", "4")  # 4 cores per executor
#     .config("spark.driver.maxResultSize", "4g")  # Driver result size limit
#     .config("spark.local.dir", "temp")  # Specify a directory with enough space
#     .config("spark.shuffle.file.buffer", "128k")  # Shuffle buffer size
#     .config("spark.memory.fraction", "0.8")  # Memory fraction for tasks
#     .config("spark.shuffle.memoryFraction", "0.6")  # Shuffle memory fraction
#     .config("spark.executor.javaOptions", "-Xmx48g")  # JVM heap size for executors
#     .config("spark.jars.packages", "com.microsoft.azure:synapseml_2.12:1.0.8")
#     .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven")
#     .master("local[64]")  # Use all 64 cores on the machine
#     .getOrCreate()
# )

# spark

# spark = (
#     SparkSession
#     .builder
#     .appName("leash belka3")
#     .config("spark.driver.memory", "64g")  # Increased driver memory
#     .config("spark.executor.memory", "64g")  # Increased executor memory
#     .config("spark.executor.instances", "8")  # Reduced number of executors
#     .config("spark.executor.cores", "8")  # Increased cores per executor
#     .config("spark.driver.maxResultSize", "4g")  # Driver result size limit
#     .config("spark.local.dir", "temp")  # Specify a directory with enough space
#     .config("spark.shuffle.file.buffer", "128k")  # Shuffle buffer size
#     .config("spark.memory.fraction", "0.8")  # Memory fraction for tasks
#     .config("spark.shuffle.memoryFraction", "0.7")  # Shuffle memory fraction
#     .config("spark.executor.javaOptions", "-Xmx64g")  # JVM heap size for executors
#     .config("spark.sql.shuffle.partitions", "1000")  # Increase shuffle partitions
#     .config("spark.ui.enabled", "true")  # Enable Spark UI
#     .master("local[8]")  # Reduced number of cores for local mode
#     .getOrCreate()
# )

# spark


25/01/01 22:57:23 WARN Utils: Your hostname, kanjur resolves to a loopback address: 127.0.1.1; using 10.119.2.14 instead (on interface eno3)
25/01/01 22:57:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/01 22:57:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/01 22:57:23 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


## Descriptors Dataset

In [3]:
# Descriptor features, read from the Parquet dataset in the working directory.
full_df_features = spark.read.parquet('train_descriptors.parquet')

                                                                                

In [4]:
# Preview the first 20 rows (show()'s defaults, spelled out).
full_df_features.show(n=20, truncate=True)

24/12/31 21:27:35 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+---------+-----+-----------------+--------------+-----------------+-----------------+---------------------+--------------+-----------------+----------------+-------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+-----------------+----------------------+---------------------+---------+--------------+------------------+--------+---------------------+--------------+-----------------+----------------+------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+------------------+----------------------+---------------------+------------------+--------------+-------------------+------------------+---------------------+--------------+---------

## Tokens Dataset

In [5]:
# Load the two token-feature files separately (judging by the file names, the
# label-0 and label-1 rows — TODO confirm) and stack them into one DataFrame.
zero_tokens_df = spark.read.format('parquet').load('zero_features.parquet')
one_tokens_df = spark.read.format('parquet').load('one_features.parquet')

# unionByName() pairs columns by name. Plain union() pairs them by position,
# so two wide files that differ in column order would be misaligned silently;
# here a name mismatch raises an error instead.
full_tokens_df = zero_tokens_df.unionByName(one_tokens_df)
full_tokens_df.show()

                                                                                

+---------+-------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|       id|protein| a1| a2| a3| a4| a5| a6| a7| a8| a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|a19|a20|a21|a22|a23|a24| b1| b2| b3| b4| b5| b6| b7| b8| b9|b10|b11|b12|b13|b14|b15|b16|b17|b18|b19|b20|b21|b22|b23|b24| c1| c2| c3| c4| c5| c6| c7| c8| c9|c10|c11|c12|c13|c14|c15|c16|c17|c18|c19|c20|c21|c22|c23|c24| d1| d2| d3| d4| d5| d6| d7| d8| d9|d10|d11|d12|d13|d14|d15|d16|d17|d18|d19|d20|d21|d22|d23|d24|  y|
+---------+-------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+-

## Joining Both Datasets

In [8]:
# Inner-join descriptor rows with token rows that share the same `id`.
# The condition is an explicit column expression, so the result still carries
# the `id` column of both sides.
same_id = full_df_features.id == full_tokens_df.id
feat_side = full_df_features.alias("feat")
tok_side = full_tokens_df.alias("tok")
full_feat_tok_df = feat_side.join(tok_side, on=same_id, how="inner")
full_feat_tok_df.show()

[Stage 15:>                                                         (0 + 1) / 1]

+----+-----+-----------------+--------------+-----------------+-----------------+---------------------+--------------+-----------------+----------------+-------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+-----------------+----------------------+---------------------+------------------+--------------+-----------+--------+---------------------+--------------+-----------------+----------------+----------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+-----------------+----------------------+---------------------+------------------+------------------+--------------------+------------------+---------------------+--------------+----------

                                                                                

### Dropping the tokens dataset's `id` and `y` columns

In [12]:
# Remove the `id` and `y` columns that came from the tokens side of the join;
# the column references are bound to full_tokens_df, so the other `id` stays.
full_feat_tok_df = (
    full_feat_tok_df
    .drop(full_tokens_df.id)
    .drop(full_tokens_df.y)
)
full_feat_tok_df.show()

[Stage 25:>                                                         (0 + 1) / 1]

+----+-----+-----------------+--------------+-----------------+-----------------+---------------------+--------------+-----------------+----------------+-------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+-----------------+----------------------+---------------------+------------------+--------------+-----------+--------+---------------------+--------------+-----------------+----------------+----------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+-----------------+----------------------+---------------------+------------------+------------------+--------------------+------------------+---------------------+--------------+----------

                                                                                

### Dropping `protein` column

In [15]:
# Remove the `protein` column that came from the tokens side of the join.
tokens_protein_col = full_tokens_df.protein
full_feat_tok_df = full_feat_tok_df.drop(tokens_protein_col)
full_feat_tok_df.show()

[Stage 30:>                                                         (0 + 1) / 1]

+----+-----+-----------------+--------------+-----------------+-----------------+---------------------+--------------+-----------------+----------------+-------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+-----------------+----------------------+---------------------+------------------+--------------+-----------+--------+---------------------+--------------+-----------------+----------------+----------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+-----------------+----------------------+---------------------+------------------+------------------+--------------------+------------------+---------------------+--------------+----------

                                                                                

In [None]:
# +----+-----+-----------------+--------------+-----------------+-----------------+---------------------+--------------+-----------------+----------------+-------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+-----------------+----------------------+---------------------+------------------+--------------+-----------+--------+---------------------+--------------+-----------------+----------------+----------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+-----------------+----------------------+---------------------+------------------+------------------+--------------------+------------------+---------------------+--------------+-----------------+----------------+-------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+--------------------+----------------------+---------------------+-----------------+-------------------+------------------+------------------+--------------------------+-------------------+----------------------+---------------------+---------------------+----------------------+-------------------------+--------------------------+--------------------------+-----------------------+-----------------+------------------+-------------------------+----------------------------+---------------------------+-----------------------+----------------------+---------------------------+--------------------------+--------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+-
--+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
# |  id|binds|        bb1_MolWt|bb1_ExactMolWt|      bb1_MolLogP|         bb1_TPSA|bb1_NumRotatableBonds|bb1_NumHDonors|bb1_NumHAcceptors|bb1_FormalCharge|   bb1_FractionCSP3|bb1_NumHeavyAtoms|bb1_NumAromaticRings|bb1_NumAliphaticRings|bb1_NumSaturatedRings|bb1_NumHeteroatoms|bb1_NumRings|bb1_MolVolume|bb1_RadiusOfGyration|bb1_InertialShapeFactor|bb1_AromaticProportion|bb1_HBondPotential|bb1_Lipophilicity|bb1_ChargeDistribution|bb1_ElectroNegativity|         bb2_MolWt|bb2_ExactMolWt|bb2_MolLogP|bb2_TPSA|bb2_NumRotatableBonds|bb2_NumHDonors|bb2_NumHAcceptors|bb2_FormalCharge|bb2_FractionCSP3|bb2_NumHeavyAtoms|bb2_NumAromaticRings|bb2_NumAliphaticRings|bb2_NumSaturatedRings|bb2_NumHeteroatoms|bb2_NumRings|bb2_MolVolume|bb2_RadiusOfGyration|bb2_InertialShapeFactor|bb2_AromaticProportion|bb2_HBondPotential|bb2_Lipophilicity|bb2_ChargeDistribution|bb2_ElectroNegativity|         bb3_MolWt|    bb3_ExactMolWt|         bb3_MolLogP|          bb3_TPSA|bb3_NumRotatableBonds|bb3_NumHDonors|bb3_NumHAcceptors|bb3_FormalCharge|   bb3_FractionCSP3|bb3_NumHeavyAtoms|bb3_NumAromaticRings|bb3_NumAliphaticRings|bb3_NumSaturatedRings|bb3_NumHeteroatoms|bb3_NumRings|bb3_MolVolume|bb3_RadiusOfGyration|bb3_InertialShapeFactor|bb3_AromaticProportion|bb3_HBondPotential|   bb3_Lipophilicity|bb3_ChargeDistribution|bb3_ElectroNegativity|   molecule_MolWt|molecule_ExactMolWt|  molecule_MolLogP|     molecule_TPSA|molecule_NumRotatableBonds|molecule_NumHDonors|molecule_NumHAcceptors|molecule_FormalCharge|molecule_FractionCSP3|molecule_NumHeavyAtoms|molecule_NumAromaticRings|molecule_NumAliphaticRings|molecule_NumSaturatedRings|molecule_NumHeteroatoms|molecule_NumRings|molecule_MolVolume|molecule_RadiusOfGyration|molecule_InertialShapeFactor|molecule_AromaticProportion|molecule_HBondPotential|molecule_Lipophilicity|molecule_ChargeDistribution|molecule_ElectroNegativity|protein_onehot| a1| a2| a3| a4| a5| a6| a7| a8| a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|a19|a20|a21|a22|a23|a24| b1| b2| b3| b4| b5| 
b6| b7| b8| b9|b10|b11|b12|b13|b14|b15|b16|b17|b18|b19|b20|b21|b22|b23|b24| c1| c2| c3| c4| c5| c6| c7| c8| c9|c10|c11|c12|c13|c14|c15|c16|c17|c18|c19|c20|c21|c22|c23|c24| d1| d2| d3| d4| d5| d6| d7| d8| d9|d10|d11|d12|d13|d14|d15|d16|d17|d18|d19|d20|d21|d22|d23|d24|
# +----+-----+-----------------+--------------+-----------------+-----------------+---------------------+--------------+-----------------+----------------+-------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+-----------------+----------------------+---------------------+------------------+--------------+-----------+--------+---------------------+--------------+-----------------+----------------+----------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+-----------------+----------------------+---------------------+------------------+------------------+--------------------+------------------+---------------------+--------------+-----------------+----------------+-------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+--------------------+----------------------+---------------------+-----------------+-------------------+------------------+------------------+--------------------------+-------------------+----------------------+---------------------+---------------------+----------------------+-------------------------+--------------------------+--------------------------+-----------------------+-----------------+------------------+-------------------------+----------------------------+---------------------------+-----------------------+----------------------+---------------------------+--------------------------+--------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+-
--+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
# |  26|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|           197.665| 197.060741684|     1.5791|   35.25|                    3|             1|                2|               0|             0.2|               13|                   1|                    0|                    0|                 3|           1|            0|                   0|                      0|   0.07692307692307693|                 3|           1.5791|                     0|   55.909400000000026|           266.345|     266.174275944|              2.0373|             73.38|                    1|             1|                5|               0| 0.6923076923076923|               19|                   1|                    1|                    1|                 6|           2|            0|                   0|                      0|   0.05263157894736842|                 6|              2.0373|                     0|    72.71740000000004|790.2300000000002|  791.2447495279998|3.7822000000000013|            160.45|                        13|                  4|                    12|                    0|               0.4375|                    47|                        3|                         1|                         1|                     15|                4|                 0|                        0|                           0|        0.06382978723404255|                     16|    3.7822000000000013|                          0|         173.1317999999996|     (2,[],[])|  9|  5| 10| 20|  1|  1|  0|  2|  2|  0|  0|  2|  1|  0|  0|  1|  2|  0|  1|  2|  4|  1|  2|  4|  1|  1|  2|  5| 12| 
 6|  3| 12|  1|  0|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  2|  5|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# |  29|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|           197.665| 197.060741684|     1.5791|   35.25|                    3|             1|                2|               0|             0.2|               13|                   1|                    0|                    0|                 3|           1|            0|                   0|                      0|   0.07692307692307693|                 3|           1.5791|                     0|   55.909400000000026|252.31799999999998|      252.15862588|              1.9777| 84.24000000000001|                    0|             2|                4|               0| 0.6666666666666666|               18|                   1|                    1|                    0|                 6|           2|            0|                   0|                      0|   0.05555555555555555|                 6|              1.9777|                     0|    67.80310000000003|776.2030000000003|  777.2290994639999|3.7226000000000017|            171.31|                        12|                  5|                    11|                    0|  0.41935483870967744|                    46|                        3|                         1|                         0|                     15|                4|                 0|                        0|                           0|        0.06521739130434782|                     16|    3.7226000000000017|                          0|        168.21749999999966|     (2,[],[])|  9|  5|  9| 19|  1|  1|  0|  2|  2|  0|  0|  2|  1|  0|  1|  2|  2|  0|  1|  2|  4|  1|  2|  4|  1|  1|  2|  5| 12| 
 6|  3| 12|  1|  0|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  2|  5|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# | 474|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|           197.665| 197.060741684|     1.5791|   35.25|                    3|             1|                2|               0|             0.2|               13|                   1|                    0|                    0|                 3|           1|            0|                   0|                      0|   0.07692307692307693|                 3|           1.5791|                     0|   55.909400000000026|156.61600000000004|     156.045425968|             1.50212|             38.91|                    1|             1|                2|               0| 0.2857142857142857|               10|                   1|                    0|                    0|                 3|           1|            0|                   0|                      0|                   0.1|                 3|             1.50212|                     0|   41.875400000000006|680.5010000000002|  681.1158995520001| 3.239020000000002|            125.98|                        13|                  4|                     9|                    0|   0.2692307692307692|                    38|                        3|                         0|                         0|                     12|                3|                 0|                        0|                           0|        0.07894736842105263|                     13|     3.239020000000002|                          0|         142.8547999999999| (2,[0],[1.0])|  9|  5|  3| 13|  1|  1|  0|  2|  2|  0|  0|  2|  1|  0|  0|  1|  2|  0|  0|  1|  4|  1|  0|  2|  1|  1|  1|  4| 12| 
 6|  5| 14|  1|  0|  0|  0|  0|  1|  0|  0|  0|  1|  1|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  1|  4|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# | 964|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|           197.665| 197.060741684|     1.5791|   35.25|                    3|             1|                2|               0|             0.2|               13|                   1|                    0|                    0|                 3|           1|            0|                   0|                      0|   0.07692307692307693|                 3|           1.5791|                     0|   55.909400000000026|143.22999999999996|     143.131014164|  0.8862999999999996|             46.25|                    3|             2|                2|               0|                1.0|               10|                   0|                    1|                    1|                 2|           1|            0|                   0|                      0|                   0.0|                 4|  0.8862999999999996|                     0|    41.63620000000002|667.1150000000004|  668.2014877480001|2.6231999999999998|            133.32|                        15|                  5|                     9|                    0|  0.48148148148148145|                    38|                        2|                         1|                         1|                     11|                3|                 0|                        0|                           0|        0.05263157894736842|                     14|    2.6231999999999998|                          0|        142.61559999999992| (2,[1],[1.0])|  9|  5|  8| 18|  1|  1|  0|  2|  2|  0|  0|  2|  1|  0|  0|  1|  2|  0|  0|  1|  4|  1|  1|  3|  1|  1|  1|  4| 12| 
 6|  0|  9|  1|  0|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  3|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# |1677|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|197.66499999999996| 197.060741684|     1.5791|   35.25|                    3|             1|                2|               0|             0.2|               13|                   1|                    0|                    0|                 3|           1|            0|                   0|                      0|   0.07692307692307693|                 3|           1.5791|                     0|   55.909400000000026|157.66600000000003|     157.032812684| 0.13550000000000012|             43.09|                    3|             1|                2|               0|                1.0|                8|                   0|                    0|                    0|                 4|           0|            0|                   0|                      0|                   0.0|                 3| 0.13550000000000012|                     0|    39.85280000000001|645.0900000000001|      646.126608556|1.4506000000000001|130.16000000000003|                        15|                  4|                     9|                    0|    0.391304347826087|                    35|                        2|                         0|                         0|                     12|                2|                 0|                        0|                           0|        0.05714285714285714|                     13|    1.4506000000000001|                          0|        133.58419999999998| (2,[0],[1.0])|  9|  5|  5| 14|  1|  1|  0|  2|  2|  0|  0|  2|  1|  0|  0|  1|  2|  0|  1|  2|  4|  1|  1|  3|  1|  1|  1|  4| 12| 
 6|  0|  9|  1|  0|  0|  0|  0|  1|  1|  0|  0|  1|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  3|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  1|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# |1697|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|197.66499999999996| 197.060741684|     1.5791|   35.25|                    3|             1|                2|               0|             0.2|               13|                   1|                    0|                    0|                 3|           1|            0|                   0|                      0|   0.07692307692307693|                 3|           1.5791|                     0|   55.909400000000026|125.17499999999998|125.09529735199999|              0.3617|             43.84|                    2|             1|                3|               0|                0.5|                9|                   1|                    0|                    0|                 3|           1|            0|                   0|                      0|    0.1111111111111111|                 4|              0.3617|                     0|    35.79239999999999|649.0600000000002|  650.1657709360001|2.0985999999999994|130.91000000000003|                        14|                  4|                    10|                    0|                 0.32|                    37|                        3|                         0|                         0|                     12|                3|                 0|                        0|                           0|        0.08108108108108109|                     14|    2.0985999999999994|                          0|        136.77179999999996|     (2,[],[])|  9|  5|  3| 13|  1|  1|  0|  2|  2|  0|  0|  2|  1|  0|  0|  1|  2|  0|  0|  1|  4|  1|  0|  2|  1|  1|  1|  4| 12| 
 6|  3| 12|  1|  0|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  2|  5|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# |1806|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|197.66499999999996| 197.060741684|     1.5791|   35.25|                    3|             1|                2|               0|             0.2|               13|                   1|                    0|                    0|                 3|           1|            0|                   0|                      0|   0.07692307692307693|                 3|           1.5791|                     0|   55.909400000000026|           152.153|     152.058577496|              0.4504|             65.21|                    1|             1|                4|               0|0.14285714285714285|               11|                   1|                    0|                    0|                 4|           1|            0|                   0|                      0|   0.09090909090909091|                 5|              0.4504|                     0|    39.98890000000001|676.0380000000001|  677.1290510800001|2.1952999999999996|152.28000000000003|                        13|                  4|                    11|                    0|  0.23076923076923078|                    39|                        3|                         0|                         0|                     13|                3|                 0|                        0|                           0|        0.07692307692307693|                     15|    2.1952999999999996|                          0|        140.40329999999992| (2,[0],[1.0])|  9|  5|  2| 12|  1|  1|  0|  2|  2|  0|  0|  2|  1|  0|  0|  1|  2|  0|  1|  2|  4|  1|  2|  4|  1|  1|  1|  4| 12| 
 6|  5| 14|  1|  0|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  1|  4|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# |1950|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|197.66499999999996| 197.060741684|     1.5791|   35.25|                    3|             1|                2|               0|             0.2|               13|                   1|                    0|                    0|                 3|           1|            0|                   0|                      0|   0.07692307692307693|                 3|           1.5791|                     0|   55.909400000000026|           179.183|     179.080709908|-0.40840000000000026|             78.33|                    2|             1|                6|               0| 0.2857142857142857|               13|                   2|                    0|                    0|                 6|           2|            0|                   0|                      0|   0.15384615384615385|                 7|-0.40840000000000026|                     0|    45.22540000000001|703.0680000000002|  704.1511834920001|1.3284999999999998|165.40000000000003|                        14|                  4|                    13|                    0|   0.2692307692307692|                    41|                        4|                         0|                         0|                     15|                4|                 0|                        0|                           0|         0.0975609756097561|                     17|    1.3284999999999998|                          0|         146.2047999999999| (2,[0],[1.0])|  9|  5|  2| 12|  1|  1|  0|  2|  2|  0|  0|  2|  1|  0|  0|  1|  2|  0|  0|  1|  4|  1|  1|  3|  1|  1|  1|  4| 12| 
 6|  5| 14|  1|  0|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  4|  7|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# |2040|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|197.66499999999996| 197.060741684|     1.5791|   35.25|                    3|             1|                2|               0|             0.2|               13|                   1|                    0|                    0|                 3|           1|            0|                   0|                      0|   0.07692307692307693|                 3|           1.5791|                     0|   55.909400000000026|194.69100000000003|194.03929503199998|              0.5482|             67.59|                    3|             2|                4|               0|                0.6|               11|                   1|                    0|                    0|                 6|           1|            0|                   0|                      0|   0.09090909090909091|                 6|              0.5482|                     0|    48.74110000000001|682.1150000000001|      683.133090904|1.8632999999999995|154.66000000000003|                        15|                  5|                    11|                    0|   0.3333333333333333|                    38|                        3|                         0|                         0|                     14|                3|                 0|                        0|                           0|        0.07894736842105263|                     16|    1.8632999999999995|                          0|        142.47249999999994| (2,[0],[1.0])|  9|  5|  4| 13|  1|  1|  0|  2|  2|  0|  0|  2|  1|  0|  1|  2|  2|  0|  0|  1|  4|  1|  0|  2|  1|  1|  1|  4| 12| 
 6|  2| 11|  1|  0|  0|  0|  0|  1|  1|  0|  0|  1|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  3|  6|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  1|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# |2214|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|197.66499999999996| 197.060741684|     1.5791|   35.25|                    3|             1|                2|               0|             0.2|               13|                   1|                    0|                    0|                 3|           1|            0|                   0|                      0|   0.07692307692307693|                 3|           1.5791|                     0|   55.909400000000026|148.59300000000002|     148.040340588|  0.8635199999999998|             52.05|                    1|             1|                3|               0|                0.4|                9|                   1|                    0|                    0|                 4|           1|            0|                   0|                      0|    0.1111111111111111|                 4|  0.8635199999999998|                     0|   36.379400000000004|636.0170000000002|       637.13413646|2.1786199999999996|            139.12|                        13|                  4|                    10|                    0|   0.2916666666666667|                    36|                        3|                         0|                         0|                     12|                3|                 0|                        0|                           0|        0.08333333333333333|                     14|    2.1786199999999996|                          0|        130.11080000000004| (2,[0],[1.0])|  9|  5|  3| 12|  1|  1|  0|  2|  2|  0|  0|  2|  1|  0|  0|  1|  2|  0|  0|  1|  4|  1|  0|  2|  1|  1|  1|  4| 12| 
 6|  3| 12|  1|  0|  0|  0|  0|  1|  1|  0|  0|  1|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  1|  4|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  1|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# |2250|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|197.66499999999996| 197.060741684|     1.5791|   35.25|                    3|             1|                2|               0|             0.2|               13|                   1|                    0|                    0|                 3|           1|            0|                   0|                      0|   0.07692307692307693|                 3|           1.5791|                     0|   55.909400000000026|239.14600000000002|     238.063968492|   1.745620000000001|             59.14|                    2|             2|                3|               0| 0.4444444444444444|               14|                   1|                    0|                    0|                 5|           1|            0|                   0|                      0|   0.07142857142857142|                 5|   1.745620000000001|                     0|    61.99520000000004|690.1090000000003|      691.181086652|2.6389199999999997|146.20999999999998|                        14|                  5|                    10|                    0|  0.32142857142857145|                    40|                        3|                         0|                         0|                     12|                3|                 0|                        0|                           0|                      0.075|                     15|    2.6389199999999997|                          0|        148.47859999999986| (2,[0],[1.0])|  9|  5|  6| 14|  1|  1|  0|  2|  2|  0|  0|  2|  1|  0|  0|  1|  2|  0|  0|  1|  4|  1|  1|  3|  1|  1|  1|  4| 12| 
 6|  5| 14|  1|  0|  0|  0|  0|  1|  2|  0|  0|  1|  2|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  1|  4|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# |2453|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|197.66499999999996| 197.060741684|     1.5791|   35.25|                    3|             1|                2|               0|             0.2|               13|                   1|                    0|                    0|                 3|           1|            0|                   0|                      0|   0.07692307692307693|                 3|           1.5791|                     0|   55.909400000000026|           171.618|     171.062633504|              1.8023|             26.02|                    3|             1|                1|               0|                1.0|               10|                   0|                    1|                    1|                 4|           1|            0|                   0|                      0|                   0.0|                 2|              1.8023|                     0|    38.59040000000002|659.0420000000003|      660.156429376| 3.117400000000001|113.08999999999999|                        15|                  4|                     8|                    0|                 0.44|                    37|                        2|                         1|                         1|                     12|                3|                 0|                        0|                           0|        0.05405405405405406|                     12|     3.117400000000001|                          0|        132.32180000000002|     (2,[],[])|  9|  5|  7| 16|  1|  1|  0|  2|  2|  0|  0|  2|  1|  0|  0|  1|  2|  0|  0|  1|  4|  1|  0|  2|  1|  1|  1|  4| 12| 
 6|  0|  9|  1|  0|  0|  0|  0|  1|  1|  0|  0|  1|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  3|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  2|  2|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# |2509|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|197.66499999999996| 197.060741684|     1.5791|   35.25|                    3|             1|                2|               0|             0.2|               13|                   1|                    0|                    0|                 3|           1|            0|                   0|                      0|   0.07692307692307693|                 3|           1.5791|                     0|   55.909400000000026|            212.68|     212.071640716|              1.1779|             55.12|                    1|             2|                2|               0|                0.3|               14|                   1|                    1|                    0|                 4|           2|            0|                   0|                      0|   0.07142857142857142|                 4|              1.1779|                     0|   58.568100000000015|700.1040000000002|      701.165436588|2.4929999999999994|            142.19|                        13|                  5|                     9|                    0|  0.27586206896551724|                    41|                        3|                         1|                         0|                     12|                4|                 0|                        0|                           0|        0.07317073170731707|                     14|    2.4929999999999994|                          0|        152.29949999999985| (2,[1],[1.0])|  9|  5|  5| 14|  1|  1|  0|  2|  2|  0|  0|  2|  1|  0|  0|  1|  2|  0|  1|  2|  4|  1|  1|  3|  1|  1|  2|  5| 12| 
 6|  6| 15|  1|  0|  0|  0|  0|  1|  1|  0|  0|  1|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  3|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# |2529|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|197.66499999999996| 197.060741684|     1.5791|   35.25|                    3|             1|                2|               0|             0.2|               13|                   1|                    0|                    0|                 3|           1|            0|                   0|                      0|   0.07692307692307693|                 3|           1.5791|                     0|   55.909400000000026|192.68999999999997|     192.102940844|  0.6732999999999996|             55.12|                    4|             2|                2|               0|              0.875|               12|                   0|                    1|                    1|                 4|           1|            0|                   0|                      0|                   0.0|                 4|  0.6732999999999996|                     0|    51.27010000000003|680.1140000000001|      681.196736716|1.9884000000000004|            142.19|                        16|                  5|                     9|                    0|   0.4444444444444444|                    39|                        2|                         1|                         1|                     12|                3|                 0|                        0|                           0|        0.05128205128205128|                     14|    1.9884000000000004|                          0|         145.0014999999999| (2,[0],[1.0])|  9|  5|  9| 18|  1|  1|  0|  2|  2|  0|  0|  2|  1|  0|  0|  1|  2|  0|  1|  2|  4|  1|  1|  3|  1|  1|  2|  5| 12| 
 6|  0|  9|  1|  0|  0|  0|  0|  1|  1|  0|  0|  1|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  3|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# |2927|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|197.66499999999996| 197.060741684|     1.5791|   35.25|                    3|             1|                2|               0|             0.2|               13|                   1|                    0|                    0|                 3|           1|            0|                   0|                      0|   0.07692307692307693|                 3|           1.5791|                     0|   55.909400000000026| 204.2729999999999|     204.126263132|  1.3813000000000002|46.330000000000005|                    2|             1|                2|               0| 0.4166666666666667|               15|                   1|                    1|                    1|                 3|           2|            0|                   0|                      0|   0.06666666666666667|                 3|  1.3813000000000002|                     0|    59.47090000000003|728.1580000000004|      729.196736716|3.1182000000000007|133.39999999999998|                        14|                  4|                     9|                    0|   0.3225806451612903|                    43|                        3|                         1|                         1|                     12|                4|                 0|                        0|                           0|        0.06976744186046512|                     13|    3.1182000000000007|                          0|        160.45029999999974|     (2,[],[])|  9|  5|  6| 16|  1|  1|  0|  2|  2|  0|  0|  2|  1|  0|  0|  1|  2|  0|  1|  2|  4|  1|  1|  3|  1|  1|  2|  5| 12| 
 6|  6| 15|  1|  0|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  3|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# |3091|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|197.66499999999996| 197.060741684|     1.5791|   35.25|                    3|             1|                2|               0|             0.2|               13|                   1|                    0|                    0|                 3|           1|            0|                   0|                      0|   0.07692307692307693|                 3|           1.5791|                     0|   55.909400000000026|163.22400000000005|     163.110947416|               1.264|             42.15|                    1|             1|                3|               0| 0.4444444444444444|               12|                   1|                    1|                    1|                 3|           2|            0|                   0|                      0|   0.08333333333333333|                 4|               1.264|                     0|    50.09640000000002|687.1090000000003|  688.1814210000001|3.0089000000000006|129.22000000000003|                        13|                  4|                    10|                    0|  0.32142857142857145|                    40|                        3|                         1|                         1|                     12|                4|                 0|                        0|                           0|                      0.075|                     14|    3.0089000000000006|                          0|        150.51079999999985| (2,[1],[1.0])|  9|  5|  4| 14|  1|  1|  0|  2|  2|  0|  0|  2|  1|  0|  0|  1|  2|  0|  0|  1|  4|  1|  0|  2|  1|  1|  2|  5| 12| 
 6|  5| 14|  1|  0|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  1|  4|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# |3506|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|           117.151| 117.057849224|     1.2501|   26.02|                    0|             1|                1|               0|             0.0|                9|                   1|                    0|                    0|                 1|           1|            0|                   0|                      0|    0.1111111111111111|                 2|           1.2501|                     0|    38.78940000000001|           117.151|     117.057849224|  1.2501000000000002|             26.02|                    0|             1|                1|               0|                0.0|                9|                   1|                    0|                    0|                 1|           1|            0|                   0|                      0|    0.1111111111111111|                 2|  1.2501000000000002|                     0|    38.78940000000001|596.9830000000001|  598.1021080600001|3.0958000000000006|            103.86|                         9|                  4|                     7|                    0|                 0.12|                    34|                        3|                         0|                         0|                      9|                3|                 0|                        0|                           0|        0.08823529411764706|                     11|    3.0958000000000006|                          0|                  128.7668|     (2,[],[])|  9|  2|  2| 10|  1|  1|  1|  3|  2|  0|  0|  2|  1|  0|  0|  1|  2|  0|  0|  1|  4|  0|  0|  1|  1|  1|  1|  4| 12| 
 6|  6| 15|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  3|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# |3764|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|           117.151| 117.057849224|     1.2501|   26.02|                    0|             1|                1|               0|             0.0|                9|                   1|                    0|                    0|                 1|           1|            0|                   0|                      0|    0.1111111111111111|                 2|           1.2501|                     0|    38.78940000000001|           214.718|       214.0542764| -0.3515999999999996|63.400000000000006|                    3|             1|                3|               0|                1.0|               12|                   0|                    1|                    1|                 6|           1|            0|                   0|                      0|                   0.0|                 4| -0.3515999999999996|                     0|    50.82620000000003|          658.089|      659.121857524|1.0643000000000007|141.23999999999995|                        12|                  4|                     9|                    0|    0.391304347826087|                    36|                        2|                         1|                         1|                     13|                3|                 0|                        0|                           0|        0.05555555555555555|                     13|    1.0643000000000007|                          0|        134.12059999999997|     (2,[],[])|  9|  2|  7| 14|  1|  1|  0|  2|  2|  0|  0|  2|  1|  0|  0|  1|  2|  0|  2|  3|  4|  0|  2|  3|  1|  1|  2|  5| 12| 
 6|  0|  9|  1|  0|  0|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  3|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  1|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# |4590|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|           117.151| 117.057849224|     1.2501|   26.02|                    0|             1|                1|               0|             0.0|                9|                   1|                    0|                    0|                 1|           1|            0|                   0|                      0|    0.1111111111111111|                 2|           1.2501|                     0|    38.78940000000001|241.12199999999999|240.05446642799998|  0.6108999999999998|             76.82|                    2|             2|                4|               0| 0.7142857142857143|               14|                   1|                    1|                    1|                 7|           2|            0|                   0|                      0|   0.07142857142857142|                 6|  0.6108999999999998|                     0|    56.94710000000002|648.0320000000002|  649.1453698400002| 1.604999999999999|154.66000000000003|                        11|                  5|                    10|                    0|   0.3333333333333333|                    37|                        3|                         1|                         1|                     13|                4|                 0|                        0|                           0|        0.08108108108108109|                     15|     1.604999999999999|                          0|        132.99349999999995| (2,[0],[1.0])|  9|  2|  7| 13|  1|  1|  0|  2|  2|  0|  0|  2|  1|  0|  1|  2|  2|  0|  0|  1|  4|  0|  1|  2|  1|  1|  1|  4| 12| 
 6|  2| 11|  1|  0|  0|  0|  0|  0|  2|  0|  0|  0|  2|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  3|  6|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# |4823|    0|349.3860000000001| 349.131408088|3.391700000000002|75.63000000000001|                    6|             2|                3|               0|0.23809523809523808|               26|                   2|                    1|                    0|                 5|           3|            0|                   0|                      0|   0.07692307692307693|                 5|3.391700000000002|                     0|    97.69550000000004|           117.151| 117.057849224|     1.2501|   26.02|                    0|             1|                1|               0|             0.0|                9|                   1|                    0|                    0|                 1|           1|            0|                   0|                      0|    0.1111111111111111|                 2|           1.2501|                     0|    38.78940000000001|174.63099999999997|     174.055990652|              1.4325|             52.05|                    2|             1|                3|               0| 0.5714285714285714|               11|                   1|                    1|                    1|                 4|           2|            0|                   0|                      0|   0.09090909090909091|                 4|              1.4325|                     0|    43.61740000000002|618.0020000000002|  619.1235717760001|2.8484000000000007|            129.89|                        11|                  4|                     9|                    0|   0.2916666666666667|                    35|                        3|                         1|                         1|                     11|                4|                 0|                        0|                           0|        0.08571428571428572|                     13|    2.8484000000000007|                          0|        126.91180000000003|     (2,[],[])|  9|  2|  5| 12|  1|  1|  0|  2|  2|  0|  0|  2|  1|  0|  0|  1|  2|  0|  0|  1|  4|  0|  0|  1|  1|  1|  1|  4| 12| 
 6|  3| 12|  1|  0|  0|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  1|  4|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|  1|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
# +----+-----+-----------------+--------------+-----------------+-----------------+---------------------+--------------+-----------------+----------------+-------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+-----------------+----------------------+---------------------+------------------+--------------+-----------+--------+---------------------+--------------+-----------------+----------------+----------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+-----------------+----------------------+---------------------+------------------+------------------+--------------------+------------------+---------------------+--------------+-----------------+----------------+-------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+--------------------+----------------------+---------------------+-----------------+-------------------+------------------+------------------+--------------------------+-------------------+----------------------+---------------------+---------------------+----------------------+-------------------------+--------------------------+--------------------------+-----------------------+-----------------+------------------+-------------------------+----------------------------+---------------------------+-----------------------+----------------------+---------------------------+--------------------------+--------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+-
--+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
# only showing top 20 rows

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

In [5]:
def protein_onehot_TO_Array(df):
    """Expand the ``protein_onehot`` vector column into one scalar column per slot.

    The vector is converted to an array column (``protein_array``), each array
    position ``i`` is copied into a double column named ``protein_{i}``, and the
    temporary array column is dropped again.

    Args:
        df: Spark DataFrame containing a ``protein_onehot`` ML vector column.

    Returns:
        The input frame with the additional ``protein_0 .. protein_{n-1}``
        columns, where ``n`` is the length of the vector in the first row.

    Raises:
        ValueError: if ``df`` has no rows, because the vector length cannot be
            determined without at least one row.
    """
    # Local import keeps this cell runnable on its own; vector_to_array does the
    # conversion natively instead of through a row-by-row Python UDF.
    from pyspark.ml.functions import vector_to_array

    # Read the width straight from the first vector (len() works for both
    # sparse and dense ML vectors), so no conversion is needed for the probe.
    first_row = df.select("protein_onehot").first()
    if first_row is None:
        raise ValueError("protein_onehot_TO_Array needs at least one row to infer the vector length")
    n_slots = len(first_row["protein_onehot"])

    df = df.withColumn("protein_array", vector_to_array(df["protein_onehot"]))
    for i in range(n_slots):
        df = df.withColumn(f"protein_{i}", df["protein_array"][i])
    return df.drop("protein_array")

In [6]:
# Split protein_onehot into scalar protein_<i> columns, then preview the result.
full_df_features = protein_onehot_TO_Array(full_df_features)
full_df_features.show()

[Stage 3:>                                                          (0 + 1) / 1]

+---------+-----+-----------------+--------------+-----------------+-----------------+---------------------+--------------+-----------------+----------------+-------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+-----------------+----------------------+---------------------+---------+--------------+------------------+--------+---------------------+--------------+-----------------+----------------+------------------+-----------------+--------------------+---------------------+---------------------+------------------+------------+-------------+--------------------+-----------------------+----------------------+------------------+------------------+----------------------+---------------------+------------------+--------------+-------------------+------------------+---------------------+--------------+---------

                                                                                

## Assembling Columns

In [18]:
# Every column from the third one onward is treated as a model feature.
# NOTE(review): assumes the first two columns are the non-feature id/label
# columns — confirm against full_feat_tok_df's schema.
features_cols = full_feat_tok_df.columns[2:]
print(features_cols)

['bb1_MolWt', 'bb1_ExactMolWt', 'bb1_MolLogP', 'bb1_TPSA', 'bb1_NumRotatableBonds', 'bb1_NumHDonors', 'bb1_NumHAcceptors', 'bb1_FormalCharge', 'bb1_FractionCSP3', 'bb1_NumHeavyAtoms', 'bb1_NumAromaticRings', 'bb1_NumAliphaticRings', 'bb1_NumSaturatedRings', 'bb1_NumHeteroatoms', 'bb1_NumRings', 'bb1_MolVolume', 'bb1_RadiusOfGyration', 'bb1_InertialShapeFactor', 'bb1_AromaticProportion', 'bb1_HBondPotential', 'bb1_Lipophilicity', 'bb1_ChargeDistribution', 'bb1_ElectroNegativity', 'bb2_MolWt', 'bb2_ExactMolWt', 'bb2_MolLogP', 'bb2_TPSA', 'bb2_NumRotatableBonds', 'bb2_NumHDonors', 'bb2_NumHAcceptors', 'bb2_FormalCharge', 'bb2_FractionCSP3', 'bb2_NumHeavyAtoms', 'bb2_NumAromaticRings', 'bb2_NumAliphaticRings', 'bb2_NumSaturatedRings', 'bb2_NumHeteroatoms', 'bb2_NumRings', 'bb2_MolVolume', 'bb2_RadiusOfGyration', 'bb2_InertialShapeFactor', 'bb2_AromaticProportion', 'bb2_HBondPotential', 'bb2_Lipophilicity', 'bb2_ChargeDistribution', 'bb2_ElectroNegativity', 'bb3_MolWt', 'bb3_ExactMolWt', 'b

In [None]:
# joblib.dump(features_cols, 'features_cols_list.joblib')

In [19]:
# Pack all feature columns into a single ML vector column called 'vectors'.
vectorAssembler = VectorAssembler(outputCol='vectors', inputCols=features_cols)

# Keep only the row key, the assembled vector and the label.
full_feat_tok_df_vectors = (
    vectorAssembler
    .transform(full_feat_tok_df)
    .select('id', 'vectors', 'binds')
)
full_feat_tok_df_vectors.show()

[Stage 35:>                                                         (0 + 1) / 1]

+----+--------------------+-----+
|  id|             vectors|binds|
+----+--------------------+-----+
|  26|(190,[0,1,2,3,4,5...|    0|
|  29|(190,[0,1,2,3,4,5...|    0|
| 474|(190,[0,1,2,3,4,5...|    0|
| 964|(190,[0,1,2,3,4,5...|    0|
|1677|(190,[0,1,2,3,4,5...|    0|
|1697|(190,[0,1,2,3,4,5...|    0|
|1806|(190,[0,1,2,3,4,5...|    0|
|1950|(190,[0,1,2,3,4,5...|    0|
|2040|(190,[0,1,2,3,4,5...|    0|
|2214|(190,[0,1,2,3,4,5...|    0|
|2250|(190,[0,1,2,3,4,5...|    0|
|2453|(190,[0,1,2,3,4,5...|    0|
|2509|(190,[0,1,2,3,4,5...|    0|
|2529|(190,[0,1,2,3,4,5...|    0|
|2927|(190,[0,1,2,3,4,5...|    0|
|3091|(190,[0,1,2,3,4,5...|    0|
|3506|(190,[0,1,2,3,4,5...|    0|
|3764|(190,[0,1,2,3,4,5...|    0|
|4590|(190,[0,1,2,3,4,5...|    0|
|4823|(190,[0,1,2,3,4,5...|    0|
+----+--------------------+-----+
only showing top 20 rows



                                                                                

In [20]:
# Fetch the first row to inspect the contents of the assembled vector.
full_feat_tok_df_vectors.take(1)[0]

                                                                                

Row(id=26, vectors=SparseVector(190, {0: 349.386, 1: 349.1314, 2: 3.3917, 3: 75.63, 4: 6.0, 5: 2.0, 6: 3.0, 8: 0.2381, 9: 26.0, 10: 2.0, 11: 1.0, 13: 5.0, 14: 3.0, 18: 0.0769, 19: 5.0, 20: 3.3917, 22: 97.6955, 23: 197.665, 24: 197.0607, 25: 1.5791, 26: 35.25, 27: 3.0, 28: 1.0, 29: 2.0, 31: 0.2, 32: 13.0, 33: 1.0, 36: 3.0, 37: 1.0, 41: 0.0769, 42: 3.0, 43: 1.5791, 45: 55.9094, 46: 266.345, 47: 266.1743, 48: 2.0373, 49: 73.38, 50: 1.0, 51: 1.0, 52: 5.0, 54: 0.6923, 55: 19.0, 56: 1.0, 57: 1.0, 58: 1.0, 59: 6.0, 60: 2.0, 64: 0.0526, 65: 6.0, 66: 2.0373, 68: 72.7174, 69: 790.23, 70: 791.2447, 71: 3.7822, 72: 160.45, 73: 13.0, 74: 4.0, 75: 12.0, 77: 0.4375, 78: 47.0, 79: 3.0, 80: 1.0, 81: 1.0, 82: 15.0, 83: 4.0, 87: 0.0638, 88: 16.0, 89: 3.7822, 91: 173.1318, 94: 9.0, 95: 5.0, 96: 10.0, 97: 20.0, 98: 1.0, 99: 1.0, 101: 2.0, 102: 2.0, 105: 2.0, 106: 1.0, 109: 1.0, 110: 2.0, 112: 1.0, 113: 2.0, 114: 4.0, 115: 1.0, 116: 2.0, 117: 4.0, 118: 1.0, 119: 1.0, 120: 2.0, 121: 5.0, 122: 12.0, 123: 6.0,

In [21]:
# Persist the assembled vectors as parquet, replacing any previous export at this path.
(
    full_feat_tok_df_vectors.write
    .mode('overwrite')
    .parquet('full_feat_tok_df_vectors.parquet')
)

                                                                                

## Making Subset

In [3]:
# Reload the assembled vectors from the parquet export.
full_feat_tok_df_vectors = spark.read.parquet('full_feat_tok_df_vectors.parquet')

# Report the total row count, then preview the first rows.
print(full_feat_tok_df_vectors.count())
full_feat_tok_df_vectors.show()

                                                                                

295246830
+----+--------------------+-----+
|  id|             vectors|binds|
+----+--------------------+-----+
|  39|(190,[0,1,2,3,4,5...|    0|
|  98|(190,[0,1,2,3,4,5...|    0|
| 187|(190,[0,1,2,3,4,5...|    0|
| 496|(190,[0,1,2,3,4,5...|    0|
| 561|(190,[0,1,2,3,4,5...|    0|
| 845|(190,[0,1,2,3,4,5...|    0|
|1297|(190,[0,1,2,3,4,5...|    0|
|1733|(190,[0,1,2,3,4,5...|    0|
|1918|(190,[0,1,2,3,4,5...|    0|
|2451|(190,[0,1,2,3,4,5...|    0|
|2639|(190,[0,1,2,3,4,5...|    0|
|2745|(190,[0,1,2,3,4,5...|    0|
|2806|(190,[0,1,2,3,4,5...|    0|
|2832|(190,[0,1,2,3,4,5...|    0|
|3322|(190,[0,1,2,3,4,5...|    0|
|3751|(190,[0,1,2,3,4,5...|    0|
|3974|(190,[0,1,2,3,4,5...|    0|
|4188|(190,[0,1,2,3,4,5...|    0|
|4296|(190,[0,1,2,3,4,5...|    0|
|4733|(190,[0,1,2,3,4,5...|    0|
+----+--------------------+-----+
only showing top 20 rows



In [4]:
# Stratified sample on the label: keep every binds=1 row and a 0.005150
# fraction of the binds=0 rows; the fixed seed makes the draw repeatable.
df_vectors = full_feat_tok_df_vectors.sampleBy(
    col='binds',
    fractions={0: 0.005150, 1: 1.0},
    seed=42,
)

# Class balance of the sample, its total size, and a preview.
print(df_vectors.groupBy("binds").count().collect())
print(df_vectors.count())
df_vectors.show()

                                                                                

[Row(binds=0, count=1511471), Row(binds=1, count=1589906)]


                                                                                

3101377
+------+--------------------+-----+
|    id|             vectors|binds|
+------+--------------------+-----+
| 25160|(190,[0,1,2,3,4,5...|    1|
| 48001|(190,[0,1,2,3,4,5...|    0|
| 58144|(190,[0,1,2,3,4,5...|    0|
|148672|(190,[0,1,2,3,4,5...|    1|
|233007|(190,[0,1,2,3,4,5...|    0|
|234967|(190,[0,1,2,3,4,5...|    0|
|273952|(190,[0,1,2,3,4,5...|    0|
|320887|(190,[0,1,2,3,4,5...|    0|
|330585|(190,[0,1,2,3,4,5...|    0|
|330737|(190,[0,1,2,3,4,5...|    0|
|359870|(190,[0,1,2,3,4,5...|    0|
|412850|(190,[0,1,2,3,4,5...|    0|
|433891|(190,[0,1,2,3,4,5...|    1|
|465382|(190,[0,1,2,3,4,5...|    1|
|468070|(190,[0,1,2,3,4,5...|    0|
|504397|(190,[0,1,2,3,4,5...|    0|
|510409|(190,[0,1,2,3,4,5...|    1|
|510880|(190,[0,1,2,3,4,5...|    1|
|511051|(190,[0,1,2,3,4,5...|    1|
|560671|(190,[0,1,2,3,4,5...|    0|
+------+--------------------+-----+
only showing top 20 rows



///////////////////////////////////////////////////////////////////////////////////////////////////

In [9]:
# Collapse the two indicator columns into a single integer code:
#   protein_0 == 0 and protein_1 == 0 -> 2
#   protein_0 == 1                    -> 0
#   protein_1 == 1                    -> 1
#   anything else                     -> null
full_df_vectors = full_df_vectors.withColumn(
    "protein",
    when((col("protein_0") == 0) & (col("protein_1") == 0), 2)
    .when(col("protein_0") == 1, 0)
    .when(col("protein_1") == 1, 1)
    .otherwise(None),
)

full_df_vectors.show()

[Stage 5:>                                                          (0 + 1) / 1]

+---------+--------------------+-----+---------+---------+-------+
|       id|             vectors|binds|protein_0|protein_1|protein|
+---------+--------------------+-----+---------+---------+-------+
|193986560|[360.369000000000...|    0|      0.0|      0.0|      2|
|193986561|[360.369000000000...|    0|      1.0|      0.0|      0|
|193986562|[360.369000000000...|    0|      0.0|      1.0|      1|
|193986563|[360.369000000000...|    0|      0.0|      0.0|      2|
|193986564|[360.369000000000...|    0|      1.0|      0.0|      0|
|193986565|[360.369000000000...|    0|      0.0|      1.0|      1|
|193986566|[360.369000000000...|    0|      0.0|      0.0|      2|
|193986567|[360.369000000000...|    0|      1.0|      0.0|      0|
|193986568|[360.369000000000...|    0|      0.0|      1.0|      1|
|193986569|[360.369000000000...|    0|      0.0|      0.0|      2|
|193986570|[360.369000000000...|    0|      1.0|      0.0|      0|
|193986571|[360.369000000000...|    0|      0.0|      1.0|    

                                                                                

In [21]:
# Row count for each value of the derived protein code.
full_df_vectors.groupBy("protein").count().collect()

                                                                                

[Row(protein=1, count=98415610),
 Row(protein=2, count=98415610),
 Row(protein=0, count=98415610)]

In [28]:
# Row count for each binds label on the full frame.
print(full_df_vectors.groupBy("binds").count().collect())



[Row(binds=0, count=293656924), Row(binds=1, count=1589906)]


                                                                                

In [17]:
# Expected sample sizes: the binds=0 count times the 0.005150 sampling fraction,
# and the binds=1 count kept in full (the two literals match the counts printed above).
293656924*0.005150, 1589906*1

(1512333.1586, 1589906)

In [19]:
# Stratified sample on the label: all binds=1 rows plus a 0.005150 fraction
# of the binds=0 rows, drawn with a fixed seed.
df_vectors = full_df_vectors.sampleBy(
    col='binds',
    fractions={0: 0.005150, 1: 1.0},
    seed=42,
)

# Composition of the sample by label and by protein code, its size, and a preview.
print(df_vectors.groupBy("binds").count().collect())
print(df_vectors.groupBy("protein").count().collect())
print(df_vectors.count())
df_vectors.show()

                                                                                

[Row(binds=0, count=1513713), Row(binds=1, count=1589906)]


                                                                                

[Row(protein=1, count=912994), Row(protein=2, count=1227916), Row(protein=0, count=962709)]
3103619
+---------+--------------------+-----+---------+---------+-------+
|       id|             vectors|binds|protein_0|protein_1|protein|
+---------+--------------------+-----+---------+---------+-------+
|193986815|[360.369000000000...|    0|      0.0|      0.0|      2|
|193986873|(94,[0,1,2,3,4,5,...|    0|      1.0|      0.0|      0|
|193987759|[360.369000000000...|    0|      0.0|      1.0|      1|
|193987768|[360.369000000000...|    0|      0.0|      1.0|      1|
|193987965|[360.369000000000...|    0|      1.0|      0.0|      0|
|193988203|[360.369000000000...|    0|      0.0|      1.0|      1|
|193988251|[360.369000000000...|    0|      0.0|      1.0|      1|
|193988253|[360.369000000000...|    0|      1.0|      0.0|      0|
|193988384|(94,[0,1,2,3,4,5,...|    0|      0.0|      0.0|      2|
|193988659|[360.369000000000...|    0|      0.0|      1.0|      1|
|193988937|(94,[0,1,2,3,4,5,.

In [15]:
from pyspark.sql import functions as F


def _binds_count_where(condition):
    """Count rows of ``full_df_vectors`` per ``binds`` value, restricted to ``condition``.

    Args:
        condition: boolean Column expression used to filter ``full_df_vectors``.

    Returns:
        DataFrame with columns ``binds`` and ``binds_count``.
    """
    return (
        full_df_vectors.filter(condition)
        .groupBy("binds")
        .agg(F.count("*").alias("binds_count"))
    )


# Rows where protein_0 == 1
result_protein_0_1 = _binds_count_where(F.col("protein_0") == 1)

# Rows where protein_1 == 1
result_protein_1_1 = _binds_count_where(F.col("protein_1") == 1)

# Rows where protein_0 == 0 and protein_1 == 0
result_protein_0_0_and_protein_1_0 = _binds_count_where(
    (F.col("protein_0") == 0) & (F.col("protein_1") == 0)
)

# Show results
result_protein_0_1.show()
result_protein_1_1.show()
result_protein_0_0_and_protein_1_0.show()

                                                                                

+-----+-----------+
|binds|binds_count|
+-----+-----------+
|    0|   97958646|
|    1|     456964|
+-----+-----------+



                                                                                

+-----+-----------+
|binds|binds_count|
+-----+-----------+
|    0|   98007200|
|    1|     408410|
+-----+-----------+





+-----+-----------+
|binds|binds_count|
+-----+-----------+
|    0|   97691078|
|    1|     724532|
+-----+-----------+



                                                                                

In [46]:
# Preview the sampled frame.
df_vectors.show()

[Stage 63:>                                                         (0 + 1) / 1]

+---------+--------------------+-----+---------+---------+-------+
|       id|             vectors|binds|protein_0|protein_1|protein|
+---------+--------------------+-----+---------+---------+-------+
|193996160|(94,[0,1,2,3,4,5,...|    0|      0.0|      0.0|      2|
|193997410|[360.369000000000...|    0|      0.0|      1.0|      1|
|194001563|[360.369000000000...|    0|      0.0|      0.0|      2|
|194021946|(94,[0,1,2,3,4,5,...|    0|      1.0|      0.0|      0|
|194027118|(94,[0,1,2,3,4,5,...|    0|      1.0|      0.0|      0|
|194027972|[360.369000000000...|    0|      0.0|      0.0|      2|
|194030593|[360.369000000000...|    1|      0.0|      1.0|      1|
|194030875|[360.369000000000...|    0|      0.0|      1.0|      1|
|194032258|[360.369000000000...|    0|      0.0|      1.0|      1|
|194033516|[360.369000000000...|    0|      0.0|      0.0|      2|
|194033893|[360.369000000000...|    0|      0.0|      1.0|      1|
|194034475|[360.369000000000...|    1|      0.0|      1.0|    

                                                                                

In [14]:
# Row count for each binds label in the sampled frame.
df_vectors.groupBy("binds").count().collect()

                                                                                

[Row(binds=0, count=5916), Row(binds=1, count=37)]

In [13]:
# Total number of rows in the sampled frame.
df_vectors.count()

                                                                                

5953

In [8]:
# Redistribute the full frame across 200 partitions.
full_df_vectors = full_df_vectors.repartition(200)

//////////////////////////////////////////////////////////////////////////////////

In [None]:
from pyspark.ml.clustering import KMeans

# Two-cluster k-means on the assembled vectors; assignments go to a 'cluster' column.
kmeans = (
    KMeans()
    .setK(2)
    .setSeed(42)
    .setFeaturesCol('vectors')
    .setPredictionCol("cluster")
)
model = kmeans.fit(full_df_vectors)

# Assign each row to a cluster.
predictions = model.transform(full_df_vectors)

# Show cluster assignments next to the binds label.
predictions.select("cluster", "binds").show()

In [None]:
# Save the fitted KMeans model to the "KMeans_model" directory.
model.save("KMeans_model")

# To load the model later (use the same directory that was passed to save):
# from pyspark.ml.clustering import KMeansModel
# loaded_model = KMeansModel.load("KMeans_model")

In [7]:
from pyspark.ml.classification import RandomForestClassifier

# Fit a 100-tree random forest on the full frame and persist it under "rf_model".
rf_model = RandomForestClassifier(
    numTrees=100,
    featuresCol="vectors",
    labelCol="binds",
    probabilityCol="probability",
).fit(full_df_vectors)
rf_model.save("rf_model")

# Score the same frame the model was fitted on and show untruncated results.
predictions = rf_model.transform(full_df_vectors)
predictions.select("id", "binds", "prediction", "probability").show(truncate=False)

24/12/30 09:11:07 WARN MemoryStore: Not enough space to cache rdd_38_54 in memory! (computed 582.2 MiB so far)
24/12/30 09:11:07 WARN BlockManager: Persisting block rdd_38_54 to disk instead.
24/12/30 09:11:07 WARN MemoryStore: Not enough space to cache rdd_38_3 in memory! (computed 582.2 MiB so far)
24/12/30 09:11:07 WARN BlockManager: Persisting block rdd_38_3 to disk instead.
24/12/30 09:11:07 WARN MemoryStore: Not enough space to cache rdd_38_28 in memory! (computed 582.2 MiB so far)
24/12/30 09:11:07 WARN BlockManager: Persisting block rdd_38_28 to disk instead.
24/12/30 09:11:07 WARN MemoryStore: Not enough space to cache rdd_38_61 in memory! (computed 582.2 MiB so far)
24/12/30 09:11:07 WARN BlockManager: Persisting block rdd_38_61 to disk instead.
24/12/30 09:11:07 WARN MemoryStore: Not enough space to cache rdd_38_24 in memory! (computed 582.2 MiB so far)
24/12/30 09:11:07 WARN BlockManager: Persisting block rdd_38_24 to disk instead.
24/12/30 09:11:07 WARN MemoryStore: Not en



24/12/30 09:21:20 WARN MemoryStore: Not enough space to cache rdd_38_2 in memory! (computed 162.4 MiB so far)
24/12/30 09:21:20 WARN MemoryStore: Not enough space to cache rdd_38_22 in memory! (computed 162.4 MiB so far)
24/12/30 09:21:20 WARN MemoryStore: Not enough space to cache rdd_38_65 in memory! (computed 162.4 MiB so far)
24/12/30 09:21:20 WARN MemoryStore: Not enough space to cache rdd_38_47 in memory! (computed 162.4 MiB so far)
24/12/30 09:21:20 WARN MemoryStore: Not enough space to cache rdd_38_5 in memory! (computed 162.4 MiB so far)
24/12/30 09:21:20 WARN MemoryStore: Not enough space to cache rdd_38_59 in memory! (computed 162.4 MiB so far)
24/12/30 09:21:20 WARN MemoryStore: Not enough space to cache rdd_38_61 in memory! (computed 162.4 MiB so far)
24/12/30 09:21:20 WARN MemoryStore: Not enough space to cache rdd_38_52 in memory! (computed 162.4 MiB so far)
24/12/30 09:21:20 WARN MemoryStore: Not enough space to cache rdd_38_32 in memory! (computed 162.4 MiB so far)
24/

+---------+-----+----------+-----------------------------------------+
|id       |binds|prediction|probability                              |
+---------+-----+----------+-----------------------------------------+
|193986560|0    |0.0       |[0.9946306076488763,0.005369392351123681]|
|193986561|0    |0.0       |[0.9946306076488763,0.005369392351123681]|
|193986562|0    |0.0       |[0.9946306076488763,0.005369392351123681]|
|193986563|0    |0.0       |[0.9946306076488763,0.005369392351123681]|
|193986564|0    |0.0       |[0.9946306076488763,0.005369392351123681]|
|193986565|0    |0.0       |[0.9946306076488763,0.005369392351123681]|
|193986566|0    |0.0       |[0.9946306076488763,0.005369392351123681]|
|193986567|0    |0.0       |[0.9946306076488763,0.005369392351123681]|
|193986568|0    |0.0       |[0.9946306076488763,0.005369392351123681]|
|193986569|0    |0.0       |[0.9946306076488763,0.005369392351123681]|
|193986570|0    |0.0       |[0.9946306076488763,0.005369392351123681]|
|19398

                                                                                

In [None]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Gradient-boosted tree classifier on the assembled vectors, predicting `binds`.
gbt = GBTClassifier(featuresCol="vectors", labelCol="binds")

# Search grid: 3 depths x 3 iteration counts x 3 step sizes = 27 combinations.
paramGrid = (ParamGridBuilder()
             .addGrid(gbt.maxDepth, [3, 5, 7])
             .addGrid(gbt.maxIter, [50, 100, 200])
             .addGrid(gbt.stepSize, [0.05, 0.1, 0.2])
             .build())

# Model selection is driven by the area under the precision-recall curve.
evaluator = BinaryClassificationEvaluator(labelCol="binds", metricName="areaUnderPR")

# 5-fold cross-validation over the grid.
cv = CrossValidator(estimator=gbt,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)

# Fit the CrossValidator
cv_model = cv.fit(full_df_vectors)

# Get the best model
best_model = cv_model.bestModel

# Save the best model. The original called the non-existent `selectave`, which
# would raise AttributeError only after the entire cross-validation had run.
best_model.save("best_gbt_model")

# Make predictions
predictions = best_model.transform(full_df_vectors)

# Show predictions
predictions.select("id", "binds", "prediction", "probability").show(truncate=False)


In [None]:
from pyspark.ml.classification import GBTClassifier

# Weighted GBT fit on the full frame: 100 boosting iterations, per-row weights
# read from the "sample_weights" column, and at least 450000 instances per node.
gbt_model = GBTClassifier(
    labelCol="binds",
    featuresCol="vectors",
    weightCol="sample_weights",
    maxIter=100,
    minInstancesPerNode=450000,
    maxMemoryInMB=3 * 1024,
    cacheNodeIds=True,
).fit(full_df_vectors)
gbt_model.save("gbt_model")

# Score the frame the model was fitted on and show untruncated results.
predictions = gbt_model.transform(full_df_vectors)
predictions.select("id", "binds", "prediction", "probability").show(truncate=False)

In [None]:
# Preview the scored frame with all of its columns.
predictions.show()

/////////////////////////////////////////////////////////////////////////////////////

## GBT Classifier

In [None]:
from pyspark.ml.classification import GBTClassifier

# Unweighted GBT with 500 boosting iterations, fitted on the sampled frame.
# (weightCol="sample_weights" and minInstancesPerNode=450000 are deliberately
# left out in this variant.)
gbt_model = (
    GBTClassifier(
        labelCol="binds",
        featuresCol="vectors",
        maxIter=500,
        maxMemoryInMB=3 * 1024,
        cacheNodeIds=True,
    )
    .setProbabilityCol('probability')
    .fit(df_vectors)
)

In [None]:
# Persist the fitted model under "gbt_model6".
gbt_model.save("gbt_model6")

# Score the sampled frame and the full vector frame with the same model.
predictions = gbt_model.transform(df_vectors)
full_predictions = gbt_model.transform(full_feat_tok_df_vectors)

# Untruncated view of the predictions on the sampled frame.
predictions.select("id", "binds", "prediction", "probability").show(truncate=False)

In [8]:
# Untruncated view of the predictions on the full vector frame.
full_predictions.select("id", "binds", "prediction", "probability").show(truncate=False)

+----+-----+----------+----------------------------------------+
|id  |binds|prediction|probability                             |
+----+-----+----------+----------------------------------------+
|39  |0    |0.0       |[0.8477757783732623,0.15222422162673765]|
|98  |0    |0.0       |[0.8700648978133694,0.12993510218663062]|
|187 |0    |0.0       |[0.9434702478690092,0.05652975213099076]|
|496 |0    |0.0       |[0.8563478475828599,0.14365215241714013]|
|561 |0    |0.0       |[0.8145391526805389,0.1854608473194611] |
|845 |0    |0.0       |[0.8075347851569995,0.1924652148430005] |
|1297|0    |0.0       |[0.8675385556403445,0.13246144435965546]|
|1733|0    |0.0       |[0.7745868077210596,0.2254131922789404] |
|1918|0    |0.0       |[0.8691851612081525,0.13081483879184752]|
|2451|0    |0.0       |[0.8253939852714554,0.17460601472854465]|
|2639|0    |0.0       |[0.9008653078113289,0.0991346921886711] |
|2745|0    |0.0       |[0.8398572905604071,0.16014270943959286]|
|2806|0    |0.0       |[0

In [21]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
from pyspark.ml.functions import vector_to_array


def get_class_1_probability(probability):
    """Return the entry at index 1 of ``probability`` as a Python float."""
    return float(probability[1])


# Still defined so that any cell referring to the UDF by name keeps working.
get_class_1_probability_udf = udf(get_class_1_probability, DoubleType())

# Extract index 1 of the probability vector with the built-in vector_to_array
# rather than the Python UDF: same double value, but without shipping every row
# through a Python worker.
predictions = predictions.withColumn(
    "probability_of_class_1",
    vector_to_array(col("probability"))[1],
)

[Stage 1086:>                                                       (0 + 1) / 1]

+------+--------------------+-----+--------------------+--------------------+----------+----------------------+
|    id|             vectors|binds|       rawPrediction|         probability|prediction|probability_of_class_1|
+------+--------------------+-----+--------------------+--------------------+----------+----------------------+
| 25160|(190,[0,1,2,3,4,5...|    1|[-1.4883805354393...|[0.04848684118280...|       1.0|    0.9515131588171953|
| 48001|(190,[0,1,2,3,4,5...|    0|[1.31129866424674...|[0.93230182134306...|       0.0|   0.06769817865693717|
| 58144|(190,[0,1,2,3,4,5...|    0|[0.38782458996920...|[0.68474165593869...|       0.0|   0.31525834406130615|
|148672|(190,[0,1,2,3,4,5...|    1|[0.51863751912868...|[0.73832388077091...|       0.0|    0.2616761192290853|
|233007|(190,[0,1,2,3,4,5...|    0|[0.70689524273148...|[0.80436311438802...|       0.0|   0.19563688561197012|
|234967|(190,[0,1,2,3,4,5...|    0|[0.82760263799361...|[0.83959331793844...|       0.0|   0.16040668206

                                                                                

In [24]:
# Preview the scored full frame with all of its columns.
full_predictions.show()

+----+--------------------+-----+--------------------+--------------------+----------+
|  id|             vectors|binds|       rawPrediction|         probability|prediction|
+----+--------------------+-----+--------------------+--------------------+----------+
|  39|(190,[0,1,2,3,4,5...|    0|[0.85863080617962...|[0.84777577837326...|       0.0|
|  98|(190,[0,1,2,3,4,5...|    0|[0.95076634609825...|[0.87006489781336...|       0.0|
| 187|(190,[0,1,2,3,4,5...|    0|[1.40739887228880...|[0.94347024786900...|       0.0|
| 496|(190,[0,1,2,3,4,5...|    0|[0.89264094426188...|[0.85634784758285...|       0.0|
| 561|(190,[0,1,2,3,4,5...|    0|[0.73988935118472...|[0.81453915268053...|       0.0|
| 845|(190,[0,1,2,3,4,5...|    0|[0.71703534823892...|[0.80753478515699...|       0.0|
|1297|(190,[0,1,2,3,4,5...|    0|[0.93968416912939...|[0.86753855564034...|       0.0|
|1733|(190,[0,1,2,3,4,5...|    0|[0.61719730382824...|[0.77458680772105...|       0.0|
|1918|(190,[0,1,2,3,4,5...|    0|[0.9468866

In [None]:
# Area under the precision-recall curve, computed from the raw prediction
# column against the `binds` label.
pr_auc_evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderPR",
    labelCol="binds",
    rawPredictionCol="rawPrediction",
)

# First on `predictions` ...
pr_auc = pr_auc_evaluator.evaluate(predictions)
print(f"PR AUC (approx MAP): {pr_auc}")

# ... then on `full_predictions` (pr_auc is overwritten with this second value).
pr_auc = pr_auc_evaluator.evaluate(full_predictions)
print(f"PR AUC (approx MAP): {pr_auc}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Tally every (binds, prediction) pair in a single aggregation. The original
# ran four separate filter+count jobs, each of which had to re-evaluate
# full_predictions from scratch.
pair_counts = {
    (row["binds"], row["prediction"]): row["count"]
    for row in full_predictions.groupBy("binds", "prediction").count().collect()
}

# A missing pair means zero such rows. Pairs with any other value (including
# null) are ignored, exactly as the equality filters ignored them.
TP = pair_counts.get((1, 1), 0)
FP = pair_counts.get((0, 1), 0)
TN = pair_counts.get((0, 0), 0)
FN = pair_counts.get((1, 0), 0)

print(TP, FP, TN, FN)

# Rows are true labels, columns are predicted labels.
confusion_matrix = [
    [TN, FP],
    [FN, TP]
]

plt.figure(figsize=(5, 3))
sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=["Class 0", "Class 1"], yticklabels=["Class 0", "Class 1"])

plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

In [25]:
# Precision = TP / (TP + FP)  -> share of predicted positives that are real binders
# Recall    = TP / (TP + FN)  -> share of real binders that were predicted positive
# Either metric falls back to 0 when its denominator is empty.
predicted_positive = TP + FP
actual_positive = TP + FN

if predicted_positive > 0:
    precision = TP / predicted_positive
else:
    precision = 0

if actual_positive > 0:
    recall = TP / actual_positive
else:
    recall = 0

precision, recall

(0.04537597568746185, 0.8295679115620672)

# Spark XGB Classifier

In [3]:
# Load the pre-computed feature-vector table and report its size before previewing it.
vectors_path = 'full_feat_tok_df_vectors.parquet'
full_feat_tok_df_vectors = spark.read.format('parquet').load(vectors_path)

row_count = full_feat_tok_df_vectors.count()
print(row_count)
full_feat_tok_df_vectors.show()

                                                                                

295246830
+----+--------------------+-----+
|  id|             vectors|binds|
+----+--------------------+-----+
|  39|(190,[0,1,2,3,4,5...|    0|
|  98|(190,[0,1,2,3,4,5...|    0|
| 187|(190,[0,1,2,3,4,5...|    0|
| 496|(190,[0,1,2,3,4,5...|    0|
| 561|(190,[0,1,2,3,4,5...|    0|
| 845|(190,[0,1,2,3,4,5...|    0|
|1297|(190,[0,1,2,3,4,5...|    0|
|1733|(190,[0,1,2,3,4,5...|    0|
|1918|(190,[0,1,2,3,4,5...|    0|
|2451|(190,[0,1,2,3,4,5...|    0|
|2639|(190,[0,1,2,3,4,5...|    0|
|2745|(190,[0,1,2,3,4,5...|    0|
|2806|(190,[0,1,2,3,4,5...|    0|
|2832|(190,[0,1,2,3,4,5...|    0|
|3322|(190,[0,1,2,3,4,5...|    0|
|3751|(190,[0,1,2,3,4,5...|    0|
|3974|(190,[0,1,2,3,4,5...|    0|
|4188|(190,[0,1,2,3,4,5...|    0|
|4296|(190,[0,1,2,3,4,5...|    0|
|4733|(190,[0,1,2,3,4,5...|    0|
+----+--------------------+-----+
only showing top 20 rows



In [4]:
# Per-class row counts, then inverse-frequency weights: weight = total rows / rows in class.
class_counts = full_feat_tok_df_vectors.groupBy("binds").count().collect()

rows_per_class = {row["binds"]: row["count"] for row in class_counts}
total_count = sum(rows_per_class.values())
class_weights = {
    label: total_count / n_rows
    for label, n_rows in rows_per_class.items()
}

print(class_counts, total_count, class_weights)



[Row(binds=0, count=293656924), Row(binds=1, count=1589906)] 295246830 {0: 1.005414161458696, 1: 185.70080872705682}


                                                                                

In [5]:
# Attach a per-row training weight chosen by the row's class label.
# Rows whose `binds` is neither 0 nor 1 get a null weight (no otherwise() branch).
negative_weight = class_weights[0]
positive_weight = class_weights[1]

sample_weight_expr = (
    when(col("binds") == 0, negative_weight)
    .when(col("binds") == 1, positive_weight)
)

full_feat_tok_df_vectors = full_feat_tok_df_vectors.withColumn("sample_weights", sample_weight_expr)
full_feat_tok_df_vectors.show()

+----+--------------------+-----+-----------------+
|  id|             vectors|binds|   sample_weights|
+----+--------------------+-----+-----------------+
|  39|(190,[0,1,2,3,4,5...|    0|1.005414161458696|
|  98|(190,[0,1,2,3,4,5...|    0|1.005414161458696|
| 187|(190,[0,1,2,3,4,5...|    0|1.005414161458696|
| 496|(190,[0,1,2,3,4,5...|    0|1.005414161458696|
| 561|(190,[0,1,2,3,4,5...|    0|1.005414161458696|
| 845|(190,[0,1,2,3,4,5...|    0|1.005414161458696|
|1297|(190,[0,1,2,3,4,5...|    0|1.005414161458696|
|1733|(190,[0,1,2,3,4,5...|    0|1.005414161458696|
|1918|(190,[0,1,2,3,4,5...|    0|1.005414161458696|
|2451|(190,[0,1,2,3,4,5...|    0|1.005414161458696|
|2639|(190,[0,1,2,3,4,5...|    0|1.005414161458696|
|2745|(190,[0,1,2,3,4,5...|    0|1.005414161458696|
|2806|(190,[0,1,2,3,4,5...|    0|1.005414161458696|
|2832|(190,[0,1,2,3,4,5...|    0|1.005414161458696|
|3322|(190,[0,1,2,3,4,5...|    0|1.005414161458696|
|3751|(190,[0,1,2,3,4,5...|    0|1.005414161458696|
|3974|(190,[

In [27]:
# Estimator configuration: sparse feature vectors in, `binds` as the label,
# and the per-row class weights to counter the class imbalance.
# NOTE(review): the fit log below reports "on 1 workers" with nthread=1, i.e.
# the defaults leave training on a single task - consider setting `num_workers`
# to match the cluster; confirm against the Spark session config.
xgb_params = {
    "features_col": "vectors",
    "label_col": "binds",
    "weight_col": "sample_weights",
    "max_depth": 6,
}
xgb_classifier = SparkXGBClassifier(**xgb_params)

In [None]:
# Train the weighted XGBoost classifier on the full feature table; `fit` returns
# the trained model that the later save/predict cells use.
# NOTE(review): the log output of this cell shows a single worker with nthread=1,
# and roughly 18 minutes elapsed between the two log timestamps before boosting started.
xgb_model = xgb_classifier.fit(full_feat_tok_df_vectors)

2025-01-01 23:06:34,405 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'objective': 'binary:logistic', 'device': 'cpu', 'max_depth': 6, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2025-01-01 23:25:01,038 INFO XGBoost-PySpark: _train_booster Training on CPUs 1]
[23:25:02] Task 0 got rank 0
[Stage 25:>                                                         (0 + 1) / 1]

In [9]:
# Persist the trained model. Plain `save()` raises when the target path already
# exists, which makes the notebook fail on any re-run after a long training job;
# going through the writer with overwrite() makes this cell safely re-runnable.
# NOTE: this replaces whatever checkpoint is already stored at `model_path`.
model_path = "checkpoints/_3_XGB_feat_tok"
xgb_model.write().overwrite().save(model_path)

                                                                                

In [None]:
# Score the feature table with the FITTED model. `xgb_classifier` is the
# unfitted estimator (it only provides `fit`); `transform` lives on the model
# returned by `fit`, so calling it on the estimator fails.
# Note: this scores the same DataFrame the model was trained on, so metrics
# computed from `predictions` are training-set metrics.
predictions = xgb_model.transform(full_feat_tok_df_vectors)
predictions.show()

In [None]:
# Area under the precision-recall curve for the XGBoost predictions,
# using `binds` as the label and `rawPrediction` as the score column.
pr_auc_evaluator = BinaryClassificationEvaluator(
    labelCol="binds",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderPR",
)

pr_auc = pr_auc_evaluator.evaluate(predictions)
print(f"PR AUC (approx MAP): {pr_auc}")