In [None]:
import pandas as pd
import numpy as np
import os
os.environ['KAGGLE_USERNAME'] = 'aashish31476'
os.environ['KAGGLE_KEY'] = '306e7cb470122dd3f86d4c7db6d197a9'

import dask
# dask.config.set(scheduler='threads')
import dask.dataframe as dd

from collections import Counter
import re

import joblib
from tqdm.auto import trange, tqdm
from IPython.display import display

import torch
from torch.amp import autocast
from torch.utils.data import IterableDataset, DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'
print(device)

protein_map = {'BRD4': 1, 'HSA': 2, 'sEH': 3}

In [1]:
!kaggle competitions files leash-BELKA

name                    size  creationDate         
---------------------  -----  -------------------  
sample_submission.csv   22MB  2024-03-20 23:16:46  
test.csv               297MB  2024-03-20 23:16:46  
train.parquet            3GB  2024-03-20 23:16:46  
train.csv               50GB  2024-03-20 23:16:46  
test.parquet            29MB  2024-03-20 23:16:46  


In [2]:
# !kaggle competitions download leash-BELKA -f train.parquet
# !kaggle competitions download leash-BELKA -f test.parquet
!kaggle competitions download leash-BELKA -f sample_submission.csv
# !unzip train.parquet.zip
# !unzip test.parquet.zip
# !pip install dask[dataframe]

Downloading sample_submission.csv.zip to /home/23m1521/ashish/kaggle
 80%|██████████████████████████████▎       | 3.00M/3.76M [00:01<00:00, 2.38MB/s]
100%|██████████████████████████████████████| 3.76M/3.76M [00:01<00:00, 2.17MB/s]


In [None]:
def make_vocab(dff, update=None):
    letter_counts = Counter(update) if update else Counter()
    l = dff.drop(columns=['id', 'protein_name', 'binds']).to_numpy().flatten()
    for text in tqdm(l, desc='making vocab'):
        text = re.sub(r'[\d()\[\]{}]+', '', text)
        # letter_counts.update(char for char in text)
        letter_counts.update(text)
    return dict(letter_counts)

def make_counter(l):
    letter_counts = Counter()
    for text in l:
        text = re.sub(r'[\d()\[\]{}]+', '', text)
        letter_counts.update(char for char in text)
    return dict(letter_counts)

def allign_counter_to_vocab(counter, vocab):
    temp = {}
    for i in range(len(vocab.keys())):
        if list(vocab.keys())[i] in counter.keys():
            temp[list(vocab.keys())[i]] = counter[list(vocab.keys())[i]]
        else:
            temp[list(vocab.keys())[i]] = 0
    return temp

def make_features(df, vocab):
    id = df['id'].to_numpy()
    smiles = df.drop(columns=['id', 'protein_name', 'binds']).to_numpy()
    protein = df['protein_name'].to_numpy()
    y = df['binds'].to_numpy()

    df_features = {'id':[], 'bb1':[], 'bb2':[], 'bb3':[], 'molecule':[], 'protein':[], 'y':[]}
    for i in trange(len(id), desc='making features'):
        df_features['id'].append(id[i])

        counter = make_counter(smiles[i][0])
        df_features['bb1'].append(allign_counter_to_vocab(counter, vocab))

        counter = make_counter(smiles[i][1])
        df_features['bb2'].append(allign_counter_to_vocab(counter, vocab))

        counter = make_counter(smiles[i][2])
        df_features['bb3'].append(allign_counter_to_vocab(counter, vocab))

        counter = make_counter(smiles[i][3])
        df_features['molecule'].append(allign_counter_to_vocab(counter, vocab))

        df_features['protein'].append(protein[i])
        df_features['y'].append(y[i])

    return df_features

def check_df_allignment(dff_features, vocab):
    flag = True
    for i in trange(len(dff_features['bb1'])):
        if dff_features['bb1'][i].keys() != vocab.keys():
            print(dff_features['bb1'][i].keys())
            print(vocab.keys())
            flag = False
            break
    return flag


def df_vectors(dff_features, vocab, protein_map):
    op = np.empty((100,7))
    for i in trange(0,len(dff_features['id']),100, desc='Making vector df'):
        df = pd.DataFrame({
            'id': dff_features['id'][i:i+100],
            'bb1': dff_features['bb1'][i:i+100],
            'bb2': dff_features['bb2'][i:i+100],
            'bb3': dff_features['bb3'][i:i+100],
            'molecule': dff_features['molecule'][i:i+100],
            'protein': dff_features['protein'][i:i+100],
            'y': dff_features['y'][i:i+100]
        })

        df.bb1 = df.bb1.apply(lambda x: list(x.values()))
        df.bb2 = df.bb2.apply(lambda x: list(x.values()))
        df.bb3 = df.bb3.apply(lambda x: list(x.values()))
        df.molecule = df.molecule.apply(lambda x: list(x.values()))
        df.protein = df.protein.map(protein_map)

        op = np.concatenate((op, df.to_numpy()))

    return op[100:]


def process_row(row, protein_map=protein_map):
    return {
             'id': row['id'],
             'bb1': list(allign_counter_to_vocab(make_counter(row['buildingblock1_smiles']), vocab).values()),
             'bb2': list(allign_counter_to_vocab(make_counter(row['buildingblock2_smiles']), vocab).values()),
             'bb3': list(allign_counter_to_vocab(make_counter(row['buildingblock3_smiles']), vocab).values()),
             'molecule': list(allign_counter_to_vocab(make_counter(row['molecule_smiles']), vocab).values()),
             'protein': protein_map[row['protein_name']],
             'y': row['binds']
        }

def split(path):
    dask_df = dd.read_parquet(path)
    train_fraction = 0.8
    train_df, val_df = dask_df.random_split([train_fraction, 1 - train_fraction], random_state=42)
    print(f"Train size: {train_df.shape[0].compute()}")
    print(f"Validation size: {val_df.shape[0].compute()}")
    train_df.to_parquet("train_split.parquet", write_index=False)
    val_df.to_parquet("val_split.parquet", write_index=False)

In [None]:
# df = dd.read_parquet("/kaggle/input/leash-BELKA/train.parquet")

# vocab = None

# for c, partition in tqdm(enumerate(df.to_delayed()), total=len(df.to_delayed())):
#     chunk = partition.compute()  # Compute the current partition
#     vocab = make_vocab(chunk) if vocab is None else make_vocab(chunk, vocab)  # Update vocab
#     joblib.dump(vocab, 'vocab.joblib')  # Save intermediate vocab


# print(vocab)

In [None]:
vocab = {'C': 6825082866, '#': 81527490, '@': 511451694, 'H': 456489972, '=': 1406606874, 'O': 2554179786, 'N': 2469595230, 'c': 12257477022, '-': 438483636, '.': 216945504, 'l': 491088828, 'B': 123330132, 'r': 121915914, 'n': 1997759694, 'D': 295246830, 'y': 295246830, 'o': 67918650, 's': 156618468, 'S': 90662574, 'F': 492710238, '+': 65206260, 'i': 1414026, '/': 11547096, 'I': 23972994}

In [None]:
# vocab = joblib.load('vocab.joblib')
# vocab

In [None]:
class ParquetDataset(IterableDataset):
    def __init__(self, dask_df, vocab=vocab, protein_map=protein_map, transform=None):
        self.dask_df = dask_df
        self.partitions = self.dask_df.to_delayed()
        self.vocab = vocab
        self.protein_map = protein_map
        self.transform = transform
        

    def __iter__(self):
        for partition in self.partitions:
            chunk = partition.compute()
            for _, row in chunk.iterrows():
                yield self.process_row(row)

    def process_row(self, row):
        data = {
            'id': row['id'],
            'bb1': list(allign_counter_to_vocab(make_counter(row['buildingblock1_smiles']), self.vocab).values()),
            'bb2': list(allign_counter_to_vocab(make_counter(row['buildingblock2_smiles']), self.vocab).values()),
            'bb3': list(allign_counter_to_vocab(make_counter(row['buildingblock3_smiles']), self.vocab).values()),
            'molecule': list(allign_counter_to_vocab(make_counter(row['molecule_smiles']), self.vocab).values()),
            'protein': self.protein_map[row['protein_name']],
            'y': row['binds']
        }
        if self.transform:
            data = self.transform(data)
        return data


def custom_collate_fn(batch):
    ids = [sample['id'] for sample in batch]
    bb1 = torch.tensor([sample['bb1'] for sample in batch], dtype=torch.float32)
    bb2 = torch.tensor([sample['bb2'] for sample in batch], dtype=torch.float32)
    bb3 = torch.tensor([sample['bb3'] for sample in batch], dtype=torch.float32)
    molecule = torch.tensor([sample['molecule'] for sample in batch], dtype=torch.float32)
    protein = torch.tensor([sample['protein'] for sample in batch], dtype=torch.long)
    y = torch.tensor([sample['y'] for sample in batch], dtype=torch.float32)

    combined_features = torch.cat([bb1, bb2, bb3, molecule, protein.unsqueeze(1)], dim=1)

    return (
        ids,
        combined_features,
        y
    )

In [None]:
dask_df = dd.read_parquet("/kaggle/input/leash-BELKA/train.parquet")

df_len = dask_df.shape[0].compute()
print(f"Number of rows: {df_len}")

df_dataset = ParquetDataset(dask_df)

In [None]:
output_file = 'full_train_dataset.parquet'
chunk_size = 1000000
chunk_data = []

file_exists = os.path.exists(output_file)

for i, data in tqdm(enumerate(df_dataset), total=df_len):
    chunk_data.append(data)
    
    if (i + 1) % chunk_size == 0:
        df = pd.DataFrame(chunk_data)
        
        if file_exists:
            existing_df = pd.read_parquet(output_file, engine='pyarrow')
            updated_df = pd.concat([existing_df, df], ignore_index=True)
            updated_df.to_parquet(output_file, engine='pyarrow', compression='snappy', index=False)
        else:
            df.to_parquet(output_file, engine='pyarrow', compression='snappy', index=False)
            file_exists = True
        
        file_size = os.path.getsize(output_file) / (1024 * 1024)
        print(f"File size after saving chunk {(i + 1) // chunk_size}: {file_size:.2f} MB")
        
        chunk_data = []

if chunk_data:
    df = pd.DataFrame(chunk_data)
    if file_exists:
        existing_df = pd.read_parquet(output_file, engine='pyarrow')
        updated_df = pd.concat([existing_df, df], ignore_index=True)
        updated_df.to_parquet(output_file, engine='pyarrow', compression='snappy', index=False)
    else:
        df.to_parquet(output_file, engine='pyarrow', compression='snappy', index=False)
    
    file_size = os.path.getsize(output_file) / (1024 * 1024)
    print(f"Final file size: {file_size:.2f} MB")

In [None]:
# df = dd.read_parquet("/kaggle/input/leash-BELKA/train.parquet")
# # df = df.shuffle('binds')
# for i, partition in tqdm(enumerate(df.partitions)):
#     chunk = partition.compute()
#     print(1,chunk.binds.value_counts().to_dict())
#     # break

In [None]:
# dask_df = dd.read_parquet("/kaggle/input/leash-BELKA/train.parquet")
# length = dask_df.shape[0]
# print(f"Number of rows: {length.compute()}")
# dataset = ParquetDataset(dask_df)

In [None]:
# train_dask_df = dd.read_parquet("/kaggle/working/train_split.parquet")
# val_dask_df = dd.read_parquet("/kaggle/working/val_split.parquet")

# train_len = train_dask_df.shape[0].compute()
# val_len = val_dask_df.shape[0].compute()
# print(f"Number of rows train: {train_len}")
# print(f"Number of rows val: {val_len}")

# train_dataset = ParquetDataset(train_dask_df)
# val_dataset = ParquetDataset(val_dask_df)

In [None]:
# !rm -rf train_dataset.parquet

In [None]:
# batch_size = int(1024*100)
# train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=custom_collate_fn, pin_memory=True, pin_memory_device=device)
# val_dataloader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=custom_collate_fn, pin_memory=True, pin_memory_device=device)

# n_train_batchs = int(train_len/batch_size)
# n_val_batchs = int(val_len/batch_size)
# print(n_train_batchs)
# print(n_val_batchs)

# for batch in tqdm(train_dataloader, total=n_train_batchs):
#     pass
#     break

In [None]:
# class BinaryClassifier(nn.Module):
#     def __init__(self, input_dim):
#         super(BinaryClassifier, self).__init__()
#         self.fc = nn.Sequential(
#             nn.Linear(input_dim, 200),
#             nn.ReLU(),
#             nn.Linear(200, 200),
#             nn.ReLU(),
#             nn.Linear(200, 1),
#             # nn.Sigmoid()
#         )

#     def forward(self, x):
#         return self.fc(x)

In [None]:
# def train_model(
#     model, 
#     train_loader, 
#     val_loader, 
#     criterion, 
#     optimizer, 
#     device=device, 
#     num_epochs=10, 
#     accumulation_steps=4, 
#     checkpoint_path="checkpoint.pth"
# ):
#     # Initialize tracking variables
#     best_val_accuracy = 0.0
#     metrics = {"train_loss": [], "train_accuracy": [], "val_loss": [], "val_accuracy": []}
#     start_epoch = 0

#     # Resume training if checkpoint exists
#     if os.path.exists(checkpoint_path):
#         checkpoint = torch.load(checkpoint_path)
#         model.load_state_dict(checkpoint["model_state"])
#         optimizer.load_state_dict(checkpoint["optimizer_state"])
#         best_val_accuracy = checkpoint["best_val_accuracy"]
#         start_epoch = checkpoint["epoch"] + 1
#         print(f"Resuming training from epoch {start_epoch}...")

#     for epoch in trange(start_epoch, num_epochs):
#         # Training Phase
#         model.train()
#         total_loss = 0
#         correct = 0
#         total = 0
#         optimizer.zero_grad()
#         batch_count = 0
        
#         for batch_idx, (_, inputs, labels) in tqdm(enumerate(train_loader), total=n_train_batchs):
#             inputs, labels = inputs.to(device), labels.to(device)
#             with autocast(device):
#                 outputs = model(inputs).squeeze()
#                 loss = criterion(outputs, labels)
#             total_loss += loss.item()
#             batch_count += 1

#             loss.backward()
            
#             if (batch_idx + 1) % accumulation_steps == 0:
#                 optimizer.step()
#                 optimizer.zero_grad()
            
#             predictions = (outputs.sigmoid() > 0.5).float()
#             correct += (predictions == labels).sum().item()
#             total += labels.size(0)
        
#         train_loss = total_loss / batch_count
#         train_accuracy = correct / total

#         # Validation Phase
#         model.eval()
#         val_loss = 0
#         val_correct = 0
#         val_total = 0
#         val_batch_count = 0
        
#         with torch.no_grad():  # Disable gradient computation
#             for _, inputs, labels in tqdm(val_loader, total=n_val_batchs):
#                 inputs, labels = inputs.to(device), labels.to(device)
#                 with autocast(device):
#                     outputs = model(inputs).squeeze()
#                     loss = criterion(outputs, labels)
#                 val_loss += loss.item()
#                 val_batch_count += 1
                
#                 predictions = (outputs.sigmoid() > 0.5).float()
#                 val_correct += (predictions == labels).sum().item()
#                 val_total += labels.size(0)
        
#         val_loss /= val_batch_count
#         val_accuracy = val_correct / val_total

#         # Save metrics
#         metrics["train_loss"].append(train_loss)
#         metrics["train_accuracy"].append(train_accuracy)
#         metrics["val_loss"].append(val_loss)
#         metrics["val_accuracy"].append(val_accuracy)

#         # Check if validation accuracy improves
#         if val_accuracy > best_val_accuracy:
#             best_val_accuracy = val_accuracy
#             torch.save(
#                 {
#                     "model_state": model.state_dict(),
#                     "optimizer_state": optimizer.state_dict(),
#                     "best_val_accuracy": best_val_accuracy,
#                     "epoch": epoch,
#                 },
#                 checkpoint_path,
#             )
#             print(f"New best model saved at epoch {epoch+1}")

#         # Print metrics
#         print(f"Epoch {epoch+1}/{num_epochs}")
#         print(f"  Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
#         print(f"  Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

#     return metrics

In [None]:
# model = BinaryClassifier(97).to(device)

# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs.")
#     model = nn.DataParallel(model)

# # criterion = nn.BCELoss()
# criterion = nn.BCEWithLogitsLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# _, x, y = batch
# x, y = x.to(device), y.to(device)

# for batch in tqdm(train_dataloader, total=n_train_batchs):
#     _, x, y = batch
#     x, y = x.to(device), y.to(device)
#     model(x)

In [None]:
# train_model(
#     model=model, 
#     train_loader=train_dataloader, 
#     val_loader=val_dataloader, 
#     criterion=criterion, 
#     optimizer=optimizer,
#     device=device,
#     num_epochs=10,
#     accumulation_steps=10
# )

In [None]:
# 7244/7381204 [01:53<30:39:04, 66.83it/s]
# 136/230662 [01:04<28:19:49,  2.26it/s]

In [None]:
import pandas as pd
import numpy as np
import os
import dask
from multiprocessing.pool import Pool
# dask.config.set(pool=Pool(20))
import dask.dataframe as dd
from collections import Counter
import re
from tqdm.auto import tqdm
import torch
from torch.utils.data import IterableDataset

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

protein_map = {'BRD4': 1, 'HSA': 2, 'sEH': 3}

# Optimized Functions
def make_vocab(dff, update=None):
    letter_counts = Counter(update) if update else Counter()
    l = dff.drop(columns=['id', 'protein_name', 'binds']).to_numpy().flatten()
    l = np.char.replace(l, r'[\d()\[\]{}]+', '', regex=True)
    letter_counts.update(''.join(l))
    return dict(letter_counts)

def make_counter(l):
    l = re.sub(r'[\d()\[\]{}]+', '', ''.join(l))
    return dict(Counter(l))

def allign_counter_to_vocab(counter, vocab):
    return {key: counter.get(key, 0) for key in vocab.keys()}

def batch_process_partition(dask_df, vocab, protein_map):
    # Apply transformations using Dask's map for parallelism
    dask_df['bb1'] = dask_df['buildingblock1_smiles'].map(
        lambda x: allign_counter_to_vocab(make_counter([x]), vocab),
        meta=(None, 'object')  # Specify metadata for Dask operations
    )
    dask_df['bb2'] = dask_df['buildingblock2_smiles'].map(
        lambda x: allign_counter_to_vocab(make_counter([x]), vocab),
        meta=(None, 'object')
    )
    dask_df['bb3'] = dask_df['buildingblock3_smiles'].map(
        lambda x: allign_counter_to_vocab(make_counter([x]), vocab),
        meta=(None, 'object')
    )
    dask_df['molecule'] = dask_df['molecule_smiles'].map(
        lambda x: allign_counter_to_vocab(make_counter([x]), vocab),
        meta=(None, 'object')
    )
    dask_df['protein'] = dask_df['protein_name'].map(
        protein_map.get, meta=(None, 'int64')  # Map protein names to integers
    )

    # Drop unnecessary columns
    dask_df = dask_df.drop(columns=[
        'buildingblock1_smiles', 'buildingblock2_smiles',
        'buildingblock3_smiles', 'molecule_smiles', 'protein_name'
    ])

    return dask_df


# Optimized Dataset Class
class OptimizedParquetDataset(IterableDataset):
    def __init__(self, dask_df, vocab, protein_map, batch_size=1000):
        self.dask_df = dask_df
        self.partitions = self.dask_df.to_delayed()
        self.vocab = vocab
        self.protein_map = protein_map
        self.batch_size = batch_size

    def __iter__(self):
        for partition in self.partitions:
            yield batch_process_partition(partition, self.vocab, self.protein_map, self.batch_size)

# Efficient File Writing
def write_to_parquet(data, output_file):
    if os.path.exists(output_file):
        existing_df = pd.read_parquet(output_file, engine='pyarrow')
        updated_df = pd.concat([existing_df, data], ignore_index=True)
    else:
        updated_df = data
    updated_df.to_parquet(output_file, engine='pyarrow', compression='snappy', index=False)


In [None]:

# Main Logic
batch_size = 1
output_file = 'full_train_dataset.parquet'

vocab = {'C': 6825082866, '#': 81527490, '@': 511451694, 'H': 456489972, '=': 1406606874, 'O': 2554179786, 'N': 2469595230}
dask_df = dd.read_parquet("/kaggle/input/leash-BELKA/train.parquet")
# dask_df = dask_df.repartition(npartitions=20)

# Calculate total batches for tqdm
total_rows = dask_df.shape[0].compute()
total_batches = (total_rows + batch_size - 1) // batch_size

df_dataset = OptimizedParquetDataset(dask_df, vocab, protein_map, batch_size=batch_size)

for batch in tqdm(df_dataset, total=total_batches, desc="Processing Batches"):
    write_to_parquet(batch, output_file)


In [None]:
# Load Dask DataFrame
dask_df = dd.read_parquet("/kaggle/input/leash-BELKA/train.parquet")

# Process DataFrame using the optimized function
vocab = {'C': 6825082866, '#': 81527490, '@': 511451694, 'H': 456489972, '=': 1406606874, 'O': 2554179786, 'N': 2469595230}
processed_df = batch_process_partition(dask_df, vocab, protein_map)

# Persist changes to a parquet file
output_file = 'processed_train_dataset.parquet'
processed_df.to_parquet(output_file, engine='pyarrow', compression='snappy', write_index=False)

print("Processing complete. Data saved to:", output_file)

////////////////////////////////////////////////////////////////////////////////////

In [1]:
import pandas as pd
import numpy as np
import os
import gc
import dask
from multiprocessing.pool import Pool
import dask.dataframe as dd
from collections import Counter
import re
from tqdm.auto import tqdm, trange

import torch
from torch.utils.data import IterableDataset, DataLoader, WeightedRandomSampler
from torch.amp import autocast
import torch.nn as nn
import torch.optim as optim

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

train_len = 295246830
protein_map = {'BRD4': 1, 'HSA': 2, 'sEH': 3}
vocab = {'C': 6825082866, '#': 81527490, '@': 511451694, 'H': 456489972, '=': 1406606874, 'O': 2554179786, 
         'N': 2469595230, 'c': 12257477022, '-': 438483636, '.': 216945504, 'l': 491088828, 'B': 123330132, 
         'r': 121915914, 'n': 1997759694, 'D': 295246830, 'y': 295246830, 'o': 67918650, 's': 156618468, 
         'S': 90662574, 'F': 492710238, '+': 65206260, 'i': 1414026, '/': 11547096, 'I': 23972994}
datadir = '/home/23m1521/ashish/kaggle/chunks_output'

cpu


In [2]:
def process_df(df):
    processed_data = []
    for _, row in df.iterrows():
        features_list = [
            np.array(row['bb1']).flatten(),
            np.array(row['bb2']).flatten(),
            np.array(row['bb3']).flatten(),
            np.array(row['molecule']).flatten(),
            np.array(row['protein']).flatten()
        ]
        features_array = np.concatenate(features_list).astype(np.float32)
        features = torch.tensor(features_array, dtype=torch.float32)
        data = {
            'id': torch.tensor(row['id'], dtype=torch.long),
            'features': features,
            'y': torch.tensor(row['y'], dtype=torch.float32)
        }
        processed_data.append(data)    
    return processed_data

import torch
import numpy as np
from pyspark.sql import Row

def process_spark_df(spark_df):
    def process_row(row):
        features_list = [
            np.array(row.bb1).flatten(),
            np.array(row.bb2).flatten(),
            np.array(row.bb3).flatten(),
            np.array(row.molecule).flatten(),
            np.array(row.protein).flatten()
        ]
        features_array = np.concatenate(features_list).astype(np.float32)
        features = torch.tensor(features_array, dtype=torch.float32)
        
        return Row(
            id=torch.tensor(row.id, dtype=torch.long),
            features=features,
            y=torch.tensor(row.y, dtype=torch.float32)
        )

    processed_rdd = spark_df.rdd.map(process_row)
    return processed_rdd.collect()


In [3]:
from pyspark.sql import SparkSession
import numpy as np
import torch

# Initialize Spark Session with configurations
spark = SparkSession.builder.appName(
    "IncreaseMemory").config(
        "spark.driver.memory", "16g").config(
            "spark.executor.memory", "16g").config(
                "spark.executor.instances", "4").config(
                    "spark.executor.cores", "4").config(
                        "spark.driver.maxResultSize", "4g").getOrCreate()

# Read Parquet File
df = spark.read.parquet("/home/23m1521/ashish/kaggle/chunks_output")

# Filter Data
filtered_df = df.filter(df['y'] == '1')

# Partition-wise processing
def process_partition(iterator):
    results = []
    for row in iterator:
        features_list = [
            np.array(row.bb1).flatten(),
            np.array(row.bb2).flatten(),
            np.array(row.bb3).flatten(),
            np.array(row.molecule).flatten(),
            np.array(row.protein).flatten()
        ]
        features_array = np.concatenate(features_list).astype(np.float32)
        features = torch.tensor(features_array, dtype=torch.float32)
        results.append({
            'id': torch.tensor(row.id, dtype=torch.long),
            'features': features,
            'y': torch.tensor(row.y, dtype=torch.float32)
        })
    return iter(results)

# Apply processing function to each partition
processed_rdd = filtered_df.rdd.mapPartitions(process_partition)
processed_df = spark.createDataFrame(processed_rdd)

# Collect processed data if necessary
# processed_data = processed_rdd.take(1000)  # Take only a subset for testing
processed_df.write.mode("overwrite").parquet("/home/23m1521/ashish/kaggle/processed_filtered_output")


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/16 01:04:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

AnalysisException: Datasource does not support writing empty or nested empty schemas. Please make sure the data schema has at least one or more column(s).

In [5]:
processed_df.show()

                                                                                

+--------+---+---+
|features| id|  y|
+--------+---+---+
|      {}| {}| {}|
|      {}| {}| {}|
|      {}| {}| {}|
|      {}| {}| {}|
|      {}| {}| {}|
|      {}| {}| {}|
|      {}| {}| {}|
|      {}| {}| {}|
|      {}| {}| {}|
|      {}| {}| {}|
|      {}| {}| {}|
|      {}| {}| {}|
|      {}| {}| {}|
|      {}| {}| {}|
|      {}| {}| {}|
|      {}| {}| {}|
|      {}| {}| {}|
|      {}| {}| {}|
|      {}| {}| {}|
|      {}| {}| {}|
+--------+---+---+
only showing top 20 rows



In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName(
    "IncreaseMemory").config(
        "spark.driver.memory", "16g").config(
            "spark.executor.memory", "16g").config(
                "spark.executor.instances", "4").config(
                    "spark.executor.cores", "4").config(
                        "spark.driver.maxResultSize", "4g").getOrCreate()

spark = SparkSession.builder.appName("LargeParquet").getOrCreate()

# df = spark.read.parquet("/home/23m1521/ashish/kaggle/chunks_output")
df = spark.read.parquet("/home/23m1521/ashish/kaggle/one.parquet")

# filtered_df = df.filter(df['y'] == '1')

# filtered_df.write.parquet("one.parquet")
# filtered_df.show()
# chunk = filtered_df.limit(1000).toPandas()

pdf = process_spark_df(df)
pdf

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/16 01:10:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/16 01:10:14 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

In [1]:
pdf

NameError: name 'pdf' is not defined

In [3]:
import pyarrow.parquet as pq
import pandas as pd
import torch
import numpy as np

def process_df(df):
    processed_data = []
    for _, row in df.iterrows():
        features_list = [
            np.array(row['bb1']).flatten(),
            np.array(row['bb2']).flatten(),
            np.array(row['bb3']).flatten(),
            np.array(row['molecule']).flatten(),
            np.array(row['protein']).flatten()
        ]
        features_array = np.concatenate(features_list).astype(np.float32)
        features = torch.tensor(features_array, dtype=torch.float32)
        data = {
            'id': torch.tensor(row['id'], dtype=torch.long),
            'features': features,
            'y': torch.tensor(row['y'], dtype=torch.float32)
        }
        processed_data.append(data)    
    return processed_data


chunk_paths = [os.path.join(datadir, i) for i in os.listdir(datadir)]

one_list = []
zero_list = []

for chunk_path in tqdm(chunk_paths, total=len(chunk_paths)):
    parquet_file = pq.ParquetFile(chunk_path)
    for batch in tqdm(parquet_file.iter_batches(batch_size=10000), total=100, leave=False):
        df = batch.to_pandas()
        zero_list.append(process_df(df[df['y'] == 0]))
        one_list.append(process_df(df[df['y'] == 1]))

        del df, batch
    del parquet_file

  0%|          | 0/296 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [1]:
1000000/10000

100.0

In [17]:
del df, batch

In [18]:
import gc
gc.collect()

38

In [21]:
from memory_profiler import memory_usage
print(memory_usage())

import psutil
print(psutil.virtual_memory())

[3072.61328125]
svmem(total=134855827456, available=126720245760, percent=6.0, used=7043133440, free=124613799936, active=1592913920, inactive=6840803328, buffers=52248576, cached=3146645504, shared=315392, slab=1397915648)


In [13]:
filtered_df

[{'id': tensor(258999464),
  'features': tensor([ 7.,  0.,  2.,  1.,  2.,  4.,  1., 18.,  1.,  0.,  1.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  5.,  0.,  0.,  0.,
           0.,  0.,  2.,  6.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  1.,  0.,  0.,  0.,  0.,  5.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
           0.,  0., 14.,  0.,  2.,  1.,  1.,  1.,  5., 15.,  0.,  0.,  1.,  0.,
           0.,  3.,  1.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  3.]),
  'y': tensor(1.)},
 {'id': tensor(258999717),
  'features': tensor([ 7.,  0.,  2.,  1.,  2.,  4.,  1., 18.,  1.,  0.,  1.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  5.,  0.,  0.,  0.,
           0.,  0.,  2.,  6.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  1.,  0.,  0.,  0.,  0.,  2.,  0.,  0.,  0.,  0.,  1.,  1.,  6.,
           0.,  0.,  2., 

In [3]:
csv = dd.read_csv('/home/23m1521/ashish/kaggle/train.csv.zip')
csv

Unnamed: 0_level_0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,int64,string,string,string,string,string,int64
,...,...,...,...,...,...,...


In [2]:
!pip install pyspark[sql]

Collecting pyspark[sql]
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.7 (from pyspark[sql])
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840629 sha256=f0f693e3b6e4c189740917f0ac410792d334c0769f26a6eaf1de9f3a5e22d8b6
  Stored in directory: /home/23m1521/.cache/pip/wheels/07/a0/a3/d24c94bf043ab5c7e38c30491199a2a11fef8d2584e6df7fb7
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.7 pyspark-3.5.3


In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName(
    "IncreaseMemory").config(
        "spark.driver.memory", "100g").config(
            "spark.executor.memory", "100g").config(
                "spark.executor.instances", "10").config(
                    "spark.executor.cores", "10").getOrCreate()

spark = SparkSession.builder.appName("LargeParquet").getOrCreate()

df = spark.read.parquet("/home/23m1521/ashish/kaggle/chunks_output")

filtered_df = df.filter(df['y'] == '1')

filtered_df.show()
# chunk = filtered_df.limit(1000).toPandas()

process_spark_df(filtered_df)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/16 00:44:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/16 00:44:06 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

+---------+--------------------+--------------------+--------------------+--------------------+-------+---+
|       id|                 bb1|                 bb2|                 bb3|            molecule|protein|  y|
+---------+--------------------+--------------------+--------------------+--------------------+-------+---+
|107000296|[6, 0, 2, 1, 2, 4...|[1, 0, 0, 0, 0, 1...|[1, 0, 0, 0, 0, 0...|[5, 0, 2, 1, 1, 2...|      2|  1|
|107000410|[6, 0, 2, 1, 2, 4...|[1, 0, 0, 0, 0, 1...|[0, 0, 0, 1, 1, 1...|[4, 0, 2, 2, 2, 3...|      2|  1|
|107001558|[6, 0, 2, 1, 2, 4...|[2, 1, 0, 0, 0, 1...|[2, 0, 0, 0, 1, 1...|[6, 1, 2, 1, 2, 3...|      1|  1|
|107001581|[6, 0, 2, 1, 2, 4...|[2, 1, 0, 0, 0, 1...|[13, 0, 0, 0, 0, ...|[17, 1, 2, 1, 1, ...|      3|  1|
|107002105|[6, 0, 2, 1, 2, 4...|[2, 1, 0, 0, 0, 1...|[2, 0, 0, 0, 0, 1...|[7, 1, 2, 1, 1, 3...|      2|  1|
|107002360|[6, 0, 2, 1, 2, 4...|[2, 1, 0, 0, 0, 1...|[1, 0, 0, 1, 0, 0...|[6, 1, 2, 2, 1, 2...|      2|  1|
|107002480|[6, 0, 2, 1, 2, 4

In [4]:
process_spark_df(filtered_df)

24/12/16 00:45:30 ERROR TaskSetManager: Total size of serialized results of 28 tasks (1028.3 MiB) is bigger than spark.driver.maxResultSize (1024.0 MiB)
24/12/16 00:45:30 WARN TaskSetManager: Lost task 27.0 in stage 2.0 (TID 29) (tirandaz.ieor.iitb.ac.in executor driver): TaskKilled (Tasks result size has exceeded maxResultSize)
24/12/16 00:45:30 ERROR TaskSetManager: Total size of serialized results of 29 tasks (1067.2 MiB) is bigger than spark.driver.maxResultSize (1024.0 MiB)
24/12/16 00:45:30 WARN TaskSetManager: Lost task 1.0 in stage 2.0 (TID 3) (tirandaz.ieor.iitb.ac.in executor driver): TaskKilled (Tasks result size has exceeded maxResultSize)
24/12/16 00:45:30 WARN TaskSetManager: Lost task 35.0 in stage 2.0 (TID 37) (tirandaz.ieor.iitb.ac.in executor driver): TaskKilled (Stage cancelled: Job aborted due to stage failure: Total size of serialized results of 28 tasks (1028.3 MiB) is bigger than spark.driver.maxResultSize (1024.0 MiB))
24/12/16 00:45:30 WARN TaskSetManager: Lost

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Total size of serialized results of 28 tasks (1028.3 MiB) is bigger than spark.driver.maxResultSize (1024.0 MiB)
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2458)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1049)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1048)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:195)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


24/12/16 00:45:33 WARN PythonRunner: Detected deadlock while completing task 31.0 in stage 2 (TID 33): Attempting to kill Python Worker
24/12/16 00:45:34 WARN TaskSetManager: Lost task 31.0 in stage 2.0 (TID 33) (tirandaz.ieor.iitb.ac.in executor driver): TaskKilled (Stage cancelled: Job aborted due to stage failure: Total size of serialized results of 28 tasks (1028.3 MiB) is bigger than spark.driver.maxResultSize (1024.0 MiB))


In [13]:
process_df(chunk)

[{'id': tensor(107000296),
  'features': tensor([ 6.,  0.,  2.,  1.,  2.,  4.,  1., 16.,  1.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
           0.,  1.,  1.,  6.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  7.,
           0.,  0.,  0.,  0.,  0.,  2.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
           0.,  0.,  5.,  0.,  2.,  1.,  1.,  2.,  4., 20.,  0.,  0.,  0.,  1.,
           1.,  5.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  2.]),
  'y': tensor(1.)},
 {'id': tensor(107000410),
  'features': tensor([ 6.,  0.,  2.,  1.,  2.,  4.,  1., 16.,  1.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
           0.,  1.,  1.,  6.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  4.,
           0.,  0.,  0., 

In [2]:
def read_dask_parquet(path):
    dask_df_processed = dd.read_parquet(path)
    dask_df_processed_len = dask_df_processed.shape[0].compute()
    print(f"Number of rows train: {dask_df_processed_len}")
    return dask_df_processed

In [3]:
class ProcessedDataset(IterableDataset):
    def __init__(self, datadir, vocab, protein_map):
        self.chunks_path = [os.path.join(datadir, i) for i in os.listdir(datadir)]
        self.vocab = vocab
        self.protein_map = protein_map

    def __iter__(self):
        for chunk_path in self.chunks_path:
            chunk = dd.read_parquet(path=chunk_path, engine='pyarrow').compute()
            for _, row in chunk.iterrows():
                yield self.process_row(row)
            del chunk
            gc.collect()

    def process_row(self, row):
        features_list = [
            np.array(row['bb1']).flatten(),
            np.array(row['bb2']).flatten(),
            np.array(row['bb3']).flatten(),
            np.array(row['molecule']).flatten(),
            np.array(row['protein']).flatten()
        ]
        
        features_array = np.concatenate(features_list).astype(np.float32)
        features = torch.tensor(features_array, dtype=torch.float32)
    
        data = {
            'id': torch.tensor(row['id'], dtype=torch.long),
            'features': features,
            'y': torch.tensor(row['y'], dtype=torch.float32)
        }
        return data

def collate_fn(batch):
    ids = torch.stack([item['id'] for item in batch])
    features = torch.stack([item['features'] for item in batch])
    y = torch.stack([item['y'] for item in batch])
    return {'id': ids, 'features': features, 'y': y}

In [4]:
dataset = ProcessedDataset(datadir, vocab, protein_map)

In [1]:
# for i, row in tqdm(enumerate(dataset), total=train_len):
#     # if i == 1000:
#     #     break
#     pass

In [3]:
df = read_dask_parquet(datadir)

Number of rows train: 295246830


In [5]:
value_counts = df.y.value_counts().compute()

In [7]:
value_counts.to_numpy()

array([293656924,   1589906])

In [10]:
1589906/(293656924+1589906), 293656924/(293656924+1589906)

(0.005385006165857902, 0.9946149938341421)

In [4]:
df1 = df.loc[df['y'] == 1].compute()

: 

In [9]:
bb1 = df.bb1

In [4]:
bb1 = df.bb1.compute()

In [7]:
bb1.values.nbytes

2361974640

In [None]:
def process_row(row):
    features_list = [
        np.array(row['bb1']).flatten(),
        np.array(row['bb2']).flatten(),
        np.array(row['bb3']).flatten(),
        np.array(row['molecule']).flatten(),
        np.array(row['protein']).flatten()
    ]
    
    features_array = np.concatenate(features_list).astype(np.float32)
    features = torch.tensor(features_array, dtype=torch.float32)

    data = {
        'id': torch.tensor(row['id'], dtype=torch.long),
        'features': features,
        'y': torch.tensor(row['y'], dtype=torch.float32)
    }
    return data

chunk_paths = [os.path.join(datadir, i) for i in os.listdir(datadir)]
for chunk_path in tqdm(chunk_paths, total=len(chunk_paths)):
    dask_df = dd.read_parquet(path=chunk_path, engine='pyarrow')
    for batch in dask_df.partitions:
        chunk = batch.compute()
        for _, row in tqdm(chunk.iterrows()):
            process_row(row)
        del chunk
        gc.collect()

  0%|          | 0/296 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [None]:
import os
import psutil
import numpy as np
import dask.dataframe as dd
from tqdm.auto import tqdm

# Define function to check RAM usage
def is_ram_usage_high(threshold=80):
    return psutil.virtual_memory().percent >= threshold

chunk_paths = [os.path.join(datadir, i) for i in os.listdir(datadir)]
os.makedirs('/home/23m1521/ashish/kaggle/arrays_chunks/bb1', exist_ok=True)

bb1_all = []

for i, chunk_path in enumerate(tqdm(chunk_paths)):
    bb1_chunk = dd.read_parquet(path=chunk_path, engine='pyarrow').bb1.compute().values
    bb1_all.append(bb1_chunk)
    
    if is_ram_usage_high(threshold=80):
        concatenated_array = np.concatenate(bb1_all)
        np.savez_compressed(f"/home/23m1521/ashish/kaggle/arrays_chunks/bb1/bb1-part-{i+1}.npz", array1=concatenated_array)
        bb1_all = []
        print(f"Saved concatenated chunk at {i+1}, RAM usage: {psutil.virtual_memory().percent}%")

if bb1_all:
    concatenated_array = np.concatenate(bb1_all)
    np.savez_compressed(f"/home/23m1521/ashish/kaggle/arrays_chunks/bb1/bb1-final.npz", array1=concatenated_array)
    print(f"Saved final chunk, RAM usage: {psutil.virtual_memory().percent}%")

  0%|          | 0/296 [00:00<?, ?it/s]

In [None]:
chunk_paths = [os.path.join(datadir, i) for i in os.listdir(datadir)]
os.makedirs('/home/23m1521/ashish/kaggle/arrays_chunks/bb1', exist_ok=True)
for i, chunk_path in enumerate(tqdm(chunk_paths)):
    bb1_chunk = dd.read_parquet(path=chunk_path, engine='pyarrow').bb1.compute().values
    np.savez_compressed(f"/home/23m1521/ashish/kaggle/arrays_chunks/bb1/bb1-{i+1}.npz", array1=bb1_chunk)

  0%|          | 0/296 [00:00<?, ?it/s]

In [None]:
np.savez("bb1.npz", array1=bb1.values)

In [None]:
df.y.value_counts().compute()

In [5]:
from tqdm.auto import tqdm, trange
epochs = 10
n_train_batchs = 1000

with tqdm(total=n_train_batchs, desc='Training') as pbar1:
    for batch in range(n_train_batchs):
        pbar1.set_description(f"Training Batch: {batch}")
        pbar1.update(1)

Training:   0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
from tqdm.auto import tqdm, trange
epochs = 10
n_train_batchs = 1000

with tqdm(total=epochs) as pbar1:
    with tqdm(total=n_train_batchs) as pbar2:
        for epoch in range(epochs):
            for batch in range(n_train_batchs):
    
                pbar2.update(1)
                pbar1.set_description(f"Epoch: {epoch} | Batch: {batch}")
                pbar1.update(1)

In [None]:
from tqdm.auto import tqdm

data_dis_counts = {'0': 0, '1': 0}

with tqdm(total=n_train_batchs, desc="Processing") as pbar:
    for batch in train_dataloader:
        counts = batch['y'].unique(return_counts=True)[1]
        data_dis_counts['0'] += counts[0]
        data_dis_counts['1'] += counts[1] if len(counts) > 1 else 0
        
        # Calculate percentages
        total_counts = data_dis_counts['0'] + data_dis_counts['1']
        percent_0 = (data_dis_counts['0'] / total_counts) * 100 if total_counts > 0 else 0
        percent_1 = (data_dis_counts['1'] / total_counts) * 100 if total_counts > 0 else 0
        
        # Update progress bar
        pbar.set_description(f"0: {data_dis_counts['0']} ({percent_0:.2f}%) | 1: {data_dis_counts['1']} ({percent_1:.2f}%)")
        pbar.update(1)
        
        # Free memory
        del batch, counts
        gc.collect()


In [None]:
df = read_dask_parquet(datadir)

class_0_indices = df.index[df['y'] == 0].compute().values
class_1_indices = df.index[df['y'] == 1].compute().values

Number of rows train: 295246830


In [7]:
def balanced_batch_sampler(batch_size):
    num_class_1 = batch_size // 2
    num_class_0 = batch_size - num_class_1
    while True:
        yield (random.sample(class_0_indices, num_class_0) + 
               random.sample(class_1_indices, num_class_1))

In [None]:
dataset = ProcessedDataset(datadir, vocab, protein_map)
batch_size = int(1024)
if device == 'cuda':
    train_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn, pin_memory=False, num_workers=10)
else:
    train_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn, pin_memory=False, num_workers=10)

n_train_batchs = int(train_len/batch_size)
print(n_train_batchs)

c = 0
with tqdm(total=n_train_batchs, desc="Processing") as pbar:
    for batch in train_dataloader:
        c += 1
        
        pbar.set_description(f"Rows: {c*batch_size}")
        pbar.update(1)
        
        if c % 1000 == 0:
            del batch
            gc.collect()
    

# for batch in tqdm(train_dataloader, total=n_train_batchs):
    # pass
    # break
    # print(batch['y'])

288326


Processing:   0%|          | 0/288326 [00:00<?, ?it/s]

In [16]:
class BinaryClassifier(nn.Module):
    def __init__(self, input_dim):
        super(BinaryClassifier, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 200),
            nn.ReLU(),
            nn.Linear(200, 200),
            nn.ReLU(),
            nn.Linear(200, 1),
            # nn.Sigmoid()
        )

    def forward(self, x):
        return self.fc(x)

In [17]:
def train_model(
    model, 
    train_loader, 
    val_loader, 
    criterion, 
    optimizer, 
    device=device, 
    num_epochs=10, 
    accumulation_steps=4, 
    checkpoint_path="checkpoint.pth"
):
    # Initialize tracking variables
    best_val_accuracy = 0.0
    metrics = {"train_loss": [], "train_accuracy": [], "val_loss": [], "val_accuracy": []}
    start_epoch = 0

    # Resume training if checkpoint exists
    if os.path.exists(checkpoint_path):
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint["model_state"])
        optimizer.load_state_dict(checkpoint["optimizer_state"])
        best_val_accuracy = checkpoint["best_val_accuracy"]
        start_epoch = checkpoint["epoch"] + 1
        print(f"Resuming training from epoch {start_epoch}...")

    for epoch in trange(start_epoch, num_epochs):
        # Training Phase
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        optimizer.zero_grad()
        batch_count = 0
        
        for batch_idx, (batch) in tqdm(enumerate(train_loader), total=n_train_batchs):
            inputs = batch['features'].to(device)
            labels = batch['y'].to(device)
            with autocast(device):
                outputs = model(inputs).squeeze()
                loss = criterion(outputs, labels)
            total_loss += loss.item()
            batch_count += 1

            loss.backward()
            
            if (batch_idx + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
            
            predictions = (outputs.sigmoid() > 0.5).float()
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
        
        train_loss = total_loss / batch_count
        train_accuracy = correct / total

        # Validation Phase
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0
        val_batch_count = 0
        
        with torch.no_grad():
            for batch in tqdm(val_loader, total=n_val_batchs):
                inputs = batch['features'].to(device)
                labels = batch['y'].to(device)
                with autocast(device):
                    outputs = model(inputs).squeeze()
                    loss = criterion(outputs, labels)
                val_loss += loss.item()
                val_batch_count += 1
                
                predictions = (outputs.sigmoid() > 0.5).float()
                val_correct += (predictions == labels).sum().item()
                val_total += labels.size(0)
        
        val_loss /= val_batch_count
        val_accuracy = val_correct / val_total

        # Save metrics
        metrics["train_loss"].append(train_loss)
        metrics["train_accuracy"].append(train_accuracy)
        metrics["val_loss"].append(val_loss)
        metrics["val_accuracy"].append(val_accuracy)

        # Check if validation accuracy improves
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(
                {
                    "model_state": model.state_dict(),
                    "optimizer_state": optimizer.state_dict(),
                    "best_val_accuracy": best_val_accuracy,
                    "epoch": epoch,
                },
                checkpoint_path,
            )
            print(f"New best model saved at epoch {epoch+1}")

        # Print metrics
        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"  Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
        print(f"  Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

    return metrics

In [18]:
model = BinaryClassifier(97).to(device)

# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs.")
#     model = nn.DataParallel(model)

# # criterion = nn.BCELoss()
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001)

In [19]:
# x = batch['features'].to(device)
# y = batch['y'].to(device)
# op = model(x).squeeze(1)
# print(op)
# print(y)
# loss = criterion(op, y)
# print(loss)

In [None]:
train_model(
    model=model, 
    train_loader=train_dataloader, 
    val_loader=train_dataloader, 
    criterion=criterion, 
    optimizer=optimizer,
    device=device,
    num_epochs=10,
    accumulation_steps=10
)

In [None]:
# 7244/7381204 [01:53<30:39:04, 66.83it/s]
# 136/230662 [01:04<28:19:49,  2.26it/s]
# 968/9226463 [00:19<36:34:45, 70.06it/s]

Test data

In [None]:
import pandas as pd
import numpy as np
import os
import dask
from multiprocessing.pool import Pool
# dask.config.set(pool=Pool(20))
import dask.dataframe as dd
from collections import Counter
import re
from tqdm.auto import tqdm, trange

import torch
from torch.utils.data import IterableDataset, DataLoader, Dataset
from torch.amp import autocast
import torch.nn as nn
import torch.optim as optim

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

train_len = 295246830
protein_map = {'BRD4': 1, 'HSA': 2, 'sEH': 3}
vocab = {'C': 6825082866, '#': 81527490, '@': 511451694, 'H': 456489972, '=': 1406606874, 'O': 2554179786, 'N': 2469595230, 'c': 12257477022, '-': 438483636, '.': 216945504, 'l': 491088828, 'B': 123330132, 'r': 121915914, 'n': 1997759694, 'D': 295246830, 'y': 295246830, 'o': 67918650, 's': 156618468, 'S': 90662574, 'F': 492710238, '+': 65206260, 'i': 1414026, '/': 11547096, 'I': 23972994}
# datadir = '/kaggle/input/leash-belka2-process-data-chunks'

In [16]:
test_df = pd.read_parquet('./test.parquet')

In [None]:
test_df

In [18]:
def make_counter(l):
    l = re.sub(r'[\d()\[\]{}]+', '', ''.join(l))
    return dict(Counter(l))

def allign_counter_to_vocab(counter, vocab):
    return {key: counter.get(key, 0) for key in vocab.keys()}

def make_features(df, vocab):
    id = df['id'].to_numpy()
    smiles = df.drop(columns=['id', 'protein_name', 'binds']).to_numpy()
    protein = df['protein_name'].to_numpy()
    y = df['binds'].to_numpy()

    df_features = {'id':[], 'bb1':[], 'bb2':[], 'bb3':[], 'molecule':[], 'protein':[], 'y':[]}
    for i in trange(len(id), desc='making features'):
        df_features['id'].append(id[i])

        counter = make_counter(smiles[i][0])
        df_features['bb1'].append(allign_counter_to_vocab(counter, vocab))

        counter = make_counter(smiles[i][1])
        df_features['bb2'].append(allign_counter_to_vocab(counter, vocab))

        counter = make_counter(smiles[i][2])
        df_features['bb3'].append(allign_counter_to_vocab(counter, vocab))

        counter = make_counter(smiles[i][3])
        df_features['molecule'].append(allign_counter_to_vocab(counter, vocab))

        df_features['protein'].append(protein[i])
        df_features['y'].append(y[i])

    return df_features

def check_df_allignment(dff_features, vocab):
    flag = True
    for i in trange(len(dff_features['bb1'])):
        if dff_features['bb1'][i].keys() != vocab.keys():
            print(dff_features['bb1'][i].keys())
            print(vocab.keys())
            flag = False
            break
    return flag


def df_vectors(dff_features, vocab, protein_map):
    op = np.empty((100,7))
    for i in trange(0,len(dff_features['id']),100, desc='Making vector df'):
        df = pd.DataFrame({
            'id': dff_features['id'][i:i+100],
            'bb1': dff_features['bb1'][i:i+100],
            'bb2': dff_features['bb2'][i:i+100],
            'bb3': dff_features['bb3'][i:i+100],
            'molecule': dff_features['molecule'][i:i+100],
            'protein': dff_features['protein'][i:i+100],
            'y': dff_features['y'][i:i+100]
        })

        df.bb1 = df.bb1.apply(lambda x: list(x.values()))
        df.bb2 = df.bb2.apply(lambda x: list(x.values()))
        df.bb3 = df.bb3.apply(lambda x: list(x.values()))
        df.molecule = df.molecule.apply(lambda x: list(x.values()))
        df.protein = df.protein.map(protein_map)

        op = np.concatenate((op, df.to_numpy()))

    return op[100:]


def process_row(row, protein_map=protein_map):
    return {
             'id': row['id'],
             'bb1': list(allign_counter_to_vocab(make_counter(row['buildingblock1_smiles']), vocab).values()),
             'bb2': list(allign_counter_to_vocab(make_counter(row['buildingblock2_smiles']), vocab).values()),
             'bb3': list(allign_counter_to_vocab(make_counter(row['buildingblock3_smiles']), vocab).values()),
             'molecule': list(allign_counter_to_vocab(make_counter(row['molecule_smiles']), vocab).values()),
             'protein': protein_map[row['protein_name']],
            #  'y': row['binds']
        }


class ParquetDataset(IterableDataset):
    def __init__(self, dask_df, vocab=vocab, protein_map=protein_map, transform=None):
        self.dask_df = dask_df
        self.vocab = vocab
        self.protein_map = protein_map
        self.transform = transform
        

    def __iter__(self):
        for _, row in self.dask_df.iterrows():
            yield self.process_row(row)

    def process_row(self, row):
        data = {
            'id': row['id'],
            'bb1': list(allign_counter_to_vocab(make_counter(row['buildingblock1_smiles']), self.vocab).values()),
            'bb2': list(allign_counter_to_vocab(make_counter(row['buildingblock2_smiles']), self.vocab).values()),
            'bb3': list(allign_counter_to_vocab(make_counter(row['buildingblock3_smiles']), self.vocab).values()),
            'molecule': list(allign_counter_to_vocab(make_counter(row['molecule_smiles']), self.vocab).values()),
            'protein': self.protein_map[row['protein_name']],
            # 'y': row['binds']
        }
        if self.transform:
            data = self.transform(data)
        return data

In [19]:
test_dataset = ParquetDataset(test_df)

In [None]:
for i, data in tqdm(enumerate(test_dataset), total=1674896):
    print(data)
    break

In [None]:
data