<a href="https://www.kaggle.com/code/anirudhg15/solving-a-finished-competition-seti-et-w-pytorch?scriptVersionId=135565958" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# Add the local pytorch-image-models checkout to the module search path
# (presumably so that the later `import timm` resolves to it - confirm).
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

In [None]:
import warnings
import sklearn.exceptions
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

from tqdm import tqdm
import pandas as pd
import numpy as np
import os, random, glob
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns

import albumentations
from albumentations.pytorch.transforms import ToTensorV2

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import torch
import torchvision
import timm
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Central collection of the settings read throughout the notebook.
params = dict(
    seed=42,
    model='swin_small_patch4_window7_224',
    size=224,
    inp_channels=1,
    device=device,
    lr=2e-5,
    batch_size=64,
    num_workers=4,
    epochs=10,
    out_features=1,
    num_tta=10,
)

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED``, and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Only affects interpreters started after this point (e.g. subprocesses).
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; safe without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes convolution algorithms and may choose a
    # different one from run to run, which undermines reproducibility.
    torch.backends.cudnn.benchmark = False
    
# Seed all generators with the configured seed before any data is split or
# any model weights are created.
seed_everything(params['seed'])

In [None]:
# Root directories that hold the .npy sample files of each split.
train_dir = '../input/seti-breakthrough-listen/train'
test_dir = '../input/seti-breakthrough-listen/test'

# Training labels, and the sample submission that serves as the test index.
train_df = pd.read_csv('../input/seti-breakthrough-listen/train_labels.csv')
test_df = pd.read_csv('../input/seti-breakthrough-listen/sample_submission.csv')

In [None]:
def return_filepath(name, folder=train_dir):
    """Build the path of the ``.npy`` file that belongs to sample *name*.

    The file is located at ``<folder>/<first character of name>/<name>.npy``.

    Args:
        name: Sample identifier; its first character selects the
            sub-directory.
        folder: Root directory to search in (defaults to ``train_dir``).

    Returns:
        The joined path as a string.
    """
    subdirectory = name[0]
    filename = f'{name}.npy'
    return os.path.join(folder, subdirectory, filename)

In [None]:
# Resolve every sample id to its file on disk; test ids live under test_dir.
train_df['image_path'] = train_df['id'].apply(return_filepath)
test_df['image_path'] = test_df['id'].apply(return_filepath, folder=test_dir)
train_df.head()

In [None]:
# Bar chart showing how many training rows carry each target value.
ax = sns.countplot(x='target', data=train_df)
ax.set_ylabel("Count")
ax.set_xlabel("Target")
ax.set_title('Class distribution')

In [None]:
def get_train_transforms():
    """Return the augmentation pipeline used for training samples.

    The pipeline resizes to a ``params['size']`` square, applies a
    horizontal and a vertical flip (each with probability 0.5), and ends
    with ``ToTensorV2``.
    """
    side = params['size']
    steps = [
        albumentations.Resize(side, side),
        albumentations.HorizontalFlip(p=0.5),
        albumentations.VerticalFlip(p=0.5),
        ToTensorV2(p=1.0),
    ]
    return albumentations.Compose(steps)

def get_valid_transforms():
    """Return the deterministic pipeline used for validation samples.

    Only a resize to a ``params['size']`` square followed by ``ToTensorV2``
    is applied; no random augmentation takes place.
    """
    side = params['size']
    steps = [
        albumentations.Resize(side, side),
        ToTensorV2(p=1.0),
    ]
    return albumentations.Compose(steps)

In [None]:
class SETIDataset(Dataset):
    """Dataset that loads one ``.npy`` sample from disk per item.

    Args:
        images_filepaths: Indexable collection of paths to ``.npy`` files.
        targets: Indexable collection of labels aligned with the paths.
        transform: Optional callable invoked as ``transform(image=...)``
            that returns a mapping with an ``"image"`` entry.
    """

    def __init__(self, images_filepaths, targets, transform=None):
        self.images_filepaths = images_filepaths
        self.targets = targets
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.images_filepaths)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair stored at position *idx*."""
        raw = np.load(self.images_filepaths[idx])
        # Stack the sub-arrays along the first axis into a single 2-D array,
        # then swap its two axes.
        stacked = np.vstack(raw.astype(np.float32))
        image = stacked.transpose((1, 0))

        if self.transform is not None:
            image = self.transform(image=image)["image"]

        return image, torch.tensor(self.targets[idx]).float()

In [None]:
# Stratified hold-out split (20% validation) of file paths and labels,
# shuffled with the shared seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df['image_path'],
    train_df['target'],
    test_size=0.2,
    stratify=train_df['target'],
    shuffle=True,
    random_state=params['seed'],
)

In [None]:
# Training samples go through the augmenting pipeline.
train_dataset = SETIDataset(
    X_train.values,
    y_train.values,
    transform=get_train_transforms(),
)

# Validation samples are only resized and converted.
valid_dataset = SETIDataset(
    X_valid.values,
    y_valid.values,
    transform=get_valid_transforms(),
)

In [None]:
# Build a sampler that draws each class with roughly equal probability.
#
# value_counts() orders its result by frequency, not by label, so the weights
# are keyed by the label itself instead of by position. Positional indexing
# would silently swap the weights whenever class 0 is not the majority class.
label_counts = y_train.value_counts()                 # Series: label -> count
class_counts = label_counts.sort_index().to_list()    # counts ordered by label
num_samples = int(label_counts.sum())
labels = y_train.to_list()

# Inverse-frequency weight per label.
class_weights = {label: num_samples / count
                 for label, count in label_counts.items()}
# One weight per training sample, in dataset order.
weights = [class_weights[label] for label in labels]
sampler = WeightedRandomSampler(torch.DoubleTensor(weights), num_samples)

In [None]:
# Training batches are drawn through the weighted sampler, so `shuffle` must
# not be set here. The worker count comes from the shared params dict instead
# of a duplicated literal.
train_loader = DataLoader(
    train_dataset,
    batch_size=params['batch_size'],
    sampler=sampler,
    num_workers=params['num_workers'],
    pin_memory=True,
)

# Validation keeps the dataset order.
val_loader = DataLoader(
    valid_dataset,
    batch_size=params['batch_size'],
    shuffle=False,
    num_workers=params['num_workers'],
    pin_memory=True,
)

In [None]:
class SETINet(nn.Module):
    """timm backbone whose ``head`` is replaced by a new linear layer.

    Args:
        model_name: Architecture name passed to ``timm.create_model``.
        out_features: Output size of the replacement head.
        inp_channels: Number of input channels requested from timm.
        pretrained: Whether timm should load pretrained weights.
    """

    def __init__(self,
                 model_name=params['model'],
                 out_features=params['out_features'],
                 inp_channels=params['inp_channels'],
                 pretrained=True):
        super().__init__()

        backbone = timm.create_model(
            model_name,
            pretrained=pretrained,
            in_chans=inp_channels,
        )
        # Swap the original head for one with the requested output size,
        # keeping the head's input width.
        head_inputs = backbone.head.in_features
        backbone.head = nn.Linear(head_inputs, out_features, bias=True)
        self.model = backbone

    def forward(self, x):
        """Return the backbone's output for batch *x*."""
        return self.model(x)

In [None]:
def get_roc_score(output, target):
    """Compute the ROC AUC of raw model outputs against the targets.

    Args:
        output: Tensor of logits; a sigmoid is applied before scoring.
        target: Tensor of ground-truth labels.

    Returns:
        The value returned by ``sklearn.metrics.roc_auc_score``.
    """
    probabilities = torch.sigmoid(output).cpu().detach().numpy()
    return roc_auc_score(target.cpu(), probabilities)

In [None]:
# Network, loss and optimiser all live on the configured device.
model = SETINet()
model = model.to(params['device'])

# Binary cross-entropy on raw logits (the sigmoid is folded into the loss).
criterion = nn.BCEWithLogitsLoss().to(params['device'])
optimizer = torch.optim.AdamW(model.parameters(), lr=params['lr'])

In [None]:
def mixup_data(x, y, alpha=1.0, use_cuda=True):
    """Blend each sample of a batch with another sample of the same batch.

    Args:
        x: Batch of inputs; the first dimension indexes samples.
        y: Batch of targets aligned with ``x``.
        alpha: Parameter of the ``Beta(alpha, alpha)`` distribution the mixing
            coefficient is drawn from. With ``alpha <= 0`` the coefficient is
            fixed to 1, i.e. the inputs are returned unmixed.
        use_cuda: Kept for backward compatibility and ignored; the
            permutation is placed on ``x``'s device.

    Returns:
        A tuple ``(mixed_x, y_a, y_b, lam)``. ``mixed_x`` is
        ``lam * x + (1 - lam) * x[perm]``, ``y_a`` is ``y``, ``y_b`` is
        ``y[perm]`` and ``lam`` is the mixing coefficient.
    """
    if alpha > 0:
        # Plain Python float so the products below stay in x's dtype.
        lam = float(np.random.beta(alpha, alpha))
    else:
        lam = 1

    batch_size = x.size()[0]
    # Draw the permutation on the CPU (same random stream as before), then
    # move it to wherever the batch lives. A hard-coded .cuda() would crash
    # on machines without a GPU.
    idx = torch.randperm(batch_size).to(x.device)

    mixed_x = lam * x + (1 - lam) * x[idx, :]
    y_a, y_b = y, y[idx]

    return mixed_x, y_a, y_b, lam


def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Combine the losses against both mixup targets.

    Args:
        criterion: Loss callable taking ``(pred, target)``.
        pred: Model prediction for the mixed batch.
        y_a: Targets of the original samples.
        y_b: Targets of the samples that were mixed in.
        lam: Mixing coefficient that weights the ``y_a`` loss; the ``y_b``
            loss is weighted by ``1 - lam``.

    Returns:
        ``lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)``.
    """
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b

In [None]:
def train(dataloader, model, criterion, optimizer, epoch, params):
    """Run one training epoch with mixup and mixed precision.

    Args:
        dataloader: Iterable yielding ``(images, target)`` batches.
        model: Network to optimise; it is switched to training mode.
        criterion: Loss callable taking ``(prediction, target)``.
        optimizer: Optimiser that updates the model's parameters.
        epoch: Current epoch number (not used inside the function).
        params: Settings mapping; only ``params['device']`` is read.

    Returns:
        The mean of the per-batch losses as a detached tensor.
    """
    model.train()

    # Mixed precision is a CUDA feature here; switch it off on other devices
    # so that autocast and the scaler become no-ops.
    use_amp = torch.device(params['device']).type == 'cuda'
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    train_loss = 0

    for images, target in tqdm(dataloader):
        images = images.to(params['device'], non_blocking=True)
        target = target.to(params['device'], non_blocking=True).float().view(-1, 1)
        images, targets_a, targets_b, lam = mixup_data(images, target)

        with torch.cuda.amp.autocast(enabled=use_amp):
            output = model(images)
            loss = mixup_criterion(criterion, output, targets_a, targets_b, lam)

        # Accumulate a detached copy: adding the live loss would chain every
        # batch's autograd history onto the running total.
        train_loss += loss.detach()

        optimizer.zero_grad(set_to_none=True)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    train_loss /= len(dataloader)

    return train_loss
        

In [None]:
def validate(dataloader, model, criterion, epoch, params):
    """Evaluate the model on a validation loader.

    Args:
        dataloader: Iterable yielding ``(images, target)`` batches.
        model: Network to evaluate; it is switched to eval mode.
        criterion: Loss callable taking ``(prediction, target)``.
        epoch: Current epoch number (not used inside the function).
        params: Settings mapping; only ``params['device']`` is read.

    Returns:
        A tuple ``(eval_loss, eval_rocauc)``: the mean of the per-batch losses
        and the ROC AUC computed once over all predictions of the loader.
    """
    model.eval()

    eval_loss = 0
    collected_outputs = []
    collected_targets = []

    with torch.no_grad():
        for images, target in tqdm(dataloader):
            images = images.to(params['device'], non_blocking=True)
            target = target.to(params['device'], non_blocking=True).float().view(-1, 1)

            output = model(images)
            eval_loss += criterion(output, target)

            collected_outputs.append(output.cpu())
            collected_targets.append(target.cpu())

    eval_loss /= len(dataloader)

    # ROC AUC is a ranking metric: averaging per-batch values does not give
    # the dataset AUC, and a batch containing a single class makes
    # roc_auc_score raise. Score all predictions together instead.
    eval_rocauc = get_roc_score(torch.cat(collected_outputs),
                                torch.cat(collected_targets))

    return eval_loss, eval_rocauc

In [None]:
# Hand cached, currently unused GPU memory back before training starts.
torch.cuda.empty_cache()

In [None]:
import gc

# Collect unreachable Python objects before the training run begins.
gc.collect()

# Per-epoch metric history, stored as plain Python floats.
results = {'train_loss': [],
           'eval_loss': [],
           'eval_rocauc': []}

for epoch in range(1, params['epochs'] + 1):

    train_loss = train(train_loader, model, criterion, optimizer, epoch, params)
    eval_loss, eval_rocauc = validate(val_loader, model, criterion, epoch, params)

    # Convert to floats so the history holds no device tensors (or autograd
    # graph references) and prints and plots cleanly.
    train_loss = float(train_loss)
    eval_loss = float(eval_loss)
    eval_rocauc = float(eval_rocauc)

    print(f'''Epoch:{epoch} | Train - Loss:{train_loss}
| Eval - Loss:{eval_loss}, AUROC:{eval_rocauc}''')

    results['train_loss'].append(train_loss)
    results['eval_loss'].append(eval_loss)
    results['eval_rocauc'].append(eval_rocauc)

In [None]:
# Show the per-epoch metric history gathered by the training loop.
results

In [None]:
# Test-time augmentation: predict the test set `num_tta` times through the
# randomly flipping training pipeline and average the probabilities.
model.eval()

# The dataset and loader do not change between passes, so build them once.
# The random flips are re-drawn each time a sample is loaded.
test_dataset = SETIDataset(
    images_filepaths=test_df['image_path'].values,
    targets=test_df['target'].values,
    transform=get_train_transforms(),
)

test_loader = DataLoader(
    test_dataset,
    batch_size=params['batch_size'],
    shuffle=False,  # keep predictions aligned with test_df's row order
    num_workers=2,
    pin_memory=True,
)

predicted_labels = None

for _tta_pass in range(params['num_tta']):

    batch_predictions = []

    with torch.no_grad():
        for images, _target in tqdm(test_loader):
            images = images.to(params['device'], non_blocking=True)
            output = model(images)
            batch_predictions.append(torch.sigmoid(output).cpu().numpy())

    # Stack once per pass instead of re-copying the growing array per batch.
    temp_preds = np.vstack(batch_predictions)

    if predicted_labels is None:
        predicted_labels = temp_preds
    else:
        predicted_labels += temp_preds

predicted_labels /= params['num_tta']

In [None]:
# Show how many predictions were produced.
len(predicted_labels)

In [None]:
# Pair every test id with its predicted probability.
sub_df = test_df[['id']].copy()
sub_df['target'] = predicted_labels

In [None]:
# Write the submission file without the DataFrame index column.
sub_df.to_csv('submission.csv', index=False)