<a href="https://colab.research.google.com/github/anindabitm/Zindi_mental_health/blob/master/Zindi_mental_health_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook is based on the wonderful notebook and video made by Abhishek Thakur, and on the NLP Albumentations idea from this kernel:
https://www.kaggle.com/shonenkov/nlp-albumentations

In [None]:
# Show which GPU (if any) is attached to this runtime.
!nvidia-smi

In [None]:
# TextBlob is imported below and used for spelling correction.
# NOTE(review): consider `%pip install textblob==<version>` so the install targets the
# kernel's environment and is reproducible — confirm the version to pin.
!pip install textblob

In [None]:
# `contractions` is imported below and used by clean_text() via contractions.fix().
# NOTE(review): consider `%pip install contractions==<version>` so the install targets the
# kernel's environment and is reproducible — confirm the version to pin.
!pip install contractions

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import re
import string
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob 
import random
from nltk import sent_tokenize
from tqdm import tqdm
from albumentations.core.transforms_interface import DualTransform, BasicTransform
from sklearn.model_selection import StratifiedKFold
import contractions

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
class NLPTransform(BasicTransform):
    """Common base for the text augmentations defined in this notebook."""

    @property
    def targets(self):
        # The ``data`` keyword given to the pipeline is handled by ``apply``.
        return dict(data=self.apply)

    def update_params(self, params, **kwargs):
        """Copy ``interpolation`` / ``fill_value`` into ``params`` when the instance has them."""
        for attr_name in ("interpolation", "fill_value"):
            if hasattr(self, attr_name):
                params[attr_name] = getattr(self, attr_name)
        return params

    def get_sentences(self, text):
        """Split ``text`` into a list of sentences using NLTK's ``sent_tokenize``."""
        return sent_tokenize(text)

In [None]:
class ShuffleSentencesTransform(NLPTransform):
    """Put the sentences of a text into a random order."""

    def __init__(self, always_apply=False, p=0.5):
        super().__init__(always_apply, p)

    def apply(self, data, **params):
        """Return ``data`` with its sentences shuffled and re-joined by single spaces."""
        shuffled = self.get_sentences(data)
        random.shuffle(shuffled)  # in-place shuffle of the freshly built list
        return ' '.join(shuffled)

In [None]:
class ExcludeDuplicateSentencesTransform(NLPTransform):
    """Keep only the first occurrence of each (stripped) sentence."""

    def __init__(self, always_apply=False, p=0.5):
        super().__init__(always_apply, p)

    def apply(self, data, **params):
        """Return ``data`` with repeated sentences removed, original order kept."""
        stripped = (sentence.strip() for sentence in self.get_sentences(data))
        # dict keys preserve first-seen order and silently drop later repeats.
        return ' '.join(dict.fromkeys(stripped))

In [None]:
class ExcludeNumbersTransform(NLPTransform):
    """Strip every ASCII digit from the text."""

    def __init__(self, always_apply=False, p=0.5):
        super().__init__(always_apply, p)

    def apply(self, data, **params):
        """Delete digits, then collapse each whitespace run into one space."""
        without_digits = re.sub(r'[0-9]', '', data)
        return re.sub(r'\s+', ' ', without_digits)

In [None]:
class ExcludeHashtagsTransform(NLPTransform):
    """Remove tokens introduced by ``#``."""

    def __init__(self, always_apply=False, p=0.5):
        super().__init__(always_apply, p)

    def apply(self, data, **params):
        """Delete hashtags, then collapse each whitespace run into one space."""
        without_tags = re.sub(r'#[\S]+\b', '', data)
        return re.sub(r'\s+', ' ', without_tags)

In [None]:
class ExcludeUsersMentionedTransform(NLPTransform):
    """Remove tokens introduced by ``@`` (user mentions)."""

    def __init__(self, always_apply=False, p=0.5):
        super().__init__(always_apply, p)

    def apply(self, data, **params):
        """Delete @mentions, then collapse each whitespace run into one space."""
        without_mentions = re.sub(r'@[\S]+\b', '', data)
        return re.sub(r'\s+', ' ', without_mentions)

In [None]:
class ExcludeUrlsTransform(NLPTransform):
    """Remove tokens that start with ``http`` / ``https``."""

    def __init__(self, always_apply=False, p=0.5):
        super().__init__(always_apply, p)

    def apply(self, data, **params):
        """Delete URLs, then collapse each whitespace run into one space."""
        without_urls = re.sub(r'https?\S+', '', data)
        return re.sub(r'\s+', ' ', without_urls)

In [None]:
class SwapWordsTransform(NLPTransform):
    """Randomly swap a word with one of the words shortly before it."""

    def __init__(self, swap_distance=1, swap_probability=0.1, always_apply=False, p=0.5):
        """
        swap_distance - maximum number of positions back a word may be swapped
        swap_probability - per-word probability of attempting a swap
        """
        super().__init__(always_apply, p)
        self.swap_distance = swap_distance
        self.swap_probability = swap_probability
        self.swap_range_list = list(range(1, swap_distance+1))

    def apply(self, data, **params):
        """Return ``data`` with some whitespace-separated words swapped backwards."""
        tokens = data.split()
        if len(tokens) <= 1:
            return data

        result = []
        for position, token in enumerate(tokens):
            # The random draw always happens first, one per word.
            keep_in_place = random.random() > self.swap_probability
            # Words too close to the start have no partner to swap with.
            if keep_in_place or position < self.swap_distance:
                result.append(token)
                continue

            partner = position - random.choice(self.swap_range_list)
            # The earlier word moves to the current slot, the current word takes its place.
            result.append(result[partner])
            result[partner] = token

        return ' '.join(result)

In [None]:
class CutOutWordsTransform(NLPTransform):
    """Drop individual words at random."""

    def __init__(self, cutout_probability=0.05, always_apply=False, p=0.5):
        super().__init__(always_apply, p)
        self.cutout_probability = cutout_probability

    def apply(self, data, **params):
        """Return ``data`` with each word independently removed with ``cutout_probability``."""
        tokens = data.split()
        if len(tokens) <= 1:
            return data

        # One random draw per word; a word survives unless its draw falls below the threshold.
        survivors = [
            token for token in tokens
            if not (random.random() < self.cutout_probability)
        ]

        if not survivors:
            # Everything was cut: fall back to a single randomly chosen original word.
            return tokens[random.randint(0, len(tokens)-1)]

        return ' '.join(survivors)


In [None]:
import albumentations

def get_train_transforms():
    """Build the text-augmentation pipeline; every step fires with probability 0.5."""
    pipeline = [
        # Deliberately not p=1.0, so the network still sees some duplicated sentences.
        ExcludeDuplicateSentencesTransform(p=0.5),
        ShuffleSentencesTransform(p=0.5),
        ExcludeNumbersTransform(p=0.5),
        ExcludeHashtagsTransform(p=0.5),
        ExcludeUsersMentionedTransform(p=0.5),
        ExcludeUrlsTransform(p=0.5),
        CutOutWordsTransform(p=0.5),
        SwapWordsTransform(p=0.5),
    ]
    return albumentations.Compose(pipeline)

In [None]:
def clean_text(text):
    """Normalise a raw text string.

    Steps, in order: expand contractions, lowercase, drop text in square
    brackets, drop URLs, drop angle-bracket tags, drop punctuation, drop
    newlines, and drop words that contain a digit.

    Args:
        text: the raw input string.

    Returns:
        The cleaned string. Whitespace runs are not collapsed here.
    """
    text = contractions.fix(text)
    text = text.lower()
    # Raw strings keep the regex escapes from being read as (invalid) string
    # escapes, which newer Python versions warn about.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # NOTE(review): newlines are deleted rather than replaced by a space, so words
    # on adjacent lines are joined together — confirm this is intended.
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

def remove_stopwords(text):
    """Filter English stop words out of an iterable of tokens.

    Args:
        text: iterable of tokens. A plain string would be iterated character
            by character, so pass a token list.

    Returns:
        List of the tokens that are not in NLTK's English stop-word list,
        in their original order.
    """
    # Load the stop-word list once instead of once per token; a set gives O(1) lookups.
    english_stopwords = set(stopwords.words('english'))
    return [w for w in text if w not in english_stopwords]

def combine_text(list_of_text):
    '''Join the given strings into one string, separated by single spaces.'''
    separator = ' '
    return separator.join(list_of_text)

def text_preprocessing(text):
    """
    Return ``text`` after cleaning it with ``clean_text``.

    No tokenisation or stop-word removal is applied here.
    """
    return clean_text(text)

In [None]:
# Load the training data; the bare `df.shape` shows (rows, columns) as the cell output.
df=pd.read_csv('/kaggle/input/zindimentalhealth/Train.csv')
df.shape

In [None]:
# Preview the first rows of the raw training frame.
df.head()

In [None]:
# Spell-correct every training text with TextBlob.
# A new list is built instead of assigning into `df.text.values`: writing through
# that array relies on it being a writable view of the frame, which is not
# guaranteed (pandas Copy-on-Write returns a read-only array).
all_text = [str(TextBlob(raw_text).correct()) for raw_text in df.text.values]

all_text[:5]

In [None]:
# Store the spell-corrected texts back into the frame and preview the result.
df['text']=all_text
df.head()

In [None]:
# Run the cleaning function over every training text.
df['text'] = df['text'].apply(text_preprocessing)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the label column with integer class ids produced by LabelEncoder.
le=LabelEncoder()
df['label'] = le.fit_transform(df['label'])
# Cell output: number of rows per encoded class.
df.label.value_counts()

In [None]:
# Preview the first 20 rows after cleaning and label encoding.
df.head(20)

In [None]:
# Count missing values per column.
df.isnull().sum()

# Let's try BERT!

In [None]:
#Configuration

import transformers
import tokenizers

# Sequence length passed as `max_length` to the tokenizers in the Dataset classes.
MAX_LEN = 512
# Batch sizes for the training loader and for the validation/test loaders.
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# Number of passes over the training fold, per CV fold.
EPOCHS = 5
#BERT_PATH = "/kaggle/input/bert-pytorch/"
#BERT_PATH = "/kaggle/input/bert-base-uncased/"
#MODEL_PATH = "bert-large-uncased-pytorch_model.bin"
#TRAINING_FILE = "../input/imdb.csv"
# Tokenizer used by BERTDataset.
TOKENIZER_BERT = transformers.BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True
)

# Tokenizer used by ROBERTADataset.
# NOTE(review): `lowercase` does not look like a RobertaTokenizer option — confirm it has any effect.
TOKENIZER_ROBERTA = transformers.RobertaTokenizer.from_pretrained(
    'roberta-base',
    lowercase=True,
    add_prefix_space=True,
)

In [None]:
import torch
import torch.nn.functional as F

class BERTDataset:
    """Map-style dataset that tokenizes texts for the BERT model.

    Each item is a dict of ``torch.long`` tensors with keys ``ids``, ``mask``,
    ``token_type_ids`` and ``label``.
    """

    def __init__(self,text,label,train_transforms=None):
        self.text, self.label = text, label
        self.tokenizer = TOKENIZER_BERT
        self.max_len = MAX_LEN
        self.train_transforms = train_transforms

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # Collapse every whitespace run to a single space before augmenting.
        sample = " ".join(str(self.text[item]).split())
        if self.train_transforms:
            sample = self.train_transforms(data=sample)['data']

        encoded = self.tokenizer.encode_plus(
            sample,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )

        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        return {
            'ids': as_long(encoded["input_ids"]),
            'mask': as_long(encoded["attention_mask"]),
            'token_type_ids': as_long(encoded["token_type_ids"]),
            'label': as_long(self.label[item]),
        }

In [None]:
class ROBERTADataset:
    """Map-style dataset that tokenizes texts for the RoBERTa model.

    Each item is a dict of ``torch.long`` tensors with keys ``ids``, ``mask``
    and ``label``. Token type ids are requested from the tokenizer but are not
    part of the returned item.
    """

    def __init__(self,text,label,train_transforms=None):
        self.text, self.label = text, label
        self.tokenizer = TOKENIZER_ROBERTA
        self.max_len = MAX_LEN
        self.train_transforms = train_transforms

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # Collapse every whitespace run to a single space before augmenting.
        sample = " ".join(str(self.text[item]).split())
        if self.train_transforms:
            sample = self.train_transforms(data=sample)['data']

        encoded = self.tokenizer.encode_plus(
            sample,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )

        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        return {
            'ids': as_long(encoded["input_ids"]),
            'mask': as_long(encoded["attention_mask"]),
            'label': as_long(self.label[item]),
        }

In [None]:
import torch.nn.functional as F
import torch.nn as nn

def linear_combination(x, y, epsilon):
    """Blend ``x`` and ``y``: weight ``epsilon`` on ``x`` and ``1 - epsilon`` on ``y``."""
    weight_y = 1-epsilon
    return epsilon*x + weight_y*y

def reduce_loss(loss, reduction='mean'):
    """Reduce ``loss`` by mean or sum; any other ``reduction`` returns it unchanged."""
    if reduction == 'mean':
        return loss.mean()
    if reduction == 'sum':
        return loss.sum()
    return loss


class LabelSmoothingCrossEntropy(nn.Module):
    """Cross-entropy blended with a uniform-target term.

    The result is ``epsilon * (mean negative log-prob over classes)`` plus
    ``(1 - epsilon) * NLL``, each reduced according to ``reduction``.
    """

    def __init__(self, epsilon:float=0.05, reduction='mean'):
        super().__init__()
        self.epsilon = epsilon
        self.reduction = reduction

    def forward(self, preds, target):
        num_classes = preds.size(-1)
        log_probs = F.log_softmax(preds, dim=-1)
        # Loss against a uniform distribution: negative log-probs summed over classes, divided by class count.
        uniform_term = reduce_loss(-log_probs.sum(dim=-1), self.reduction) / num_classes
        # Ordinary negative log-likelihood of the true class.
        nll_term = F.nll_loss(log_probs, target, reduction=self.reduction)
        return linear_combination(uniform_term, nll_term, self.epsilon)

In [None]:
import torch
import torch.nn as nn
from tqdm.notebook import tqdm

# Prefer the GPU, but fall back to the CPU so the notebook still runs without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Loss shared by all train/eval helpers below; the label-smoothing variant is kept as an alternative.
loss_fn = nn.CrossEntropyLoss().to(device)
#loss_fn = LabelSmoothingCrossEntropy().to(device)



def train_bert(data_loader, model, optimizer, device, scheduler,n_example):
    """Train ``model`` for one pass over ``data_loader``.

    Batches are dicts with ``ids``, ``token_type_ids``, ``mask`` and ``label``.
    The optimizer and the scheduler are both stepped once per batch, and the
    module-level ``loss_fn`` is used as the criterion.

    Returns:
        (accuracy, mean_loss): correct predictions divided by ``n_example``
        (as a double tensor) and the mean of the per-batch losses.
    """
    model.train()
    batch_losses = []
    correct_predictions = 0
    for batch in tqdm(data_loader, total=len(data_loader)):
        ids, token_type_ids, mask, label = (
            batch[key].to(device, dtype=torch.long)
            for key in ("ids", "token_type_ids", "mask", "label")
        )

        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        loss = loss_fn(outputs, label)
        preds = torch.max(outputs, dim=1)[1]  # index of the highest logit per row
        correct_predictions += torch.sum(preds == label)
        batch_losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()

    return correct_predictions.double()/n_example,np.mean(batch_losses)

                      


def eval_bert(data_loader, model, device,n_example):
    """Evaluate ``model`` on ``data_loader`` with gradients disabled.

    Batches are dicts with ``ids``, ``token_type_ids``, ``mask`` and ``label``;
    the module-level ``loss_fn`` is used as the criterion.

    Returns:
        (accuracy, mean_loss): correct predictions divided by ``n_example``
        (as a double tensor) and the mean of the per-batch losses.
    """
    model.eval()
    batch_losses = []
    correct_predictions = 0
    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            ids, token_type_ids, mask, label = (
                batch[key].to(device, dtype=torch.long)
                for key in ("ids", "token_type_ids", "mask", "label")
            )

            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            loss = loss_fn(outputs, label)
            preds = torch.max(outputs, dim=1)[1]  # index of the highest logit per row
            correct_predictions += torch.sum(preds == label)
            batch_losses.append(loss.item())

    return correct_predictions.double()/n_example,np.mean(batch_losses)

In [None]:
def train_roberta(data_loader, model, optimizer, device, scheduler,n_example):
    """Train ``model`` for one pass over ``data_loader``.

    Batches are dicts with ``ids``, ``mask`` and ``label`` (no token type ids).
    The optimizer and the scheduler are both stepped once per batch, and the
    module-level ``loss_fn`` is used as the criterion.

    Returns:
        (accuracy, mean_loss): correct predictions divided by ``n_example``
        (as a double tensor) and the mean of the per-batch losses.
    """
    model.train()
    batch_losses = []
    correct_predictions = 0
    for batch in tqdm(data_loader, total=len(data_loader)):
        ids, mask, label = (
            batch[key].to(device, dtype=torch.long)
            for key in ("ids", "mask", "label")
        )

        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask)
        loss = loss_fn(outputs, label)
        preds = torch.max(outputs, dim=1)[1]  # index of the highest logit per row
        correct_predictions += torch.sum(preds == label)
        batch_losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()

    return correct_predictions.double()/n_example,np.mean(batch_losses)

                      


def eval_roberta(data_loader, model, device,n_example):
    """Evaluate ``model`` on ``data_loader`` with gradients disabled.

    Batches are dicts with ``ids``, ``mask`` and ``label`` (no token type ids);
    the module-level ``loss_fn`` is used as the criterion.

    Returns:
        (accuracy, mean_loss): correct predictions divided by ``n_example``
        (as a double tensor) and the mean of the per-batch losses.
    """
    model.eval()
    batch_losses = []
    correct_predictions = 0
    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            ids, mask, label = (
                batch[key].to(device, dtype=torch.long)
                for key in ("ids", "mask", "label")
            )

            outputs = model(ids=ids, mask=mask)
            loss = loss_fn(outputs, label)
            preds = torch.max(outputs, dim=1)[1]  # index of the highest logit per row
            correct_predictions += torch.sum(preds == label)
            batch_losses.append(loss.item())

    return correct_predictions.double()/n_example,np.mean(batch_losses)

In [None]:

import transformers
import torch.nn as nn


class BERTBaseUncased(nn.Module):
    """``bert-base-uncased`` encoder followed by dropout and a linear classifier.

    Args:
        num_labels: number of output classes (default 4, as before).
        dropout: dropout probability applied to the pooled output (default 0.3).
    """

    def __init__(self, num_labels=4, dropout=0.3):
        super(BERTBaseUncased, self).__init__()
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.bert_drop = nn.Dropout(dropout)
        # Read the encoder width from its config instead of hard-coding 768.
        self.out = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, ids, mask,token_type_ids):
        """Return unnormalised class scores for a batch of token ids."""
        encoder_outputs = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )
        # Index rather than tuple-unpack: position 1 is the pooled output both for
        # a plain tuple and for a dict-like model output (unpacking the latter
        # would yield its keys instead of tensors).
        pooled = encoder_outputs[1]
        return self.out(self.bert_drop(pooled))

In [None]:
class ROBERTABase(nn.Module):
    """``roberta-base`` encoder followed by dropout and a linear classifier.

    Args:
        num_labels: number of output classes (default 4, as before).
        dropout: dropout probability applied to the pooled output (default 0.3).
    """

    def __init__(self, num_labels=4, dropout=0.3):
        super(ROBERTABase, self).__init__()
        self.roberta = transformers.RobertaModel.from_pretrained('roberta-base')
        self.bert_drop = nn.Dropout(dropout)
        # Read the encoder width from its config instead of hard-coding 768.
        self.out = nn.Linear(self.roberta.config.hidden_size, num_labels)

    def forward(self, ids, mask):
        """Return unnormalised class scores for a batch of token ids."""
        encoder_outputs = self.roberta(
            ids,
            attention_mask=mask,
        )
        # Index rather than tuple-unpack: position 1 is the pooled output both for
        # a plain tuple and for a dict-like model output (unpacking the latter
        # would yield its keys instead of tensors).
        pooled = encoder_outputs[1]
        return self.out(self.bert_drop(pooled))

In [None]:
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
import gc

from sklearn import model_selection
from sklearn import metrics
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup


# ---- Test data: same spell-correction and cleaning as the training data ----
df_test=pd.read_csv('/kaggle/input/zindimentalhealth/Test.csv')
df_test['label'] = 0  # placeholder so the test frame fits the Dataset interface
# Build a new list instead of writing into `df_test.text.values`; that array is
# not guaranteed to be a writable view of the frame (pandas Copy-on-Write).
all_text = [str(TextBlob(raw_text).correct()) for raw_text in df_test.text.values]
df_test['text'] = all_text
df_test['text'] = df_test['text'].apply(text_preprocessing)


test_dataset = BERTDataset(
        text=df_test.text.values,
        label=df_test.label.values,
        train_transforms=None
    )

test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=VALID_BATCH_SIZE,
        num_workers=1
    )


X = df['text']
y = df['label']

skf = StratifiedKFold(n_splits=10)
fold = 0
# Sum of per-fold class probabilities; one row per test sample (was hard-coded to 309).
predictions_bert=np.zeros(shape=(len(df_test),4))

for train_index, test_index in skf.split(X, y):
    print('In fold #',fold+1)
    print("#"*80)
    df_train = df.iloc[train_index,:]
    df_valid = df.iloc[test_index,:]
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    # Augmentations are applied to the training fold only.
    train_dataset = BERTDataset(
        text=df_train.text.values,
        label=df_train.label.values,
        train_transforms=get_train_transforms()
        )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        num_workers=4
        )

    valid_dataset = BERTDataset(
        text=df_valid.text.values,
        label=df_valid.label.values,
        train_transforms=None
        )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=VALID_BATCH_SIZE,
        num_workers=1
        )

    # Fall back to the CPU when no GPU is present.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BERTBaseUncased()
    model.to(device)

    # No weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        ]

    # The scheduler is stepped once per batch, so count real batches (the last,
    # partial batch included) rather than len(df_train) / batch size.
    num_train_steps = len(train_data_loader) * EPOCHS
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
                    optimizer,
                    num_warmup_steps=0,
                    num_training_steps=num_train_steps
                    )

    #model = nn.DataParallel(model)

    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint; otherwise a fold could reload the previous fold's weights.
    best_accuracy = -1.0
    logloss = 1.0
    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 70)
        train_acc, train_loss = train_bert(train_data_loader, model, optimizer, device, scheduler,len(df_train))
        print(f'Train accuracy {train_acc} & Training loss {train_loss}')
        val_acc,val_loss = eval_bert(valid_data_loader, model, device,len(df_valid))
        print(f'Validation accuracy {val_acc} & Validation loss {val_loss}')
        if val_acc > best_accuracy:
            torch.save(model.state_dict(), 'pytorch_model_bert.bin')
            best_accuracy = val_acc
            print('Model saved!!!')

    # Reload the best checkpoint of this fold and predict the test set with it.
    model = BERTBaseUncased()
    model.load_state_dict(torch.load('pytorch_model_bert.bin', map_location=device))
    model = model.to(device)
    model = model.eval()
    prediction_probs = []
    with torch.no_grad():
        for d in tqdm(test_data_loader, total=len(test_data_loader)):
            ids = d["ids"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            outputs = model(ids=ids,
                            mask=mask,
                            token_type_ids=token_type_ids
                           )
            prediction_probs.append(F.softmax(outputs, dim=1).cpu())

    prediction_probs = torch.cat(prediction_probs).numpy()
    predictions_bert += prediction_probs
    del model
    gc.collect()
    fold += 1


In [None]:
# Re-wrap the already pre-processed test frame with the RoBERTa tokenizer.
test_dataset = ROBERTADataset(
        text=df_test.text.values,
        label=df_test.label.values,
        train_transforms=None
    )

test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=VALID_BATCH_SIZE,
        num_workers=1
    )


X = df['text']
y = df['label']

skf = StratifiedKFold(n_splits=10)
fold = 0
# Sum of per-fold class probabilities; one row per test sample (was hard-coded to 309).
predictions_roberta=np.zeros(shape=(len(df_test),4))

for train_index, test_index in skf.split(X, y):
    print('In fold #',fold+1)
    print("#"*80)
    df_train = df.iloc[train_index,:]
    df_valid = df.iloc[test_index,:]
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    # Augmentations are applied to the training fold only.
    train_dataset = ROBERTADataset(
        text=df_train.text.values,
        label=df_train.label.values,
        train_transforms=get_train_transforms()
        )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        num_workers=4
        )

    valid_dataset = ROBERTADataset(
        text=df_valid.text.values,
        label=df_valid.label.values,
        train_transforms=None
        )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=VALID_BATCH_SIZE,
        num_workers=1
        )

    # Fall back to the CPU when no GPU is present.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = ROBERTABase()
    model.to(device)

    # No weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        ]

    # The scheduler is stepped once per batch, so count real batches (the last,
    # partial batch included) rather than len(df_train) / batch size.
    num_train_steps = len(train_data_loader) * EPOCHS
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
                    optimizer,
                    num_warmup_steps=0,
                    num_training_steps=num_train_steps
                    )

    #model = nn.DataParallel(model)

    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint; otherwise a fold could reload the previous fold's weights.
    best_accuracy = -1.0
    logloss = 1.0
    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 70)
        train_acc, train_loss = train_roberta(train_data_loader, model, optimizer, device, scheduler,len(df_train))
        print(f'Train accuracy {train_acc} & Training loss {train_loss}')
        val_acc,val_loss = eval_roberta(valid_data_loader, model, device,len(df_valid))
        print(f'Validation accuracy {val_acc} & Validation loss {val_loss}')
        if val_acc > best_accuracy:
            torch.save(model.state_dict(), 'pytorch_model_roberta.bin')
            best_accuracy = val_acc
            print('Model saved!!!')

    # Reload the best checkpoint of this fold and predict the test set with it.
    model = ROBERTABase()
    model.load_state_dict(torch.load('pytorch_model_roberta.bin', map_location=device))
    model = model.to(device)
    model = model.eval()
    prediction_probs = []
    with torch.no_grad():
        for d in tqdm(test_data_loader, total=len(test_data_loader)):
            ids = d["ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            outputs = model(ids=ids,
                            mask=mask,
                           )
            prediction_probs.append(F.softmax(outputs, dim=1).cpu())

    prediction_probs = torch.cat(prediction_probs).numpy()
    predictions_roberta += prediction_probs
    del model
    gc.collect()
    fold += 1

In [None]:
# Both accumulators hold the SUM of per-fold probabilities, so divide by the fold
# count taken from the splitter (instead of a hard-coded 10) before blending
# 30% BERT with 70% RoBERTa.
n_folds = skf.get_n_splits()
predictions = ((predictions_bert/n_folds)*0.30 +(predictions_roberta/n_folds)*0.70)

In [None]:
# Load the sample submission; its prediction columns are overwritten below.
ss=pd.read_csv('/kaggle/input/zindimentalhealth/SampleSubmission.csv')
ss.head()

In [None]:
# Display (rows, columns) of the submission frame.
ss.shape

In [None]:
# Column k of `predictions` goes into the k-th class column of the submission frame.
for class_idx, class_name in enumerate(('Alcohol', 'Depression', 'Drugs', 'Suicide')):
    ss[class_name] = predictions[:, class_idx]

In [None]:
# Preview the filled-in submission frame.
ss.head()

In [None]:
# Write the blended predictions to the submission CSV, without the index column.
ss.to_csv('my_sub_roberta_bert_ensmbl.csv',index=False)