In [1]:
!pip -q install transformers

[K     |████████████████████████████████| 778kB 2.7MB/s 
[K     |████████████████████████████████| 3.0MB 12.9MB/s 
[K     |████████████████████████████████| 890kB 31.2MB/s 
[K     |████████████████████████████████| 1.1MB 35.4MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
import torch 
from transformers import RobertaTokenizer,RobertaModel,get_linear_schedule_with_warmup
import transformers 
import torch
import torch.nn as nn 
import pandas as pd 
import numpy as np 
from sklearn import model_selection
from sklearn import metrics
from transformers import AdamW
from sklearn.model_selection import StratifiedKFold
import os 
import random 

In [3]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [4]:
CONFIG = {
    'MAX_LEN':128,
    'TRAIN_BATCH_SIZE':16,
    'VALID_BATCH_SIZE':16,
    'EPOCHS':3,
    'TOKENIZER':RobertaTokenizer.from_pretrained('roberta-base',lowercase=True,truncation=True)
}

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [5]:
import random
from random import randint
import numpy as np
SEED_VAL  = 1000
# Set the seed value all over the place to make this reproducible.
def seed_all(SEED):
  random.seed(SEED_VAL)
  np.random.seed(SEED_VAL)
  torch.manual_seed(SEED_VAL)
  torch.cuda.manual_seed_all(SEED_VAL)
  os.environ['PYTHONHASHSEED'] = str(SEED_VAL)
  torch.backends.cudnn.deterministic = True

# Modeling

In [6]:
#Roberta Class 
class CustomRoberta(nn.Module):
    def __init__(self):
        super(CustomRoberta, self).__init__()
        self.num_labels = 4
        self.roberta = transformers.RobertaModel.from_pretrained("roberta-base", output_hidden_states=False, num_labels=self.num_labels)
        self.dropout = nn.Dropout(p=0.2)
        self.classifier = nn.Linear(768, self.num_labels)

    def forward(self,
                input_ids=None,
                attention_mask=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None):

        _, o2 = self.roberta(input_ids,
                               attention_mask=attention_mask,
                               position_ids=position_ids,
                               head_mask=head_mask,
                               inputs_embeds=inputs_embeds)
        o2 = self.dropout(o2)
        logits = self.classifier(o2)       
        outputs = logits
        return outputs

In [7]:
#Dataset 

class RobertaDataset:
  def __init__(self,tweet,target=None,task='train'):
    self.tweet= tweet
    self.target = target
    self.tokenizer = CONFIG['TOKENIZER']
    self.max_len = CONFIG['MAX_LEN']
    self.task = task
  
  def __len__(self):
    return len(self.tweet)

  def __getitem__(self,item):
    tweet = str(self.tweet[item])
    tweet = ' '.join(tweet.split())


    inputs = self.tokenizer.encode_plus(tweet,
                                        max_length=self.max_len,
                                        pad_to_max_length=True,
                                        add_special_tokens=True,
                                        truncation=True)
    ids = inputs['input_ids']
    mask = inputs['attention_mask']
    


    to_return= {
        'ids':torch.tensor(ids,dtype=torch.long),
        'mask':torch.tensor(mask,dtype=torch.long),
    }
    if (self.task=='train'):

      to_return.update({'target':torch.tensor(self.target[item])})

    return to_return 


In [8]:
def loss_fn(outputs,targets):
  criterion =  nn.CrossEntropyLoss()
  return criterion(outputs,targets)

In [9]:
#Train 
def train_fn(data_loader,model,optimizer,device,sc=None):
  model.train()
  tot_loss = 0
  for bi, d in enumerate(data_loader):
    ids = d['ids']
    mask = d['mask']
    targets = d['target']

    #send them to cuda gpu 
    ids = ids.to(device,dtype=torch.long)
    mask = mask.to(device,dtype=torch.long)
   
    targets = targets.to(device,dtype=torch.long)
    
    optimizer.zero_grad()

    outputs = model(
        ids,
        mask,
    )
    
    loss = loss_fn(outputs,targets)
    tot_loss += loss.item()
    loss.backward()
    optimizer.step()
    if sc:
      sc.step()
  
  print("Training loss for this epoch: ",tot_loss/len(data_loader))


In [10]:
#evaluation function 
def eval_fn(data_loader,model,device):
  model.eval()
  fin_targets = []
  fin_outputs =[]
  tot_loss = 0
  with torch.no_grad():
    for bi, d in enumerate(data_loader):
      ids = d['ids']
      mask = d['mask']
      
      targets = d['target']

      #send them to cuda gpu 
      ids = ids.to(device,dtype=torch.long)
      mask = mask.to(device,dtype=torch.long)
      
     
      targets = targets.to(device,dtype=torch.long)
      
      

      outputs = model(
          ids,
          mask
      )

      loss = loss_fn(outputs,targets)
      tot_loss+=loss.item()
      fin_targets.extend(targets.cpu().detach().numpy())
      fin_outputs.extend(torch.nn.functional.softmax(outputs).cpu().detach().numpy())
  return fin_outputs,fin_targets,tot_loss/(len(data_loader))

In [11]:
#preparing test data
test = pd.read_csv('test.csv')
test_dataset = RobertaDataset(
    tweet=test.text.values,
    task='test'
)

test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=CONFIG['TRAIN_BATCH_SIZE'],
    num_workers=4
)

In [12]:
#function to predict on the test 
def predict_fn(model):
  fin_outputs = []
  with torch.no_grad():
    for bi, d in enumerate(test_data_loader):
      ids = d['ids']
      mask = d['mask']
  
      #send them to cuda gpu 
      ids = ids.to(device,dtype=torch.long)
      mask = mask.to(device,dtype=torch.long)
      
     
      outputs = model(
          ids,
          mask
      )
      fin_outputs.append(torch.nn.functional.softmax(outputs).cpu().detach().numpy())
      alls = np.vstack(fin_outputs)

  return alls

In [13]:
#function to run 5 folds and average their predictions on the test 

def run_folds():
    total_folds=5
    all_preds = []
    losses = []
    seed_all(SEED_VAL)
    dfx = pd.read_csv('train.csv').fillna("none")
    dfx['label'] = dfx['label'].factorize()[0]
    fold=StratifiedKFold(n_splits=total_folds, shuffle=True)
    for i,(train_index, test_index) in enumerate(fold.split(dfx,dfx['label'])):
      print(f'FOLD {i+1}/{total_folds}')
      df_train = dfx.iloc[train_index]
      df_valid = dfx.iloc[test_index]

      train_dataset =RobertaDataset(
          tweet=df_train.text.values,
          target=df_train.label.values,
          task='train'
      )

      train_data_loader = torch.utils.data.DataLoader(
          train_dataset,
          batch_size=CONFIG['TRAIN_BATCH_SIZE'],
          num_workers=4
      )

      valid_dataset =RobertaDataset(
          tweet=df_valid.text.values,
          target=df_valid.label.values,
          task='train'
      )

      valid_data_loader = torch.utils.data.DataLoader(
          valid_dataset,
          batch_size=CONFIG['TRAIN_BATCH_SIZE'],
          num_workers=1
      )

      device = torch.device("cuda")
      model = CustomRoberta()
      model.to(device)
      
      param_optimizer = list(model.named_parameters())
      no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
      optimizer_parameters = [
          {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
          {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
      ]

      num_train_steps = int(len(df_train) / CONFIG['TRAIN_BATCH_SIZE'] * CONFIG['EPOCHS'])
      optimizer = AdamW(optimizer_parameters, lr=5e-5)
      
      #scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=num_train_steps)


      best_accuracy = 0
      for epoch in range(CONFIG['EPOCHS']):
          print("----------------EPOCH "+str(epoch+1)+"---------------------")
          train_fn(train_data_loader, model, optimizer, device#scheduler
                  )
          outputs,targets,losss = eval_fn(valid_data_loader ,model, device)
          print("LOSS for this Epoc on val: ",losss)
      losses.append(losss)
      fold_preds = predict_fn(model)
      all_preds.append(fold_preds)
    print("mean losses over all folds: ",np.mean(losses))
    return  all_preds
       

In [14]:
preds = run_folds()

FOLD 1/5


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…


----------------EPOCH 1---------------------
Training loss for this epoch:  1.0138621868625763




LOSS for this Epoc on val:  0.6021507009863853
----------------EPOCH 2---------------------
Training loss for this epoch:  0.47367497124979574
LOSS for this Epoc on val:  0.3931288029998541
----------------EPOCH 3---------------------
Training loss for this epoch:  0.2686226394387983
LOSS for this Epoc on val:  0.503184124827385




FOLD 2/5
----------------EPOCH 1---------------------
Training loss for this epoch:  0.8687958352027401
LOSS for this Epoc on val:  0.4276110678911209
----------------EPOCH 2---------------------
Training loss for this epoch:  0.36442257151488333
LOSS for this Epoc on val:  0.31662369705736637
----------------EPOCH 3---------------------
Training loss for this epoch:  0.23132509810309257
LOSS for this Epoc on val:  0.360782029107213
FOLD 3/5
----------------EPOCH 1---------------------
Training loss for this epoch:  0.8311054447004872
LOSS for this Epoc on val:  0.55054035410285
----------------EPOCH 2---------------------
Training loss for this epoch:  0.37999659320039136
LOSS for this Epoc on val:  0.4617908578366041
----------------EPOCH 3---------------------
Training loss for this epoch:  0.24741336270686118
LOSS for this Epoc on val:  0.5367231331765652
FOLD 4/5
----------------EPOCH 1---------------------
Training loss for this epoch:  0.9351940578030001
LOSS for this Epoc on va

In [15]:
preds_1 = np.mean(preds,axis=0)

In [16]:
sub=pd.DataFrame()
sub['ID'] = test['ID']
sub['Depression'] = preds_1[:,0]
sub['Alcohol'] = preds_1[:,3]
sub['Suicide'] = preds_1[:,2]
sub['Drugs'] = preds_1[:,1]
sub.head()

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0.639994,0.087179,0.265698,0.007129
1,03BMGTOK,0.994833,0.000817,0.003632,0.000717
2,03LZVFM6,0.995323,0.000801,0.003121,0.000755
3,0EPULUM5,0.994547,0.000784,0.003973,0.000696
4,0GM4C5GD,0.004244,0.303252,0.006641,0.685863


In [17]:
sub.to_csv("Roberta_Winning_Solution.csv",index=False)