<a href="https://colab.research.google.com/github/brooklinsantosh/HackLive-3-Guided-Hackathon---NLP/blob/master/DataHack3NLP_SimpleTransformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GPU check

In [None]:
import torch

# Choose the torch device once: CUDA when a GPU is visible, otherwise the CPU.
if not torch.cuda.is_available():
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    # Report how many GPUs are visible and the name of device 0.
    gpu_count = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % gpu_count)

    gpu_name = torch.cuda.get_device_name(0)
    print('We will use the GPU:', gpu_name)

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


# Loading Libraries

In [None]:
!pip install simpletransformers

      Successfully uninstalled ipykernel-4.10.1
Successfully installed GitPython-3.1.11 base58-2.0.1 blinker-1.4 boto3-1.16.4 botocore-1.19.4 configparser-5.0.1 docker-pycreds-0.4.0 enum-compat-0.0.3 gitdb-4.0.5 ipykernel-5.3.4 jmespath-0.10.0 pathtools-0.1.2 pydeck-0.5.0b1 s3transfer-0.3.3 sacremoses-0.0.43 sentencepiece-0.1.94 sentry-sdk-0.19.1 seqeval-1.2.2 shortuuid-1.0.1 simpletransformers-0.48.14 smmap-3.0.4 streamlit-0.69.2 subprocess32-3.5.4 tensorboardx-2.1 tokenizers-0.9.2 tqdm-4.50.2 transformers-3.4.0 validators-0.18.1 wandb-0.10.8 watchdog-0.10.3


In [None]:
import numpy as np
import pandas as pd

from simpletransformers.classification import MultiLabelClassificationModel, MultiLabelClassificationArgs

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score

import wandb



# Data

In [None]:
# The 25 fine-grained tag columns. They are used to index the validation frame
# when tuning thresholds and as the columns written into the submission frame,
# so the order here is the order of the model's label vector.
TARGET_COLS = ['Analysis of PDEs', 'Applications',
               'Artificial Intelligence', 'Astrophysics of Galaxies',
               'Computation and Language', 'Computer Vision and Pattern Recognition',
               'Cosmology and Nongalactic Astrophysics',
               'Data Structures and Algorithms', 'Differential Geometry',
               'Earth and Planetary Astrophysics', 'Fluid Dynamics',
               'Information Theory', 'Instrumentation and Methods for Astrophysics',
               'Machine Learning', 'Materials Science', 'Methodology', 'Number Theory',
               'Optimization and Control', 'Representation Theory', 'Robotics',
               'Social and Information Networks', 'Statistics Theory',
               'Strongly Correlated Electrons', 'Superconductivity',
               'Systems and Control']

# The four coarse topic flag columns. Columns equal to 1 have their names
# prepended to the abstract text in the data-preparation cell.
TOPIC_COLS = ['Computer Science', 'Mathematics', 'Physics', 'Statistics']

In [None]:
def _topic_prefix(df):
    """Build the topic prefix that is prepended to every abstract in ``df``.

    For each row, the names of the ``TOPIC_COLS`` columns whose value equals 1
    are concatenated in ``TOPIC_COLS`` order, each followed by one space
    (e.g. ``'Computer Science Statistics '``). A row with no flagged topic
    yields an empty string.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing every column listed in ``TOPIC_COLS``.

    Returns
    -------
    list of str
        One prefix per row, in row order.
    """
    # One boolean matrix lookup instead of a per-row/per-column ``iloc`` access.
    flags = df[TOPIC_COLS].eq(1).to_numpy()
    return [
        ''.join(name + ' ' for name, is_set in zip(TOPIC_COLS, row) if is_set)
        for row in flags
    ]


train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
ss = pd.read_csv('SampleSubmission_Uqu2HVA.csv')

# One tuple of 25 binary flags per row, in TARGET_COLS order. Driving this from
# TARGET_COLS keeps the label order in sync with the threshold/submission code.
train['labels'] = [tuple(row) for row in train[TARGET_COLS].to_numpy().tolist()]

# Model input = topic names (as plain words) followed by the abstract.
topics = _topic_prefix(train)
train['text'] = topics + train['ABSTRACT']

# The hold-out split is taken before 'len' is added, so trn/val do not carry it.
trn, val = train_test_split(train, test_size=0.2, random_state=2)

# Space-separated token count of each text.
train['len'] = train['text'].str.split(' ').str.len()

# Same text construction for the test set; only the 'text' column is kept.
topics_test = _topic_prefix(test)
tst = pd.DataFrame({'text': topics_test + test['ABSTRACT']})

# Helper Methods

In [None]:
def get_best_thresholds(true, preds):
    """Pick, per label column, the cut-off that maximises binary F1.

    Candidate thresholds are 0.00, 0.01, ..., 0.99. A prediction counts as
    positive when its score is strictly greater than the threshold. When
    several thresholds tie, the smallest one is returned (``np.argmax`` keeps
    the first maximum).

    Parameters
    ----------
    true : array-like, shape (n_samples, n_labels)
        Ground-truth binary indicators.
    preds : array-like, shape (n_samples, n_labels)
        Predicted scores, one column per label.

    Returns
    -------
    list of float
        One threshold per column of ``preds``.
    """
    # Accept DataFrames / nested lists as well as ndarrays.
    true = np.asarray(true)
    preds = np.asarray(preds)

    thresholds = [i / 100 for i in range(100)]
    best_thresholds = []
    # Iterate over however many label columns were passed instead of assuming 25.
    for idx in range(preds.shape[1]):
        col_true = true[:, idx]
        col_pred = preds[:, idx]
        f1_scores = [f1_score(col_true, (col_pred > thresh) * 1) for thresh in thresholds]
        best_thresholds.append(thresholds[int(np.argmax(f1_scores))])
    return best_thresholds

In [None]:
def download_preds(preds_test, file_name = 'hacklive_sub.csv'):
    """Write predictions into the sample submission and download the CSV.

    The values in ``preds_test`` are assigned to the ``TARGET_COLS`` columns of
    the module-level ``ss`` frame (which is modified in place), the frame is
    saved to ``file_name`` without its index, and the saved file is handed to
    ``google.colab.files.download``.

    Parameters
    ----------
    preds_test : array-like
        Values for the ``TARGET_COLS`` columns, one row per row of ``ss``.
    file_name : str
        Path of the CSV to write and download.
    """
    ss[TARGET_COLS] = preds_test
    ss.to_csv(file_name, index=False)

    # Imported only after the CSV has been written, as in the original ordering.
    from google.colab import files as colab_files
    colab_files.download(file_name)

# Configs

In [None]:
model_args = MultiLabelClassificationArgs()

# Early-stopping options are not overridden here, so whatever defaults
# MultiLabelClassificationArgs ships with apply.

# Every option this notebook overrides, applied in the order listed.
training_overrides = {
    # Evaluation during training
    'eval_batch_size': 16,
    'evaluate_during_training': True,
    'evaluate_during_training_silent': False,
    'evaluate_during_training_steps': 1000,
    # Optimisation / reproducibility
    'learning_rate': 0.00002177,
    'manual_seed': 4,
    'max_seq_length': 320,
    'multiprocessing_chunksize': 5000,
    # Caching / saving switches
    'no_cache': True,
    'no_save': True,
    'num_train_epochs': 16,
    'overwrite_output_dir': True,
    'reprocess_input_data': True,
    # Batch sizing
    'train_batch_size': 8,
    'gradient_accumulation_steps': 2,
    'train_custom_parameters_only': False,
    # Checkpointing switches
    'save_eval_checkpoints': False,
    'save_model_every_epoch': False,
}

for option_name, option_value in training_overrides.items():
    setattr(model_args, option_name, option_value)

# Model Training

In [None]:
# Use CUDA only when a GPU is actually present (mirrors the GPU-check cell, which
# falls back to the CPU), and size the classifier head from TARGET_COLS rather
# than a literal so the two cannot drift apart.
model = MultiLabelClassificationModel(
    'roberta',
    'roberta-base',
    use_cuda=torch.cuda.is_available(),
    num_labels=len(TARGET_COLS),
    args=model_args,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultiLabelSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'clas

In [None]:
# Fine-tune on the training split; `val` is passed as the evaluation frame
# (evaluate_during_training is enabled in model_args).
# Left as a bare expression so the returned value is shown as the cell output.
model.train_model(train_df=trn, eval_df=val)

HBox(children=(FloatProgress(value=0.0, max=11203.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=6.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 6', max=351.0, style=ProgressStyle(des…






HBox(children=(FloatProgress(value=0.0, max=2801.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=88.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 6', max=351.0, style=ProgressStyle(des…






HBox(children=(FloatProgress(value=0.0, max=2801.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=88.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 6', max=351.0, style=ProgressStyle(des…






HBox(children=(FloatProgress(value=0.0, max=2801.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=88.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 6', max=351.0, style=ProgressStyle(des…






HBox(children=(FloatProgress(value=0.0, max=2801.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=88.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 6', max=351.0, style=ProgressStyle(des…






HBox(children=(FloatProgress(value=0.0, max=2801.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=88.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 6', max=351.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, max=2801.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=88.0, style=ProgressStyle(descri…







HBox(children=(FloatProgress(value=0.0, max=2801.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=88.0, style=ProgressStyle(descri…





(1050,
 {'LRAP': [0.5204119642260323,
   0.7872954423500944,
   0.8561043547075311,
   0.8758990568121239,
   0.8835648490628084,
   0.8889730814685808,
   0.8899924285056316],
  'eval_loss': [0.17330125262114135,
   0.11765960485420444,
   0.094247703271156,
   0.08338556582616134,
   0.07878231443464756,
   0.07724176918749105,
   0.0764775238931179],
  'global_step': [175, 350, 525, 700, 875, 1000, 1050],
  'train_loss': [0.22723247110843658,
   0.14886417984962463,
   0.15509793162345886,
   0.11223648488521576,
   0.03757068142294884,
   0.06785576045513153,
   0.051411498337984085]})

# Evaluation & Best threshold

In [None]:
# Score the hold-out split. eval_model returns three values: the metrics
# dict, the raw per-label outputs, and the examples it got wrong.
eval_outputs = model.eval_model(val)
result, model_outputs, wrong_predictions = eval_outputs
print(result)

HBox(children=(FloatProgress(value=0.0, max=2801.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=88.0, style=ProgressStyle(descri…


{'LRAP': 0.810255241752497, 'eval_loss': 0.11467986807904461}


In [None]:
best_thresholds = get_best_thresholds(val[TARGET_COLS].values, model_outputs)

# Binarise into a *separate* array. Overwriting `model_outputs` would make this
# cell non-idempotent: a second run would tune thresholds on 0/1 values instead
# of scores, collapsing every threshold to 0.0 and corrupting the thresholds
# later applied to the test predictions.
val_scores = np.asarray(model_outputs)
val_preds = np.zeros(val_scores.shape, dtype=int)
for i, thresh in enumerate(best_thresholds):
    # Same strict `>` comparison that get_best_thresholds used when tuning.
    val_preds[:, i] = val_scores[:, i] > thresh

# NOTE(review): thresholds are tuned and scored on the same split, so this
# micro-F1 is an optimistic estimate.
f1_score(val[TARGET_COLS], val_preds, average='micro')

0.6959434313360625

# Prediction

In [None]:
# predict() is documented to take a plain Python list of strings; passing a list
# also avoids any dependence on the Series having a default 0..n-1 index.
predictions, raw_outputs = model.predict(tst['text'].tolist())

HBox(children=(FloatProgress(value=0.0, max=6002.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))




KeyboardInterrupt: ignored

In [None]:
# Apply the validation-tuned, per-label thresholds to the test scores. The result
# goes into a new integer array so `raw_outputs` keeps the original scores and
# other thresholds can be tried without re-running inference.
test_scores = np.asarray(raw_outputs)
test_preds = np.zeros(test_scores.shape, dtype=int)
for i, thresh in enumerate(best_thresholds):
    test_preds[:, i] = test_scores[:, i] > thresh

# NOTE(review): the file name says "roberta_large_3ep" while the model cell loads
# 'roberta-base' — confirm the intended name before submitting.
download_preds(test_preds, 'hacklive_roberta_large_3ep_threshold_optimized.csv')