In [None]:
!pip install -q git+https://github.com/atamazian/kaggle_nlp_disaster_tweets

## Imports

In [None]:
import numpy as np
import pandas as pd
import os
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from kaggle_nlp_disaster_tweets import LitDataNLP, LitNLPModel

## Config

If you run this notebook on Colab, set `CFG.ROOT_DIR` to the folder that contains `train.csv`, `test.csv` and `sample_submission.csv`.

In [None]:
class CFG:
    """Notebook-wide settings, read by the cells below as class attributes."""

    # Folder holding train.csv, test.csv and sample_submission.csv.
    ROOT_DIR = '../input/nlp-getting-started'

    # Model identifier handed to both the data module and the model.
    MODEL_NAME = 'roberta-base'

    # Training hyper-parameters.
    BATCH_SIZE = 32
    EPOCHS = 5

    # Device used for inference: CUDA when PyTorch can see one, CPU otherwise.
    if torch.cuda.is_available():
        DEVICE = 'cuda'
    else:
        DEVICE = 'cpu'

## Read data

In [None]:
# Load the labelled and the unlabelled CSV from the configured data folder.
df, test_df = (
    pd.read_csv(os.path.join(CFG.ROOT_DIR, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

# Hold out a validation set with a fixed seed so the split is repeatable.
# No test_size is passed, so scikit-learn's default split proportion applies.
train_df, valid_df = train_test_split(df, random_state=42, shuffle=True)

## Define dataset

In [None]:
# Wrap the train / validation / test frames in the project's data module,
# passing along the configured model name and batch size.
dm = LitDataNLP(
    model_name=CFG.MODEL_NAME,
    batch_size=CFG.BATCH_SIZE,
    train_df=train_df,
    valid_df=valid_df,
    test_df=test_df,
)

## Define and train model

In [None]:
# Keep only the single best checkpoint, ranked by the highest 'val_f1_score'
# (presumably logged by LitNLPModel during validation -- confirm).
chk_callback = ModelCheckpoint(
    monitor='val_f1_score',
    filename='model_best',
    save_top_k=1,
    mode='max',
)

model = LitNLPModel(CFG.MODEL_NAME, CFG.EPOCHS)

# Choose the accelerator from CFG.DEVICE instead of hard-coding `gpus=1`:
# the hard-coded form fails on CPU-only runtimes, and the `gpus` argument
# is no longer accepted by recent PyTorch Lightning releases.
trainer = Trainer(
    accelerator='gpu' if CFG.DEVICE == 'cuda' else 'cpu',
    devices=1,
    max_epochs=CFG.EPOCHS,
    callbacks=[chk_callback],
)

trainer.fit(model, dm)

## Inference

In [None]:
test_dataloader = dm.test_dataloader()

print('Predicting labels...')

# Restore the best weights recorded by the checkpoint callback during
# training. Asking the callback for the path (instead of hard-coding
# ./lightning_logs/version_0/...) keeps this cell correct when the notebook
# is trained more than once and Lightning writes to a new version folder.
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only runtime.
checkpoint = torch.load(chk_callback.best_model_path, map_location=CFG.DEVICE)
model.load_state_dict(checkpoint['state_dict'])
model.eval()
model.to(CFG.DEVICE)

# Per-batch logits, moved to the CPU as NumPy arrays.
predictions = []

# Gradients are not needed for prediction.
with torch.no_grad():
    for batch in tqdm(test_dataloader, total=len(test_dataloader)):
        # Each test batch unpacks into exactly two tensors.
        b_input_ids, b_input_mask = (t.to(CFG.DEVICE) for t in batch)

        # Third positional argument is None (presumably the labels, which
        # the test set does not have -- confirm against LitNLPModel.forward).
        outputs = model(b_input_ids, b_input_mask, None)

        # The first element of the model output holds the logits.
        logits = outputs[0].detach().cpu().numpy()

        # Store predictions (the test set has no true labels to store).
        predictions.append(logits)

# Stack all batches along the sample axis and take the highest-scoring class.
flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1)

## Submission

In [None]:
# Take the ids from the sample submission and pair them with the predicted
# class indices; DataFrame construction requires both columns to have the
# same length.
sample_sub = pd.read_csv(os.path.join(CFG.ROOT_DIR, 'sample_submission.csv'))
sub = pd.DataFrame(
    {
        'id': sample_sub['id'].values.tolist(),
        'target': flat_predictions,
    }
)

# Write the submission file without the index column, then preview it.
sub.to_csv('submission.csv', index=False)
sub.head()