In [1]:
CUDA_LAUNCH_BLOCKING="1"

In [2]:
import string
%matplotlib inline
import os
# os.listdir("../input/")
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
os.system('pip install pytorch_pretrained_bert --no-index --find-links="../input/pytorch-pretrained-bert/pytorch_pretrained_bert" ')
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers.models.bert.modeling_bert import BertPreTrainedModel, BertModel
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
import random
import warnings
warnings.filterwarnings("ignore")

SEED = 42
BATCH_SIZE = 32
MAX_LENGTH = 256

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if n_gpu > 0:
    torch.cuda.manual_seed_all(SEED)

In [3]:
def children(m):
    return m if isinstance(m, (list, tuple)) else list(m.children())


def set_trainable_attr(m, b):
    m.trainable = b
    for p in m.parameters():
        p.requires_grad = b


def apply_leaf(m, f):
    c = children(m)
    if isinstance(m, nn.Module):
        f(m)
    if len(c) > 0:
        for l in c:
            apply_leaf(l, f)


def set_trainable(l, b):
    apply_leaf(l, lambda m: set_trainable_attr(m, b))

In [4]:
# pip install pytorch_pretrained_bert

In [5]:
class BertForSequenceRegression(BertPreTrainedModel):
    def __init__(self, config):
        super(BertForSequenceRegression, self).__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.dense = nn.Linear(config.hidden_size, 1)

    def forward(self, input_ids,  token_type_ids=None, attention_mask=None, targets=None):
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, return_dict=False)
        pooled_output = self.dropout(pooled_output)
        outputs = self.dense(pooled_output)
        return outputs.view(-1)

In [6]:
def RMSELoss(outputs, targets):
    return torch.sqrt(nn.MSELoss()(outputs, targets))

In [7]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import KBinsDiscretizer

class regressor_stratified_cv:
    def __init__(self, n_splits = 10, n_repeats = 2, group_count = 10,
                 random_state = 0, strategy = 'quantile'):
        self.group_count = group_count
        self.strategy = strategy
        self.cvkwargs = dict(n_splits = n_splits, n_repeats = n_repeats, 
                             random_state = random_state)
        self.cv = RepeatedStratifiedKFold(**self.cvkwargs)
        self.discretizer = KBinsDiscretizer(n_bins = self.group_count, encode = 'ordinal',
                                            strategy = self.strategy)  
            
    def split(self, X, y, groups = None):
        kgroups=self.discretizer.fit_transform(y[:, None])[:, 0]
        return self.cv.split(X, kgroups, groups)
    
    def get_n_splits(self, X, y, groups = None):
        return self.cv.get_n_splits(X, y, groups)

In [8]:
def text_preprocessing(excerpt):
    
    # lower casing
    excerpt = excerpt.lower()

    # removal of punctuation
    excerpt = excerpt.translate(str.maketrans('', '', string.punctuation))

        
    # removal of stopwords
    from nltk.corpus import stopwords
    ", ".join(stopwords.words('english'))
    STOPWORDS = set(stopwords.words('english'))
    excerpt = " ".join([word for word in str(excerpt).split() if word not in STOPWORDS])
        
    # lemmatization 
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    excerpt = " ".join([lemmatizer.lemmatize(word) for word in excerpt.split()])
        
                
    return excerpt

In [9]:
df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
2829,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2830,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2831,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2832,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


In [10]:
df["text"] = df["excerpt"].apply(lambda x: text_preprocessing(x))

In [11]:
excerpts = df.text.values
targets = df.target.values

In [12]:
BERT_FP = '../input/bert-base-uncased'

In [13]:
# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained(BERT_FP, do_lower_case=True)

In [14]:
# convert sentences into tokens
input_ids = [tokenizer.encode(excerpt, add_special_tokens = True, max_length = MAX_LENGTH,
                              padding='max_length') for excerpt in excerpts]

input_ids = np.array(input_ids)
attention_masks = []
# create a mask of 1 for all input tokens and 0 for all padding tokens
attention_masks = [[float(i>0) for i in seq] for seq in input_ids]
attention_masks = np.array(attention_masks)
# create token type ids
token_type_ids = [[0 for i in seq] for seq in input_ids]
token_type_ids = np.array(token_type_ids)

In [15]:
# n_splits = 5
# n_repeats = 2
# group_count = 10
# cv = regressor_stratified_cv(n_splits = n_splits, n_repeats = n_repeats,
#                            group_count = group_count, random_state = 0, strategy = 'quantile')

# for train_index, test_index in cv.split(input_ids, targets):
#     train_inputs, test_inputs = input_ids[train_index], input_ids[test_index]
#     train_targets, test_targets = targets[train_index], targets[test_index]
#     train_masks, test_masks = attention_masks[train_index], attention_masks[test_index]
#     train_type_ids, test_type_ids = token_type_ids[train_index], token_type_ids[test_index]

In [16]:
# train_inputs = torch.tensor(train_inputs, dtype=torch.long)
# test_inputs = torch.tensor(test_inputs, dtype=torch.long)
# train_targets = torch.tensor(train_targets, dtype=torch.float)
# test_targets = torch.tensor(test_targets, dtype=torch.float)
# train_masks = torch.tensor(train_masks, dtype=torch.long)
# test_masks = torch.tensor(test_masks, dtype=torch.long)
# train_type_ids = torch.tensor(train_type_ids, dtype=torch.long)
# test_type_ids = torch.tensor(test_type_ids, dtype=torch.long)

In [17]:
# train_data = TensorDataset(train_inputs, train_masks, train_type_ids, train_targets)
# train_sampler = RandomSampler(train_data)
# train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = BATCH_SIZE)

# test_data = TensorDataset(test_inputs, test_masks, test_type_ids, test_targets)
# test_sampler = RandomSampler(test_data)
# test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = BATCH_SIZE)

In [18]:
# model = BertForSequenceRegression.from_pretrained(
#     BERT_FP, 
# )
# model.to(device)

In [19]:
# # Get all of the model's parameters as a list of tuples.
# params = list(model.named_parameters())

# print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# print('==== Embedding Layer ====\n')

# for p in params[0:5]:
#     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

# print('\n==== First Transformer ====\n')

# for p in params[5:21]:
#     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

# print('\n==== Output Layer ====\n')

# for p in params[-4:]:
#     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

In [20]:
# optimizer = AdamW(model.parameters(),
#                   lr = 5e-4,
#                   eps = 1e-6 
#                 )

In [21]:
from transformers import get_linear_schedule_with_warmup

epochs = 3

# total_steps = len(train_dataloader) * epochs

# scheduler = get_linear_schedule_with_warmup(optimizer, 
#                                             num_warmup_steps = 0,
#                                             num_training_steps = total_steps)

In [22]:
# train only last layers
# set_trainable(model, True)
# set_trainable(model.bert.embeddings, False)
# set_trainable(model.bert.encoder, False)

In [23]:
n_splits = 5
n_repeats = 2
group_count = 10
cv = regressor_stratified_cv(n_splits = n_splits, n_repeats = n_repeats,
                           group_count = group_count, random_state = 0, strategy = 'quantile')
X = input_ids
y = targets
i = 0
for train_index, test_index in cv.split(X, y):
    print('======== Iter {:}  ========'.format(i))
    train_inputs, test_inputs = X[train_index], X[test_index]
    train_targets, test_targets = y[train_index], y[test_index]
    train_masks, test_masks = attention_masks[train_index], attention_masks[test_index]
    train_type_ids, test_type_ids = token_type_ids[train_index], token_type_ids[test_index]
    
    train_inputs = torch.tensor(train_inputs, dtype=torch.long)
    test_inputs = torch.tensor(test_inputs, dtype=torch.long)
    train_targets = torch.tensor(train_targets, dtype=torch.float)
    test_targets = torch.tensor(test_targets, dtype=torch.float)
    train_masks = torch.tensor(train_masks, dtype=torch.long)
    test_masks = torch.tensor(test_masks, dtype=torch.long)
    train_type_ids = torch.tensor(train_type_ids, dtype=torch.long)
    test_type_ids = torch.tensor(test_type_ids, dtype=torch.long)
    
    train_data = TensorDataset(train_inputs, train_masks, train_type_ids, train_targets)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = BATCH_SIZE)

    test_data = TensorDataset(test_inputs, test_masks, test_type_ids, test_targets)
    test_sampler = RandomSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = BATCH_SIZE)
    
    model = BertForSequenceRegression.from_pretrained(BERT_FP)
    model.to(device)
    set_trainable(model, True)
    set_trainable(model.bert.embeddings, False)
    set_trainable(model.bert.encoder, False)
    optimizer = AdamW(model.parameters(),
                  lr = 5e-4,
                  eps = 1e-6 
                )
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)
    for epoch_i in range(0, epochs):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        # training
        model.train()
        tr_loss = []
        
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, type_ids, target = batch
            output = model(input_ids, input_mask, type_ids, target)
            loss = RMSELoss(output, target)
            tr_loss.append(loss.cpu().detach().numpy().tolist())
            loss.backward()  
            optimizer.step()
            optimizer.zero_grad()  
            scheduler.step()
            
        train_losses = np.mean(tr_loss)  
        print("Train loss: ", train_losses)
        all_targets, all_preds = [], []
        model.eval()   
        eval_loss = []
        # evaluation
        # disable gradients 
        with torch.no_grad(): 
            for batch in test_dataloader:
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, type_ids, target = batch
                output = model(input_ids, input_mask, type_ids, target)
                loss = RMSELoss(output, target)
            eval_loss.append(loss.cpu().detach().numpy().tolist())
            
        epoch_eval_loss = np.mean(eval_loss)
        print("Eval loss: ", epoch_eval_loss)
    i += 1    
    torch.cuda.empty_cache()



Some weights of the model checkpoint at ../input/bert-base-uncased were not used when initializing BertForSequenceRegression: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceRegression from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceRegression from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceRegression were not initialized from the model checkpoint at ../inp


Train loss:  0.7972164011337388
Eval loss:  0.6682190299034119

Train loss:  0.6967094733681477
Eval loss:  0.656562089920044

Train loss:  0.6625640123662814
Eval loss:  0.5700247287750244


Some weights of the model checkpoint at ../input/bert-base-uncased were not used when initializing BertForSequenceRegression: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceRegression from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceRegression from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceRegression were not initialized from the model checkpoint at ../inp


Train loss:  0.7894523588704391
Eval loss:  0.7137970924377441

Train loss:  0.6785855305866456
Eval loss:  0.5812800526618958

Train loss:  0.6748593159964387
Eval loss:  0.8483160138130188


Some weights of the model checkpoint at ../input/bert-base-uncased were not used when initializing BertForSequenceRegression: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceRegression from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceRegression from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceRegression were not initialized from the model checkpoint at ../inp


Train loss:  0.7707036523751809
Eval loss:  0.8255541324615479

Train loss:  0.6896791491709965
Eval loss:  0.6196920275688171

Train loss:  0.6628590687899523
Eval loss:  0.693461000919342


Some weights of the model checkpoint at ../input/bert-base-uncased were not used when initializing BertForSequenceRegression: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceRegression from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceRegression from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceRegression were not initialized from the model checkpoint at ../inp


Train loss:  0.8215976588323083
Eval loss:  0.7648919820785522

Train loss:  0.6786160494240236
Eval loss:  0.7503983974456787

Train loss:  0.6651355775309281
Eval loss:  0.7304264903068542


Some weights of the model checkpoint at ../input/bert-base-uncased were not used when initializing BertForSequenceRegression: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceRegression from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceRegression from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceRegression were not initialized from the model checkpoint at ../inp


Train loss:  0.77788383188382
Eval loss:  0.9693148136138916

Train loss:  0.6895771647842837
Eval loss:  0.7932246327400208

Train loss:  0.6565069216237941
Eval loss:  0.5975861549377441


Some weights of the model checkpoint at ../input/bert-base-uncased were not used when initializing BertForSequenceRegression: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceRegression from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceRegression from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceRegression were not initialized from the model checkpoint at ../inp


Train loss:  0.7841816275892123
Eval loss:  0.6953079104423523

Train loss:  0.6772546029426683
Eval loss:  0.526301383972168

Train loss:  0.6525075087245081
Eval loss:  0.8654531836509705


Some weights of the model checkpoint at ../input/bert-base-uncased were not used when initializing BertForSequenceRegression: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceRegression from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceRegression from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceRegression were not initialized from the model checkpoint at ../inp


Train loss:  0.7955405846448012
Eval loss:  0.9006986021995544

Train loss:  0.6857070624828339
Eval loss:  0.8065899610519409

Train loss:  0.6549597865259144
Eval loss:  0.9306551814079285


Some weights of the model checkpoint at ../input/bert-base-uncased were not used when initializing BertForSequenceRegression: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceRegression from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceRegression from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceRegression were not initialized from the model checkpoint at ../inp


Train loss:  0.8343961826512512
Eval loss:  0.6409093141555786

Train loss:  0.6988520269662561
Eval loss:  0.5136734843254089

Train loss:  0.6647941642244097
Eval loss:  0.6631618738174438


Some weights of the model checkpoint at ../input/bert-base-uncased were not used when initializing BertForSequenceRegression: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceRegression from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceRegression from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceRegression were not initialized from the model checkpoint at ../inp


Train loss:  0.7781297933887428
Eval loss:  0.722335934638977

Train loss:  0.6720603305689046
Eval loss:  0.6258094310760498

Train loss:  0.6681889337553105
Eval loss:  0.5139335989952087


Some weights of the model checkpoint at ../input/bert-base-uncased were not used when initializing BertForSequenceRegression: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceRegression from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceRegression from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceRegression were not initialized from the model checkpoint at ../inp


Train loss:  0.8155749431798156
Eval loss:  0.837369441986084

Train loss:  0.6980814635753632
Eval loss:  0.5661002397537231

Train loss:  0.6710099183337789
Eval loss:  0.8125994205474854


In [24]:
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

In [25]:
test["text"] = test["excerpt"].apply(lambda x: text_preprocessing(x))

In [26]:
excerpts = test.text.values

In [27]:
# convert sentences into tokens
input_ids = [tokenizer.encode(excerpt, add_special_tokens = True, max_length = MAX_LENGTH,
                              padding='max_length') for excerpt in excerpts]

input_ids = np.array(input_ids)
attention_masks = []
# create a mask of 1 for all input tokens and 0 for all padding tokens
attention_masks = [[float(i>0) for i in seq] for seq in input_ids]
attention_masks = np.array(attention_masks)
# create token type ids
token_type_ids = [[0 for i in seq] for seq in input_ids]
token_type_ids = np.array(token_type_ids)

In [28]:
input_ids = torch.tensor(input_ids, dtype=torch.long)
attention_masks = torch.tensor(attention_masks, dtype=torch.long)
token_type_ids = torch.tensor(token_type_ids, dtype=torch.long)

In [29]:
prediction_data = TensorDataset(input_ids, attention_masks, token_type_ids)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE)

In [30]:
model.eval()   
predictions , true_labels = [], []
# evaluation
for batch in prediction_dataloader:
    # disable gradients 
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, type_ids = batch
    with torch.no_grad():    
        output = model(input_ids, input_mask, type_ids) 
    output = output.cpu().detach().numpy().tolist()
    predictions += output

In [31]:
submission = pd.DataFrame({'id':test['id'],'target':predictions})

In [32]:
submission.to_csv('submission.csv',index=False)

In [33]:
torch.cuda.empty_cache()

In [34]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
# input_ids = []
# attention_masks = []
# token_type_ids = []

# for excerpt in excerpts:
#     encoded_dict = tokenizer.encode_plus(
#                         excerpt,                      
#                         add_special_tokens = True, 
#                         max_length = MAX_LENGTH,           
#                         padding = "max_length"
#                    )
    
       
#     input_ids.append(encoded_dict['input_ids'])
    
#     attention_masks.append(encoded_dict['attention_mask'])
    
#     token_type_ids.append(encoded_dict["token_type_ids"])