In [8]:
import os
import torch
import torch.nn as nn
from torchvision import transforms
from torch.nn import functional as F
# from efficientnet_pytorch.model import EfficientNet
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
from PIL import Image
import transformers
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs end to end on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# Read the test CSV into a DataFrame and preview its first five rows.
test_data_path = './test.csv'
df_test = pd.read_csv(filepath_or_buffer=test_data_path)
df_test.head(n=5)

Unnamed: 0,text
0,The Principality of Erfurt (; ) was a small st...
1,The name Cumania originated as the Latin exony...
2,Distracted driving refers to the act of drivin...
3,"Al-Sisiniyah (, also spelled Sisnyeh) is a tow..."
4,"Onan Orlando Thom (born April 11, 1984) is a G..."


In [6]:
# Plain Python list of the raw texts; reused later when the submission
# frame is assembled.
text_lst = df_test.text.values.tolist()

In [33]:
# Inference settings.
TEST_BATCH_SIZE = 16  # samples per DataLoader batch
MAX_LEN = 160         # passed to the tokenizer as max_length
# Tokenizer for the 'distilbert-base-uncased' checkpoint.
TOKENIZER = transformers.DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [34]:
class TestDataLoader:
    """Map-style dataset that tokenizes raw texts on demand for inference.

    Despite the name, this is the dataset object that gets wrapped by
    ``torch.utils.data.DataLoader``; it is not a loader itself.

    Args:
        text: Indexable collection of raw texts (anything supporting
            ``len()`` and integer indexing).
        tokenizer: Object exposing ``encode_plus``. Defaults to the
            notebook-level ``TOKENIZER``.
        max_len: Fixed token length of every encoded sample. Defaults to the
            notebook-level ``MAX_LEN``.
    """

    def __init__(self, text, tokenizer=None, max_len=None):
        self.text = text
        # Fall back to the notebook-level defaults so existing
        # ``TestDataLoader(text=...)`` calls keep working unchanged.
        self.tokenizer = TOKENIZER if tokenizer is None else tokenizer
        self.max_len = MAX_LEN if max_len is None else max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize the text at index ``item``.

        Returns:
            dict with ``"ids"`` (token ids) and ``"mask"`` (attention mask),
            both ``torch.long`` tensors built from the tokenizer output.
        """
        # Coerce to str (non-string cells such as NaN would otherwise break
        # ``split``) and collapse every run of whitespace to a single space.
        text = str(self.text[item])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # ``pad_to_max_length=True`` is deprecated; this is its explicit
            # replacement.
            padding="max_length",
            # Request truncation explicitly instead of relying on the
            # tokenizer's implicit fallback: every sample must have exactly
            # ``max_len`` tokens for the DataLoader to stack them into a batch.
            truncation=True,
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
        }

In [35]:
# Wrap the raw test texts in the tokenizing dataset.
test_dataset = TestDataLoader(text=df_test.text.values)

# Batch the dataset for inference using a single worker process.
test_data_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=TEST_BATCH_SIZE,
    num_workers=1,
)

In [36]:
def predict(data_loader, model, device):
    """Run ``model`` over every batch of ``data_loader`` and collect sigmoid scores.

    Args:
        data_loader: Sized iterable of batches; each batch is a mapping with
            ``"ids"`` and ``"mask"`` tensors.
        model: Callable module accepting ``input_ids`` and ``attention_mask``
            keyword arguments; element ``[0]`` of its return value is used.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A list with one entry per sample, in loader order. Each entry is the
        ``tolist()`` form of that sample's row of ``sigmoid(outputs[0])``.
    """
    model.eval()
    probabilities = []
    n_batches = len(data_loader)

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        for _, batch in tqdm(enumerate(data_loader), total=n_batches):
            input_ids = batch["ids"].to(device, dtype=torch.long)
            attention_mask = batch["mask"].to(device, dtype=torch.long)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)[0]

            # Bring the scores back to host memory as plain Python lists.
            batch_scores = torch.sigmoid(logits).cpu().detach().numpy().tolist()
            probabilities.extend(batch_scores)

    return probabilities

In [37]:
# Start from the pretrained DistilBERT classifier and replace its head with a
# single-logit linear layer (768 inputs -> 1 output).
model = transformers.DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.classifier = nn.Linear(768, 1)

# Load the fine-tuned weights onto the CPU first: without map_location a
# checkpoint saved from a GPU cannot be restored on a host without that GPU.
# The model is moved to the target device separately, later on.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
model.load_state_dict(torch.load('./src/model.bin', map_location='cpu'))

<All keys matched successfully>

In [38]:
# Configure the root logger at ERROR level before running inference.
import logging
logging.basicConfig(level=logging.ERROR)

# Move the model to the target device, then score every test batch.
model.to(device)
pred = predict(data_loader=test_data_loader, model=model, device=device)

100%|██████████| 18750/18750 [29:58<00:00, 10.43it/s]


In [57]:
# Pack the collected scores into a tensor and collapse it to one dimension:
# view(1, -1) lays all values out in a single row, squeeze(0) drops that row axis.
final_tensor = torch.Tensor(pred).view(1, -1).squeeze(0)

In [58]:
# Display the flattened score tensor as a quick sanity check.
final_tensor

tensor([9.5609e-07, 1.0000e+00, 5.7217e-07,  ..., 9.1127e-07, 1.0000e+00,
        1.0000e+00])

In [61]:
# Scores strictly above 0.5 become 1 and everything else 0, as an int32 NumPy array.
final_pred = final_tensor.gt(0.5).to(torch.int32).numpy()
final_pred

array([0, 1, 0, ..., 0, 1, 1], dtype=int32)

In [69]:
# Pair every input text with its 0/1 prediction, one row per sample.
submission_columns = {
    'text': text_lst,
    'label': final_pred,
}
submission = pd.DataFrame(data=submission_columns)

In [70]:
# Translate the 0/1 predictions into the submission's string labels.
# Values that are already strings pass through unchanged, so re-running this
# cell no longer rewrites every "unscrambled" row to "scrambled".
submission.label = submission.label.apply(
    lambda x: x if isinstance(x, str) else ("unscrambled" if x == 1 else "scrambled")
)

In [71]:
# Preview the first rows of the submission frame before writing it to disk.
submission.head()

Unnamed: 0,text,label
0,The Principality of Erfurt (; ) was a small st...,scrambled
1,The name Cumania originated as the Latin exony...,unscrambled
2,Distracted driving refers to the act of drivin...,scrambled
3,"Al-Sisiniyah (, also spelled Sisnyeh) is a tow...",scrambled
4,"Onan Orlando Thom (born April 11, 1984) is a G...",scrambled


In [72]:
# Write the submission to disk; index=False keeps the row index out of the file.
submission.to_csv(path_or_buf='submission.csv', index=False)