In [1]:
pip install gdown

Collecting gdown
  Downloading gdown-4.5.1.tar.gz (14 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: gdown
  Building wheel for gdown (pyproject.toml) ... [?25ldone
[?25h  Created wheel for gdown: filename=gdown-4.5.1-py3-none-any.whl size=14933 sha256=c5d87b827dcc89f40504b5eacfa78d0973529ae42a05cd3df6d14e2696cc9f44
  Stored in directory: /root/.cache/pip/wheels/3d/ec/b0/a96d1d126183f98570a785e6bf8789fca559853a9260e928e1
Successfully built gdown
Installing collected packages: gdown
Successfully installed gdown-4.5.1
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import gdown
url = 'https://drive.google.com/uc?id=166yBhZLRkYDY-xqw4m0P4Ioc4GyjUVdI'
output = 'modelka.zip'
gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=166yBhZLRkYDY-xqw4m0P4Ioc4GyjUVdI
To: /kaggle/working/modelka.zip
100%|██████████| 521M/521M [00:05<00:00, 93.0MB/s] 


'modelka.zip'

In [3]:
!unzip modelka.zip

Archive:  modelka.zip
  inflating: config.json             
  inflating: pytorch_model.bin       


In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
from torch.utils.data.sampler import SequentialSampler

import transformers
from transformers import XLMRobertaModel, XLMRobertaTokenizer, XLMRobertaConfig
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule

import re

In [6]:
MAX_LEN = 224
path = "./"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [7]:
def onehot(size, target):
    vec = torch.zeros(size, dtype=torch.float32)
    vec[target] = 1.
    return vec

class DatasetRetriever(Dataset):

    def __init__(self, df):
        self.texts = df['text'].values
        self.labels = df['lang'].values
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

    def get_tokens(self, text):
        encoded = self.tokenizer.encode_plus(text, add_special_tokens=True,max_length=MAX_LEN,pad_to_max_length=True)
        return encoded['input_ids'], encoded['attention_mask']

    def __len__(self):
        return self.labels.shape[0]

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = torch.tensor(int(self.labels[idx])).long()
        target = onehot(2, label)
        tokens, attention_mask = self.get_tokens(text)
        tokens, attention_mask = torch.tensor(tokens), torch.tensor(attention_mask)

        return target, tokens, attention_mask

In [8]:
class CustomRoberta(nn.Module):
    def __init__(self):
        super(CustomRoberta, self).__init__()
        self.num_labels = 2
        self.roberta = transformers.XLMRobertaModel.from_pretrained(path)
        self.dropout = nn.Dropout(p=0.3)
        self.linear = nn.Linear(
            in_features=self.roberta.pooler.dense.out_features*2,
            out_features=2,
        )
        

    def forward(self,
                input_ids=None,
                attention_mask=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None):

        o1, o2 = self.roberta(input_ids,
                               attention_mask=attention_mask,
                               position_ids=position_ids,
                               head_mask=head_mask,
                               inputs_embeds=inputs_embeds)
        apool = torch.mean(o1, 1)
        mpool, _ = torch.max(o1, 1)
        x = torch.cat((apool, mpool), 1)
        x = self.dropout(x)
        return self.linear(x)

In [9]:
model = CustomRoberta()

Some weights of the model checkpoint at ./ were not used when initializing XLMRobertaModel: ['linear.bias', 'linear.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
model.load_state_dict(torch.load("./pytorch_model.bin", map_location=device))

<All keys matched successfully>

In [11]:
def eval(testloader, model):
    model.eval()
    model = model.to(device)
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    counter = 0
    for step, (targets, inputs, attention_masks) in enumerate(testloader):
        with torch.no_grad():
            inputs = inputs.to(device)
            attention_masks = attention_masks.to(device)
            targets = targets.to(device)
            output = model(inputs, attention_masks)
            correct_predictions += (output.argmax(1) == targets.argmax(1)).type(torch.float).sum().item()
            total_predictions += len(targets)
    print(f"Accuracy: {correct_predictions / total_predictions}")

In [12]:
data = pd.read_csv("../input/datalang/test_data.csv")
data['lang'] = np.where(data['lang'] == 'kz', 0, 1)

In [13]:
test_set = DatasetRetriever(data)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=16, shuffle=True)

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [14]:
eval(test_loader, model)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Accuracy: 0.9994509265614276
