In [None]:
import numpy as np
import pandas as pd

In [None]:
!pip install transformers==2.5.1

In [None]:
# Load the dataset CSV; later cells read its 'text' and 'lang' columns.
data = pd.read_csv("../input/kaz-rus/data.csv")

In [None]:
# Bare expression: renders the DataFrame as this cell's output for a quick look.
data

In [None]:
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
from torch.utils.data.sampler import SequentialSampler

import time
import random
from datetime import datetime
from tqdm import tqdm
tqdm.pandas()

from transformers import XLMRobertaModel, XLMRobertaTokenizer, XLMRobertaConfig
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule

import re



In [None]:
# Keep 60% of the rows for training and hold the rest out for evaluation.
# A fixed random_state makes the shuffle-and-split reproducible across kernel
# restarts, so accuracies from different runs are measured on the same rows.
train, test = train_test_split(data, train_size=0.6, random_state=42)


In [None]:
# Per-example token budget: passed to the tokenizer as max_length, with padding up to it.
MAX_LENGTH = 256

In [None]:
def onehot(size, target):
    """Build a float32 vector of length ``size`` holding 1.0 at ``target``.

    ``target`` is used directly as an index into a zero vector, so any value
    PyTorch accepts for index assignment (including negative indices) works.
    """
    encoded = torch.zeros(size, dtype=torch.float32)
    encoded[target] = 1.0
    return encoded

class DatasetRetriever(Dataset):
    """Map-style dataset that tokenizes one text per item for XLM-RoBERTa.

    Reads the ``text`` and ``lang`` columns of the given DataFrame. Each item
    is a ``(label, input_ids, attention_mask)`` triple.
    """

    def __init__(self, df):
        """Keep the raw column arrays and load the pretrained tokenizer.

        Args:
            df: DataFrame providing a ``text`` column and a ``lang`` column.
        """
        self.texts = df['text'].values
        self.labels = df['lang'].values
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

    def get_tokens(self, text):
        """Encode ``text`` and return ``(input_ids, attention_mask)``.

        Special tokens are added and the encoding is padded to ``MAX_LENGTH``.
        """
        encoded = self.tokenizer.encode_plus(text, add_special_tokens=True,max_length=MAX_LENGTH,pad_to_max_length=True)
        return encoded['input_ids'], encoded['attention_mask']

    def __len__(self):
        """Number of examples, taken from the label array."""
        return self.labels.shape[0]

    def __getitem__(self, idx):
        """Return ``(label, input_ids, attention_mask)`` for row ``idx``.

        The label is handed back unchanged; the training loop passes it to
        ``nn.CrossEntropyLoss``, which takes class indices, so no one-hot
        vector is built here.
        NOTE(review): assumes ``lang`` already holds integer class ids (0/1)
        — confirm against the CSV.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        tokens, attention_mask = self.get_tokens(text)
        return label, torch.tensor(tokens), torch.tensor(attention_mask)

In [None]:
class NNModel(nn.Module):
    """Encoder backbone followed by mean+max pooling and a 2-way linear head."""

    def __init__(self, backbone):
        """Wrap ``backbone`` and size the classifier from its pooler width.

        Args:
            backbone: module exposing ``pooler.dense.out_features`` and
                returning a 2-tuple whose first element is the per-position
                output.
        """
        super().__init__()
        self.backbone = backbone
        self.dropout = nn.Dropout(0.3)
        hidden_size = self.backbone.pooler.dense.out_features
        # Mean-pooled and max-pooled features are concatenated, hence twice the width.
        self.linear = nn.Linear(in_features=hidden_size * 2, out_features=2)

    def forward(self, input_ids, attention_masks):
        """Compute the two class logits for each row of ``input_ids``.

        NOTE(review): pooling runs over every position along dim 1, padded
        ones included, because ``attention_masks`` is only forwarded to the
        backbone — confirm this is intended.
        """
        # Unpacking doubles as a check that input_ids is exactly 2-D.
        _batch_size, _seq_len = input_ids.shape
        token_states, _pooled = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_masks,
        )
        mean_pooled = token_states.mean(dim=1)
        max_pooled = token_states.max(dim=1)[0]
        features = torch.cat((mean_pooled, max_pooled), dim=1)
        return self.linear(self.dropout(features))


# Load the pretrained XLM-RoBERTa weights. Building XLMRobertaModel directly
# from a config only creates the architecture with randomly initialised
# parameters, which leaves nothing pretrained to fine-tune.
backbone = XLMRobertaModel.from_pretrained("xlm-roberta-base")

In [None]:
# Attach the pooling + 2-way linear head (NNModel) on top of the backbone.
model = NNModel(backbone)

In [None]:
# Optimisation hyper-parameters.
lr = 1e-5
epoch = 10  # number of passes over the training loader

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Cross-entropy over the raw logits; AdamW updates every model parameter.
criteria = nn.CrossEntropyLoss()
optim = torch.optim.AdamW(model.parameters(), lr=lr)

def train_step(trainloader, model, e):
    """Run one training epoch over ``trainloader`` and print running accuracy.

    Relies on the notebook-level ``device``, ``optim`` and ``criteria`` objects.

    Args:
        trainloader: iterable yielding ``(targets, inputs, attention_masks)``
            batches.
        model: module called as ``model(inputs, attention_masks)``.
        e: epoch index, used only in the summary line.
    """
    model.train()
    model = model.to(device)
    correct_predictions = 0
    total_predictions = 0
    for step, (targets, inputs, attention_masks) in enumerate(trainloader):
        inputs = inputs.to(device)
        attention_masks = attention_masks.to(device)
        targets = targets.to(device)

        optim.zero_grad()
        output = model(inputs, attention_masks)
        loss = criteria(output, targets)
        loss.backward()
        optim.step()

        # Accuracy is measured on the logits produced before this step's update.
        correct_predictions += (output.argmax(1) == targets).type(torch.float).sum().item()
        total_predictions += len(targets)
        if step % 50 == 0:
            print(f"step: {step} {correct_predictions / total_predictions}")

    # A loader that yields no batches leaves the denominator at zero; report
    # NaN instead of raising ZeroDivisionError.
    accuracy = correct_predictions / total_predictions if total_predictions else float("nan")
    print("Epoch = [{}], accuracy = [{}]".format(e, accuracy))

In [None]:
def eval(testloader, model):
    """Print the accuracy of ``model`` over ``testloader`` without tracking gradients.

    Relies on the notebook-level ``device``. The name shadows Python's builtin
    ``eval``; it is kept because the evaluation cell calls it by this name.

    Args:
        testloader: iterable yielding ``(targets, inputs, attention_masks)``
            batches.
        model: module called as ``model(inputs, attention_masks)``.
    """
    model.eval()
    model = model.to(device)
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for step, (targets, inputs, attention_masks) in enumerate(testloader):
            inputs = inputs.to(device)
            attention_masks = attention_masks.to(device)
            targets = targets.to(device)
            output = model(inputs, attention_masks)
            correct_predictions += (output.argmax(1) == targets).type(torch.float).sum().item()
            total_predictions += len(targets)
            if step % 50 == 0:
                print(f"step: {step} {correct_predictions / total_predictions}")

    # A loader that yields no batches leaves the denominator at zero; report
    # NaN instead of raising ZeroDivisionError.
    accuracy = correct_predictions / total_predictions if total_predictions else float("nan")
    print(f"Validation: {accuracy}")

In [None]:
# Wrap the training split; constructing the dataset also loads the XLM-R tokenizer.
train_set = DatasetRetriever(train)

In [None]:
# Reshuffle the training examples every epoch so the optimiser does not see
# the exact same batches in the exact same order on each pass.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=16, shuffle=True)

In [None]:
# Train for `epoch` full passes; each call prints its own running and final accuracy.
for i in range(epoch):
    train_step(train_loader, model, i)

In [None]:
from IPython.display import FileLink

# Write the weights before building the link: on a fresh Run All this cell
# executes before the dedicated save cell, so without this the link would
# point at a file that does not exist yet.
torch.save(model.state_dict(), 'model.pt')
FileLink(r'model.pt')

In [None]:
# Build the held-out loader here: no visible cell defines `test_loader`, so a
# fresh Restart & Run All would otherwise stop with a NameError.
test_set = DatasetRetriever(test)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=16)
eval(test_loader, model)

In [None]:
# Destination file for the trained weights.
PATH = "model.pt"

# Save only the state_dict returned by the model, not the module object itself.
torch.save(model.state_dict(), PATH)