In [2]:
import time
import torch
import pickle
import warnings
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torchtext.data import Field, BucketIterator, TabularDataset, Iterator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [3]:
# Colab-only: mount Google Drive under /content/drive so the dataset and
# checkpoint paths used throughout the notebook resolve. Prompts for an
# authorization code on first use.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [4]:
# Reproducibility: seed torch and numpy and force deterministic cuDNN kernels.
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(0)

# Show full cell contents in DataFrame output. `None` is the current spelling
# of "no limit"; pandas releases that predate it only accept the legacy `-1`,
# so fall back to that when `None` is rejected.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
pd.set_option("display.max_rows", 1000)

# Use the first GPU when one is present; otherwise fall back to the CPU
# instead of raising inside torch.cuda.set_device.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    torch.cuda.set_device(0)
else:
    device = torch.device("cpu")

warnings.filterwarnings('ignore')

  


# ____________________

In [5]:
data = pd.read_csv('/content/drive/My Drive/data_train.csv')

In [6]:
data_test = pd.read_csv('/content/drive/My Drive/data_test_.csv')

In [7]:
# Gender baseline: character trigram counts of the raw full name fed into a
# logistic regression. Must run before FULLNAME is rewritten by split_fio.
gender_train = data.GENDER.values
vectorizer = CountVectorizer(analyzer='char', ngram_range=(3, 3))
X_train = vectorizer.fit_transform(data.FULLNAME.values)
gender_model = LogisticRegression()
gender_model.fit(X_train, gender_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [8]:
X = vectorizer.transform(['Павлюк Екатерина Владимировна'])
gender_model.predict(X)[0]

'ж'

In [9]:
data.head()


Unnamed: 0,FULLNAME,NATION,GENDER
0,СТОПНИЦКИЙ БЕРЕК ГЕРШОВИЧ,ЕВРЕЙ,м
1,МИЛЛЕР МАРИЯ ЯКОВЛЕВНА,НЕМЕЦ,ж
2,FAIZOV FATTAH,ТАТАРИН,м
3,ЗАНИН МАКСИМ ЯКОВЛЕВИЧ,РУССКИЙ,м
4,МАЛЬЦЕВ ЯКОВ КОНДРАТЬЕВИЧ,РУССКИЙ,м


In [10]:
def split_fio(s):
    """Rewrite a full name as a space-separated sequence of characters.

    Every space in the input is replaced by the marker ``#`` and then a single
    space is placed between each pair of adjacent characters, e.g.
    ``'AB C'`` becomes ``'A B # C'``. An empty string is returned unchanged.
    """
    marked = s.replace(' ', '#')
    return ' '.join(marked)

In [11]:
data.FULLNAME = data.FULLNAME.apply(split_fio)

In [12]:
data.head()

Unnamed: 0,FULLNAME,NATION,GENDER
0,С Т О П Н И Ц К И Й # Б Е Р Е К # Г Е Р Ш О В И Ч,ЕВРЕЙ,м
1,М И Л Л Е Р # М А Р И Я # Я К О В Л Е В Н А,НЕМЕЦ,ж
2,F A I Z O V # F A T T A H #,ТАТАРИН,м
3,З А Н И Н # М А К С И М # Я К О В Л Е В И Ч,РУССКИЙ,м
4,М А Л Ь Ц Е В # Я К О В # К О Н Д Р А Т Ь Е В И Ч,РУССКИЙ,м


In [13]:
# Keep only the model input and the target. `.copy()` makes the result an
# independent frame, so assigning to its columns afterwards does not write
# into a slice of the original frame.
data = data[['FULLNAME', 'NATION']].copy()

In [14]:
# Replace the NATION labels with integer class ids; the fitted encoder is kept
# so that ids can be mapped back to labels.
encoder = LabelEncoder()
data['NATION'] = encoder.fit_transform(data['NATION'])

In [15]:
data.head()

Unnamed: 0,FULLNAME,NATION
0,С Т О П Н И Ц К И Й # Б Е Р Е К # Г Е Р Ш О В И Ч,50
1,М И Л Л Е Р # М А Р И Я # Я К О В Л Е В Н А,114
2,F A I Z O V # F A T T A H #,141
3,З А Н И Н # М А К С И М # Я К О В Л Е В И Ч,128
4,М А Л Ь Ц Е В # Я К О В # К О Н Д Р А Т Ь Е В И Ч,128


In [16]:
len(set(data.NATION))

188

In [17]:
data.head()

Unnamed: 0,FULLNAME,NATION
0,С Т О П Н И Ц К И Й # Б Е Р Е К # Г Е Р Ш О В И Ч,50
1,М И Л Л Е Р # М А Р И Я # Я К О В Л Е В Н А,114
2,F A I Z O V # F A T T A H #,141
3,З А Н И Н # М А К С И М # Я К О В Л Е В И Ч,128
4,М А Л Ь Ц Е В # Я К О В # К О Н Д Р А Т Ь Е В И Ч,128


In [18]:
# Persist the fitted label encoder so a later session can translate predicted
# class ids back to NATION labels without refitting.
with open('/content/drive/My Drive/label_enc.pkl', 'wb') as f:
    pickle.dump(encoder, f)

In [19]:
train, val = train_test_split(data, test_size=0.2, random_state=42)

In [20]:
# Write both splits to Drive without the index column. Note that the
# validation split (`val`) is the one stored under the name 'test.csv'.
train.to_csv('/content/drive/My Drive/train.csv', index=None)
val.to_csv('/content/drive/My Drive/test.csv', index=None)

In [21]:
data_test.FULLNAME = data_test.FULLNAME.apply(split_fio)

In [22]:
data_test.head()

Unnamed: 0,FULLNAME
0,L U K ' J A N O V A # I R I N A # I G N A T ' E V N A
1,B A R M I N # M I H A I L # A L E K S E E V I C H
2,Б Е Р Д О В С К И Й # А Н Т О Н # И В А Н О В И Ч
3,B A R A B A S H # S E R G E J # I V A N O V I C H
4,З У Д Е Р М А Н # Я К О В # Я К О В Л Е В И Ч


In [23]:
data_test.to_csv('/content/drive/My Drive/comp_test.csv', index=None)

In [26]:
def tokenize(x):
    """Split the string on single space characters and return the pieces."""
    return x.split(' ')

In [28]:
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
LABEL = Field(sequential=False, use_vocab=False, is_target=True)

In [29]:
nation_fields = [('FULLNAME', TEXT), ('NATION', LABEL)]

In [31]:
# Load the saved CSV splits as torchtext datasets. The header row is skipped
# and columns are bound to fields in the order given by `nation_fields`.
# The file named "test.csv" serves as the validation set here.
trn, vld = TabularDataset.splits(path='/content/drive/My Drive/',
                                 train='train.csv',
                                 validation="test.csv",
                                 format='csv',
                                 skip_header=True,
                                 fields=nation_fields)

In [32]:
TEXT.build_vocab(trn)

In [33]:
TEXT.vocab.freqs.most_common(10)

[('#', 1668122),
 ('a', 1098630),
 ('i', 1076177),
 ('и', 1075400),
 ('а', 1005061),
 ('v', 857512),
 ('в', 857228),
 ('o', 796645),
 ('о', 796621),
 ('e', 712919)]

In [34]:
TEXT.vocab.stoi

defaultdict(<function torchtext.vocab._default_unk_index>,
            {'"': 72,
             '#': 2,
             '$': 71,
             "'": 44,
             '(': 64,
             ')': 63,
             '*': 73,
             ',': 75,
             '-': 56,
             '.': 60,
             '/': 68,
             '0': 67,
             '1': 77,
             '2': 76,
             '3': 78,
             '4': 79,
             '8': 74,
             '9': 80,
             ':': 65,
             '<': 69,
             '<pad>': 1,
             '<unk>': 0,
             '=': 81,
             '>': 70,
             '?': 62,
             'a': 3,
             'b': 42,
             'c': 23,
             'd': 31,
             'e': 11,
             'f': 40,
             'g': 35,
             'h': 15,
             'i': 4,
             'j': 28,
             'k': 22,
             'l': 19,
             'm': 29,
             'n': 14,
             'o': 9,
             'p': 38,
             'r': 16,
             's

In [35]:
batch_size = 256

In [36]:
# Batch iterators for the train and validation datasets, both using
# `batch_size` examples per batch and producing tensors on `device`.
# Examples are compared by the length of their FULLNAME token list (sort_key);
# batches are not additionally sorted internally, and each iterator makes a
# single pass per epoch (repeat=False).
train_iter, val_iter = BucketIterator.splits((trn, vld),
                                             batch_sizes=(batch_size, batch_size),
                                             device=device,
                                             sort_key=lambda x: len(x.FULLNAME),
                                             sort_within_batch=False,
                                             repeat=False)

In [37]:
class BatchWrapper:
    """Adapt an iterable of batch objects into an iterable of ``(x, y)`` pairs.

    Each batch produced by ``dl`` is unpacked by attribute name: the attribute
    called ``x_var`` becomes the first element of the pair and the attribute
    called ``y_var`` the second.
    """

    def __init__(self, dl, x_var, y_var):
        self.dl, self.x_var, self.y_var = dl, x_var, y_var

    def __len__(self):
        """Return the number of batches reported by the wrapped iterable."""
        return len(self.dl)

    def __iter__(self):
        """Yield one ``(x, y)`` tuple for every batch of the wrapped iterable."""
        for item in self.dl:
            yield getattr(item, self.x_var), getattr(item, self.y_var)

In [38]:
train_dl = BatchWrapper(train_iter, 'FULLNAME', 'NATION')
val_dl = BatchWrapper(val_iter, 'FULLNAME', 'NATION')

In [39]:
x, y = next(train_dl.__iter__())

In [40]:
x.shape

torch.Size([37, 256])

In [41]:
y.shape

torch.Size([256])

In [42]:
x.transpose(1, 0)[0]

tensor([47, 15,  3,  7,  9, 16,  9, 14, 22,  9,  7,  2,  3, 16, 22,  3, 31,  4,
        28,  2, 18, 11, 16, 35, 11, 11,  7,  4, 23, 15,  1,  1,  1,  1,  1,  1,
         1], device='cuda:0')

In [43]:
class GlobalMaxPooling(nn.Module):
    """Reduce a tensor by taking the maximum along one dimension.

    Args:
        dim: dimension to reduce over (default: the last one).
    """

    def __init__(self, dim=-1):
        # Zero-argument super(): `super(self.__class__, self)` recurses forever
        # as soon as this class is subclassed.
        super().__init__()
        self.dim = dim

    def forward(self, x):
        """Return the maximum values of `x` along `self.dim` (that dim is removed)."""
        return x.max(dim=self.dim)[0]

In [44]:
class SimpleBiLSTM(nn.Module):
    """Sequence classifier: embedding -> BiLSTM -> max over time -> two linear layers.

    Args:
        inp_dim: size of the input vocabulary (number of embedding rows).
        hidden_dim: LSTM hidden size per direction; also the width of the
            hidden linear layer.
        emb_dim: embedding dimension.
        output_dim: number of output logits per example.
    """

    def __init__(self, inp_dim, hidden_dim=700, emb_dim=300, output_dim=188):
        super().__init__()
        self.embedding = nn.Embedding(inp_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers=1, bidirectional=True)
        # The LSTM output is sequence-first, so pooling over dim 0 reduces time.
        self.pool = GlobalMaxPooling(dim=0)
        self.dropout = nn.Dropout(0.25, inplace=True)
        self.linear = nn.Linear(2 * hidden_dim, hidden_dim)
        self.relu = nn.ReLU(inplace=True)
        self.predictor = nn.Linear(hidden_dim, output_dim)

    def forward(self, seq):
        """Compute class logits.

        Args:
            seq: integer token ids of shape [seq_length, batch_size].

        Returns:
            Tensor of shape [batch_size, output_dim] with unnormalized scores.
        """
        x = self.embedding(seq)  # [seq_length, batch_size, emb_dim]
        # Feed the embeddings computed above instead of embedding `seq` a second time.
        x, _ = self.lstm(x)  # [seq_length, batch_size, num_dir * hid_size]
        x = self.pool(x)  # [batch_size, num_dir * hid_size]
        x = self.dropout(x)  # [batch_size, num_dir * hid_size]
        x = self.linear(x)  # [batch_size, hid_size]
        x = self.relu(x)  # [batch_size, hid_size]
        preds = self.predictor(x)  # [batch_size, output_size]
        return preds

In [45]:
# One embedding row per vocabulary entry. Move the network to the `device`
# chosen in the configuration cell rather than hard-coding CUDA.
model = SimpleBiLSTM(inp_dim=len(TEXT.vocab))
model.to(device)

SimpleBiLSTM(
  (embedding): Embedding(82, 300)
  (lstm): LSTM(300, 700, bidirectional=True)
  (pool): GlobalMaxPooling()
  (dropout): Dropout(p=0.25, inplace=True)
  (linear): Linear(in_features=1400, out_features=700, bias=True)
  (relu): ReLU(inplace=True)
  (predictor): Linear(in_features=700, out_features=188, bias=True)
)

In [46]:
opt = optim.Adam(model.parameters(), lr=1e-4)
loss_func = nn.CrossEntropyLoss()
num_epochs = 200

In [None]:
train_loss = []  # one loss value per training batch, across all epochs
val_f1 = []      # one macro-F1 value per validation batch, across all epochs

# Resume the best validation score of a previous run if one was saved.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# NOTE(review): only the score is restored here, not the matching weights, so
# a fresh model will not be checkpointed until it beats the stored score.
try:
    with open('/content/drive/My Drive/best_f1.pkl', 'rb') as f:
        best_f1 = pickle.load(f)
except (OSError, EOFError, pickle.UnpicklingError):
    # No usable saved score: start from zero. Anything else (including
    # KeyboardInterrupt) is allowed to propagate.
    best_f1 = 0
iter_without_rise = 0  # consecutive epochs without a new best score

for epoch in range(num_epochs):
    start_time = time.time()

    model.train()
    for x, y in tqdm_notebook(train_dl):
        # Clear gradients first so nothing left over from an interrupted
        # previous run leaks into this step.
        opt.zero_grad()
        preds = model(x)
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()
        train_loss.append(loss.item())

    model.eval()
    # No autograd graph is needed for evaluation.
    with torch.no_grad():
        for x, y in tqdm_notebook(val_dl):
            preds = model(x).max(1)[1]
            val_f1.append(f1_score(y.cpu().numpy(), preds.cpu().numpy(), average='macro'))

    # NOTE(review): this is the mean of per-batch macro-F1 scores, which is not
    # the same as macro-F1 over the whole validation set. Kept as is so the
    # value stays comparable with previously saved best_f1.pkl files.
    curr_f1 = np.mean(val_f1[-len(val_dl):]) * 100

    print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs, time.time() - start_time))
    print("training loss (in-iteration): \t{:.6f}".format(np.mean(train_loss[-len(train_dl):])))
    print("validation f-score: \t\t\t{:.2f} %".format(curr_f1))

    if curr_f1 > best_f1:
        best_f1 = curr_f1
        # Save the weights before the score, so an interruption between the
        # two writes never leaves a recorded score without its checkpoint.
        torch.save(model.state_dict(), '/content/drive/My Drive/best_model.pt')
        with open('/content/drive/My Drive/best_f1.pkl', 'wb') as f:
            pickle.dump(best_f1, f)
        iter_without_rise = 0
    else:
        iter_without_rise += 1

    # Early stopping after 10 epochs without improvement.
    if iter_without_rise >= 10:
        break

HBox(children=(FloatProgress(value=0.0, max=3241.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=811.0), HTML(value='')))


Epoch 1 of 200 took 697.800s
training loss (in-iteration): 	1.079810
validation f-score: 			26.38 %


HBox(children=(FloatProgress(value=0.0, max=3241.0), HTML(value='')))

KeyboardInterrupt: ignored

In [47]:
# Restore the best checkpoint. `map_location` remaps the stored tensors onto
# the configured device, so a checkpoint written on a GPU also loads elsewhere.
model.load_state_dict(torch.load('/content/drive/My Drive/best_model.pt', map_location=device))
# Switch to inference mode (disables dropout).
model.eval()

SimpleBiLSTM(
  (embedding): Embedding(82, 300)
  (lstm): LSTM(300, 700, bidirectional=True)
  (pool): GlobalMaxPooling()
  (dropout): Dropout(p=0.25, inplace=True)
  (linear): Linear(in_features=1400, out_features=700, bias=True)
  (relu): ReLU(inplace=True)
  (predictor): Linear(in_features=700, out_features=188, bias=True)
)

In [48]:
# Reload the label encoder that was pickled earlier in the notebook.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('/content/drive/My Drive/label_enc.pkl', 'rb') as f:
    encoder = pickle.load(f)

In [49]:
def predict(name, model=model):
    """Predict the nation and gender labels for a single full name.

    Args:
        name: full name as a plain string with space-separated parts.
        model: network producing nation logits; defaults to the module-level
            `model` object bound at the time this function was defined.

    Returns:
        Tuple ``(nation, gender)``: the nation label decoded through `encoder`
        and the label predicted by `gender_model`.
    """
    # Gender comes from the char n-gram model, which works on the raw name.
    gender_pred = gender_model.predict(vectorizer.transform([name]))[0]

    # Same preprocessing as the training data (split_fio), lowercased by hand.
    tokens = [TEXT.tokenize(split_fio(name).lower())]
    # Put the input wherever the model's weights live instead of assuming CUDA.
    inp = TEXT.numericalize(TEXT.pad(tokens)).to(next(model.parameters()).device)

    # Inference only: no autograd graph needed.
    with torch.no_grad():
        preds = model(inp)
    pred = preds.max(1)[1]
    return encoder.inverse_transform(pred.cpu().numpy())[0], gender_pred

In [50]:
predict('Масаков Фаддей Андреевич')

('РУССКИЙ', 'м')

In [51]:
test = TabularDataset(path='/content/drive/My Drive/comp_test.csv',
                      format='csv',
                      skip_header=True,
                      fields=[('FULLNAME', TEXT)])

In [52]:
# Iterator over the competition test set in batches of 1024 on `device`.
# Sorting and shuffling are both switched off; presumably this keeps batches
# in file order so predictions line up with the rows of the test CSV
# (TODO confirm against torchtext's Iterator behaviour).
test_iter = Iterator(test,
                     batch_size=1024,
                     sort=False,
                     shuffle=False,
                     device=device,
                     train=False,
                     repeat=False)

In [53]:
x = next(test_iter.__iter__())

In [54]:
x.FULLNAME

tensor([[19, 42, 41,  ..., 25,  3, 24],
        [33,  3, 12,  ..., 37, 31,  6],
        [22, 16, 17,  ..., 32,  4, 24],
        ...,
        [ 1,  1,  1,  ...,  1,  1,  1],
        [ 1,  1,  1,  ...,  1,  1,  1],
        [ 1,  1,  1,  ...,  1,  1,  1]], device='cuda:0')

In [55]:
def get_predict():
    """Run the module-level `model` over `test_iter` and collect class ids.

    Returns:
        List with one predicted class id per example, in iteration order.
    """
    predicts = list()
    # Inference only: skip building the autograd graph for every batch.
    with torch.no_grad():
        for batch in test_iter:
            preds = model(batch.FULLNAME).max(1)[1]
            predicts.extend(preds.cpu().numpy())
    return predicts

In [56]:
# NOTE(review): this rebinds the name `predict`, which until this cell referred
# to the single-name prediction function defined earlier; after this cell runs
# that function is no longer reachable under this name.
predict = get_predict()

In [57]:
test = pd.read_csv('/content/drive/My Drive/data_test_.csv')

In [58]:
test['NATION'] = encoder.inverse_transform(predict)

In [59]:
test.head(20)

Unnamed: 0,FULLNAME,NATION
0,LUK'JANOVA IRINA IGNAT'EVNA,РУССКИЙ
1,BARMIN MIHAIL ALEKSEEVICH,РУССКИЙ
2,БЕРДОВСКИЙ АНТОН ИВАНОВИЧ,ПОЛЯК
3,BARABASH SERGEJ IVANOVICH,РУССКИЙ
4,ЗУДЕРМАН ЯКОВ ЯКОВЛЕВИЧ,НЕМЕЦ
5,СВИНЦОВА АННА НИКОЛАЕВНА,РУССКИЙ
6,BARANOVSKAJA MARIJA JUZEFOVNA,ПОЛЯК
7,МАТВЕЕВ АЛЕКСЕЙ ВАСИЛЬЕВИЧ,РУССКИЙ
8,GRISHIN FEDOR FILIPPOVICH,РУССКИЙ
9,JABLONSKAJA MARTSELINA IOSIFOVNA,ПОЛЯК


In [61]:
test.to_csv('/content/drive/My Drive/submission.csv')