In [3]:
import time
import torch
import pickle
import warnings
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torchtext.data import Field, BucketIterator, TabularDataset, Iterator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [4]:
# Colab-only step: mount Google Drive at /content/drive so the CSV files under
# "My Drive" can be read and written by path in the cells below.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [5]:
# Reproducibility: seed torch and numpy and force deterministic cuDNN kernels.
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(0)

# Display options: never truncate cell text and show up to 1000 rows.
# `None` is the supported "no limit" value; the old `-1` sentinel is deprecated.
pd.set_option('display.max_colwidth', None)
pd.set_option("display.max_rows", 1000)

# Use the first GPU when one is present; otherwise fall back to the CPU so the
# notebook still runs on a CPU-only runtime instead of failing in set_device.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    torch.cuda.set_device(0)
else:
    device = torch.device("cpu")

# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas chained-assignment and scikit-learn convergence warnings.
warnings.filterwarnings('ignore')

  


# ____________________

In [7]:
# Labelled training table; the preview further down shows FULLNAME, NATION and GENDER columns.
data = pd.read_csv('/content/drive/My Drive/data_train.csv')

In [8]:
# Second table; its preview later in the notebook shows only a FULLNAME column (no labels).
data_test = pd.read_csv('/content/drive/My Drive/data_test_.csv')

In [9]:
# Gender baseline: logistic regression over character-trigram counts of the full name.
vectorizer = CountVectorizer(analyzer='char', ngram_range=(3, 3))
gender_train = data.GENDER.values
# Fit the trigram vocabulary on the raw names and vectorize them in one step.
X_train = vectorizer.fit_transform(data.FULLNAME.values)
gender_model = LogisticRegression()
gender_model.fit(X_train, gender_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
# Sanity check of the gender model on one hand-written name; [0] unwraps the
# single prediction from the returned array.
X = vectorizer.transform(['Павлюк Екатерина Владимировна'])
gender_model.predict(X)[0]

'ж'

In [11]:
# Preview the training rows as loaded, before any preprocessing of FULLNAME.
data.head()


Unnamed: 0,FULLNAME,NATION,GENDER
0,СТОПНИЦКИЙ БЕРЕК ГЕРШОВИЧ,ЕВРЕЙ,м
1,МИЛЛЕР МАРИЯ ЯКОВЛЕВНА,НЕМЕЦ,ж
2,FAIZOV FATTAH,ТАТАРИН,м
3,ЗАНИН МАКСИМ ЯКОВЛЕВИЧ,РУССКИЙ,м
4,МАЛЬЦЕВ ЯКОВ КОНДРАТЬЕВИЧ,РУССКИЙ,м


In [12]:
def split_fio(s):
    """Turn a full name into a space-separated character sequence.

    Every space in ``s`` is first replaced by ``#`` so that word boundaries
    survive as their own symbol; then all characters are joined with single
    spaces, e.g. ``'AB CD'`` -> ``'A B # C D'``. An empty string stays empty.
    """
    marked = s.replace(' ', '#')
    return ' '.join(marked)

In [13]:
# Rewrite every training name as space-separated characters via split_fio.
# Not idempotent: running this cell twice would split the already-split text again.
data['FULLNAME'] = data['FULLNAME'].apply(split_fio)

In [14]:
# Preview FULLNAME after split_fio: one character per token, '#' marking the original spaces.
data.head()

Unnamed: 0,FULLNAME,NATION,GENDER
0,С Т О П Н И Ц К И Й # Б Е Р Е К # Г Е Р Ш О В И Ч,ЕВРЕЙ,м
1,М И Л Л Е Р # М А Р И Я # Я К О В Л Е В Н А,НЕМЕЦ,ж
2,F A I Z O V # F A T T A H #,ТАТАРИН,м
3,З А Н И Н # М А К С И М # Я К О В Л Е В И Ч,РУССКИЙ,м
4,М А Л Ь Ц Е В # Я К О В # К О Н Д Р А Т Ь Е В И Ч,РУССКИЙ,м


In [15]:
# Keep only the model input and target. `.copy()` makes this an independent
# frame, so later column assignments on `data` do not write into a slice of the
# original table (the chained-assignment / SettingWithCopyWarning case).
data = data[['FULLNAME', 'NATION']].copy()

In [16]:
# Replace each nationality string with an integer class id; the fitted
# `encoder` keeps the mapping between ids and the original labels.
encoder = LabelEncoder()
data['NATION'] = encoder.fit_transform(data['NATION'])

In [17]:
# Preview after label encoding: NATION now holds integer class ids.
data.head()

Unnamed: 0,FULLNAME,NATION
0,С Т О П Н И Ц К И Й # Б Е Р Е К # Г Е Р Ш О В И Ч,50
1,М И Л Л Е Р # М А Р И Я # Я К О В Л Е В Н А,114
2,F A I Z O V # F A T T A H #,141
3,З А Н И Н # М А К С И М # Я К О В Л Е В И Ч,128
4,М А Л Ь Ц Е В # Я К О В # К О Н Д Р А Т Ь Е В И Ч,128


In [18]:
# Number of distinct NATION class ids present in the data.
len(set(data.NATION))

188

In [19]:
# Same preview as the previous data.head() cell; the cell in between only reads `data`.
data.head()

Unnamed: 0,FULLNAME,NATION
0,С Т О П Н И Ц К И Й # Б Е Р Е К # Г Е Р Ш О В И Ч,50
1,М И Л Л Е Р # М А Р И Я # Я К О В Л Е В Н А,114
2,F A I Z O V # F A T T A H #,141
3,З А Н И Н # М А К С И М # Я К О В Л Е В И Ч,128
4,М А Л Ь Ц Е В # Я К О В # К О Н Д Р А Т Ь Е В И Ч,128


In [None]:
# Persist the fitted LabelEncoder to Drive — presumably so integer class ids
# can be mapped back to nationality names later without refitting.
# NOTE(review): the saved notebook shows no execution count for this cell —
# confirm label_enc.pkl on Drive matches the encoder fitted above.
with open('/content/drive/My Drive/label_enc.pkl', 'wb') as f:
    pickle.dump(encoder, f)

In [21]:
# Hold out 20% of the rows for validation; random_state fixes the shuffle so
# the split is repeatable.
train, val = train_test_split(data, test_size=0.2, random_state=42)

In [22]:
# Write both splits to Drive without the index column.
# Note: the validation frame is stored under the file name test.csv.
train.to_csv('/content/drive/My Drive/train.csv', index=False)
val.to_csv('/content/drive/My Drive/test.csv', index=False)

In [23]:
# Apply the same character-level split to the test names as to the training names.
# Not idempotent: running this cell twice would split the already-split text again.
data_test['FULLNAME'] = data_test['FULLNAME'].apply(split_fio)

In [24]:
# Preview the test names after split_fio.
data_test.head()

Unnamed: 0,FULLNAME
0,L U K ' J A N O V A # I R I N A # I G N A T ' E V N A
1,B A R M I N # M I H A I L # A L E K S E E V I C H
2,Б Е Р Д О В С К И Й # А Н Т О Н # И В А Н О В И Ч
3,B A R A B A S H # S E R G E J # I V A N O V I C H
4,З У Д Е Р М А Н # Я К О В # Я К О В Л Е В И Ч


In [25]:
# Save the preprocessed test names to Drive without the index column.
data_test.to_csv('/content/drive/My Drive/comp_test.csv', index=False)

In [26]:
def tokenize(x):
    """Split a string on single spaces and return the list of pieces."""
    return x.split(' ')

In [27]:
# TEXT: the name as a token sequence, split by `tokenize` and lowercased.
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
# LABEL: a single non-sequential target value used as-is (no vocabulary lookup),
# which fits the integer ids already stored in NATION.
LABEL = Field(sequential=False, use_vocab=False, is_target=True)

In [28]:
# Pair each dataset column with its Field, in the same order as the
# FULLNAME, NATION columns written to train.csv / test.csv.
nation_fields = [('FULLNAME', TEXT), ('NATION', LABEL)]

In [29]:
# Load the two CSV splits written earlier in the notebook. The validation split
# is read from test.csv (that file holds the `val` frame, not a separate test set).
# skip_header=True drops the column-name row of each file.
trn, vld = TabularDataset.splits(path='/content/drive/My Drive/',
                                 train='train.csv',
                                 validation="test.csv",
                                 format='csv',
                                 skip_header=True,
                                 fields=nation_fields)

In [30]:
# Build the token vocabulary from the training split only.
TEXT.build_vocab(trn)

In [31]:
# Ten most frequent tokens in the vocabulary with their counts.
TEXT.vocab.freqs.most_common(10)

[('#', 1668122),
 ('a', 1098630),
 ('i', 1076177),
 ('и', 1075400),
 ('а', 1005061),
 ('v', 857512),
 ('в', 857228),
 ('o', 796645),
 ('о', 796621),
 ('e', 712919)]

In [33]:
# Token -> integer index mapping of the vocabulary.
TEXT.vocab.stoi

defaultdict(<function torchtext.vocab._default_unk_index>,
            {'"': 72,
             '#': 2,
             '$': 71,
             "'": 44,
             '(': 64,
             ')': 63,
             '*': 73,
             ',': 75,
             '-': 56,
             '.': 60,
             '/': 68,
             '0': 67,
             '1': 77,
             '2': 76,
             '3': 78,
             '4': 79,
             '8': 74,
             '9': 80,
             ':': 65,
             '<': 69,
             '<pad>': 1,
             '<unk>': 0,
             '=': 81,
             '>': 70,
             '?': 62,
             'a': 3,
             'b': 42,
             'c': 23,
             'd': 31,
             'e': 11,
             'f': 40,
             'g': 35,
             'h': 15,
             'i': 4,
             'j': 28,
             'k': 22,
             'l': 19,
             'm': 29,
             'n': 14,
             'o': 9,
             'p': 38,
             'r': 16,
             's

In [34]:
# Examples per batch, used for both the train and validation iterators.
batch_size = 256

In [35]:
# Create the train and validation iterators, both with `batch_size` examples per
# batch and tensors placed on `device`.
# sort_key is the length of each example's FULLNAME (presumably its token count —
# confirm); sorting inside each batch is switched off.
# NOTE(review): repeat=False presumably means one pass over the data per
# iteration — confirm against the torchtext version in use.
train_iter, val_iter = BucketIterator.splits((trn, vld),
                                             batch_sizes=(batch_size, batch_size),
                                             device=device,
                                             sort_key=lambda x: len(x.FULLNAME),
                                             sort_within_batch=False,
                                             repeat=False)