# Stage 1: preprocessing

In [1]:
import gensim
import numpy as np
import pandas as pd

from rus_preprocessing_udpipe import *
from tqdm import tqdm_notebook as tqdm

In [2]:
train_df = pd.read_csv('train_sq.tsv', sep=r'\t', header=None, engine='python')
train_df.columns = ['ID', 'Title', 'Content', 'Target']
train_df.dropna(inplace=True)

# fix index
train_df.index = pd.Index(range(len(train_df)))

# get class targets
for i in range(100):
    train_df[f'Class {i}'] = train_df['Target'].apply(lambda classes: str(i) in classes.split(',')).astype(np.int8)
    

test_df = pd.read_csv('test_sq.tsv', sep=r'\t', header=None, engine='python')
test_df.columns = ['ID', 'Title', 'Content']
test_df.dropna(inplace=True)

# fix index
test_df.index = pd.Index(range(len(test_df)))

In [3]:
standard_library.install_aliases()

# URL of the UDPipe model
udpipe_model_url = 'https://rusvectores.org/static/models/udpipe_syntagrus.model'
udpipe_filename = udpipe_model_url.split('/')[-1]

if not os.path.isfile(udpipe_filename):
    print('UDPipe model not found. Downloading...', file=sys.stderr)
    wget.download(udpipe_model_url)

print('\nLoading the model...', file=sys.stderr)
udpipe_model = Model.load(udpipe_filename)
process_pipeline = Pipeline(udpipe_model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')


Loading the model...


In [4]:
word2vec_model = gensim.models.KeyedVectors.load("model.model")

In [5]:
train_udp_titles = []
print('Processing train data...', file=sys.stderr)
for line in tqdm(train_df['Title'].values):
    res = unify_sym(line.strip())
    output = process(process_pipeline, text=res)
    train_udp_titles.append(output)

Processing train data...


HBox(children=(IntProgress(value=0, max=125931), HTML(value='')))




In [6]:
train_title_vec = []
for title in tqdm(train_udp_titles):
    words_vec = []
    for word in title:
        if word in word2vec_model:
            words_vec.append(word2vec_model.word_vec(word))
            
    train_title_vec.append(words_vec)

HBox(children=(IntProgress(value=0, max=125931), HTML(value='')))




# Stage 2: train model

In [7]:
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.preprocessing import sequence
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

SEED = 42

Using TensorFlow backend.


In [8]:
class F1(Callback):
    def on_train_begin(self, logs={}):
        self.val_f1s = []

    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(self.validation_data[0])))
        val_predict /= np.sum(val_predict, axis=1, keepdims=True)
        val_predict = np.where(val_predict > 0.1, 1, 0)
        val_targ = self.validation_data[1]
        
        _val_f1 = f1_score(val_targ, val_predict, average='samples')
        self.val_f1s.append(_val_f1)
        print ("— val_f1: {}".format(_val_f1))

        return
    
f1_metric = F1()
es = EarlyStopping()
mc = ModelCheckpoint('./model')

In [11]:
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(None, 300)))
model.add(LSTM(256, return_sequences=True))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64))
model.add(Dense(128, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(100, activation='sigmoid'))

model.compile(Adam(), loss='categorical_crossentropy')

In [12]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, None, 512)         1665024   
_________________________________________________________________
lstm_5 (LSTM)                (None, None, 256)         787456    
_________________________________________________________________
lstm_6 (LSTM)                (None, None, 128)         197120    
_________________________________________________________________
lstm_7 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_4 (Dense)              (None, 128)               8320      
_________________________________________________________________
dense_5 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_6 (Dense)              (None, 100)               12900     
Total para

In [20]:
maxlen = 0
for ind, i in enumerate(train_title_vec):
    maxlen = max(maxlen, len(i))
    
X_train = train_title_vec
for ind, i in enumerate(X_train):
    if len(i) < maxlen:
        while len(X_train[ind]) < maxlen:
            X_train[ind].append([0 for _ in range(300)])
    else:
        if len(i) > maxlen:
            X_train[ind] = X_train[ind][:maxlen]
            
X_train = np.asarray(X_train)

In [21]:
y_train = train_df.loc[:, [f'Class {i}' for i in range(100)]].values

In [22]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  random_state=SEED,
                                                  test_size=0.1)

In [23]:
model.fit(X_train, y_train,
          validation_data=(X_val, y_val),
          callbacks=[f1_metric, es, mc],
          batch_size=128,
          epochs=30)

W1120 13:03:55.587251 4563666368 deprecation.py:323] From /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W1120 13:04:43.145566 4563666368 deprecation_wrapper.py:119] From /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Train on 113337 samples, validate on 12594 samples
Epoch 1/30


  'precision', 'predicted', average, warn_for)


— val_f1: 0.03559707192389422
Epoch 2/30
— val_f1: 0.15444518118410447
Epoch 3/30
— val_f1: 0.2881502445818787
Epoch 4/30
— val_f1: 0.3485215620756354
Epoch 5/30
— val_f1: 0.38038609662001754
Epoch 6/30
— val_f1: 0.40424775760649984
Epoch 7/30
— val_f1: 0.42008788416601667
Epoch 8/30
— val_f1: 0.42941841124308966
Epoch 9/30
— val_f1: 0.4341091667318442


<keras.callbacks.History at 0x204e3e2b0>

# Stage 3: make predictions

In [24]:
test_udp_titles = []
print('Processing input...', file=sys.stderr)
for line in tqdm(test_df['Title'].values):
    res = unify_sym(line.strip())
    output = process(process_pipeline, text=res)
    test_udp_titles.append(output)

Processing input...


HBox(children=(IntProgress(value=0, max=31345), HTML(value='')))




In [25]:
test_title_vec = []
for title in tqdm(test_udp_titles):
    words_vec = []
    for word in title:
        if word in word2vec_model:
            words_vec.append(word2vec_model.word_vec(word))
            
    test_title_vec.append(words_vec)

HBox(children=(IntProgress(value=0, max=31345), HTML(value='')))




In [26]:
X_test = test_title_vec
for ind, i in enumerate(X_test):
    if len(i) < maxlen:
        while len(X_test[ind]) < maxlen:
            X_test[ind].append([0 for _ in range(300)])
    else:
        if len(i) > maxlen:
            X_test[ind] = X_test[ind][:maxlen]
            
X_test = np.asarray(X_test)

In [27]:
def get_pred(preds):
    result = []
    for tmp in preds:
        tmp /= np.sum(tmp)
        tmp = [ind for ind, x in enumerate(tmp) if x > 0.1]
        if len(tmp) == 0:
            tmp = [0]
        tmp = ','.join(map(str, tmp))
        result.append(tmp)
        
    return result

ids = list(test_df['ID'].astype(np.int32))
pred = model.predict(X_test)
pred = get_pred(pred)

# fix missing ids
ids_set = set(ids)
for i in range(126048, 126048 + 31512):
    if i not in ids_set:
        ids.append(i)
        pred.append('0')
        
submit = pd.DataFrame({'ID': ids,
                       'pred': pred})
submit.to_csv('submit.tsv', header=None, index=None, sep='\t')

In [28]:
submit.head()

Unnamed: 0,ID,pred
0,126048,11116
1,126049,210
2,126050,4
3,126051,1869
4,126052,37
