In [1]:
import nltk
from nltk.corpus import treebank
from nltk.corpus.reader.conll import *
import re
import numpy as np
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train_data = "unamb_sent_14_6.conllu"
train_reader = ConllCorpusReader(root = '', fileids = [train_data], columntypes=['ignore', 'ignore', 'words', 'pos', 'chunk'])

In [3]:
sents = list(train_reader.iob_sents())
len(sents)*(3/4) 

28881.0

In [4]:
train_sents = sents[:28881]
test_sents = sents[28881: ]
print(len(train_sents))
print(len(test_sents))

28881
9627


In [5]:
X_train = [[word[0].lower()  for word in sent ] for sent in train_sents ]
X_test = [[word[0].lower()  for word in sent ] for sent in test_sents ]
Y_train = [[word[1]  for word in sent ] for sent in train_sents ]
Y_test = [[word[1]  for word in sent ] for sent in test_sents ]

#all words
words = list(set([word[0].lower() for sent in train_sents+test_sents for word in sent ]))

word2ind = {word: index for index, word in enumerate(words)}
ind2word = {index: word for index, word in enumerate(words)}

#all labels
labels = list(set([word[1] for sent in train_sents for word in sent ]))

label2ind = {label: (index + 1) for index, label in enumerate(labels)}
ind2label = {(index + 1): label for index, label in enumerate(labels)}

maxlen = max([len(sent) for sent in train_sents+test_sents])


In [6]:
print(len(labels))

14


In [7]:
#creating full sentences to fit them into tokenizer
train_sentences=[]
for i in X_train:
    s = ' '.join(w for w in i)
    train_sentences.append(s)

In [8]:
#add padding
train_sentences.append('pad')

In [9]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_sentences)
tokenizer

<keras.preprocessing.text.Tokenizer at 0x223cfebed68>

In [10]:
print(train_sentences[:10])

['« школа злословие » учить прикусить язык', 'сохраниться ли градус дискуссия в новый сезон ?', 'великолепный « школа злословие » вернуться в эфир после летний каникулы в новый формат .', 'в история программа это уже не первый « ребрендинг » .', 'писательница татьяна толстая и сценаристка дуня смирнова вроде бы не вполне соответствовать принятый на российский телевидение стандарт телеведущая .', 'в остальной « школа злословие » представлять себя интервью ведущий с герой выпуск .', 'иногда и в самый дело не без злословие , а по больший часть – разговор « с придыхание » , например в программа с участие борис берман и ильдар жандарев , с который чем далёкий , тем большой « родство » наблюдаться у ведущий « школа … » .', 'потом проект переехать с « культура » на нтв .', 'это помимо явный перемена в вид тут же появившийся рекламный блок , отсутствовавший на « культура » , позволить , с один сторона , расширить круг гость , с другой – изменить тон разговор .', 'набор герой программа расширит

In [11]:
sequences = tokenizer.texts_to_sequences(train_sentences)
print(sequences[:10])

[[5, 294, 4897, 6, 2284, 251], [3009, 4028, 2388, 1, 44, 684], [5, 294, 4897, 6, 496, 1, 1831, 62, 1328, 1, 44, 1193], [1, 120, 135, 38, 54, 7, 47, 5, 6], [4898, 1771, 2, 64, 7, 725, 1030, 955, 4, 52, 956, 1832], [1, 461, 5, 294, 4897, 6, 392, 65, 808, 4402, 8, 603, 1417], [497, 2, 1, 67, 76, 7, 85, 4897, 14, 11, 2001, 119, 94, 726, 5, 8, 6, 167, 1, 135, 8, 270, 1243, 2, 8, 13, 252, 654, 142, 5, 6, 1244, 32, 4402, 5, 294, 59, 6], [266, 123, 4029, 8, 5, 313, 6, 4], [38, 831, 3468, 2389, 1, 132, 203, 68, 4030, 1691, 2081, 4, 5, 313, 6, 1004, 8, 35, 114, 685, 1031, 8, 43, 94, 3710, 3227, 726], [2390, 603, 135, 67, 159, 99, 2, 5, 4897, 6, 46, 300]]


In [12]:
word_index = tokenizer.word_index

In [13]:
#has its own number now
pad_index= word_index.get('pad')

In [14]:
#remaking labels

def encode(x, n):
    result = np.zeros(n)
    result[x] = 1
    return result

Y_train_all=[]
for i in Y_train:
    for j in i:
        Y_train_all.append(j)
Y_train_toind=[label2ind[s] for s in Y_train_all]
Y_train_encode=[encode(int(i), 15) for i in Y_train_toind]

In [15]:
print(len(Y_train_encode))

350355


In [16]:
Y_test_all=[]
for i in Y_test:
    for j in i:
        Y_test_all.append(j)
Y_test_toind=[label2ind[s] for s in Y_test_all]
Y_test_encode=[encode(int(i), 15) for i in Y_test_toind]

In [17]:
print(len(Y_test_encode))

107228


In [18]:
embedding_dim=300
emb_path = 'wiki.ru.vec'

words = []

embeddings_index = {}
f = open(emb_path, encoding='utf-8')
for line in f:
    values = line.split()
    if len(values) == 301:
        word = values[0]
        words.append(word)
        coefs = np.asarray(values[1:(embedding_dim+1)], dtype='float32')
        embeddings_index[word] = coefs
f.close()


In [19]:
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [20]:
def get_window(word_list, k):
    windows = []
    hk = int((k-1)/2)
    for i in range(len(word_list)):
        if i<hk:
            window=[]
            j = 0
            while j<(hk-i):
                window.append('pad')
                j=j+1
            for ii in word_list[:i]:
                window.append(ii)
            for ii in word_list[i:(i+hk+1)]:
                window.append(ii)
            if len(window)<k:
                jj = len(window)
                while jj<k:
                    window.append('pad')
                    jj+=1
            windows.append(window)
        elif i>(len(word_list)-(hk+1)):
            window=[]
            for ii in word_list[(i-hk):i]:
                window.append(ii)
            for ii in word_list[i:]:
                window.append(ii)
            windows.append(window)
            if len(window)<k:
                jj = len(window)
                while jj<k:
                    window.append('pad')
                    jj+=1
        else:
            window=[]
            for ii in word_list[(i-hk):i]:
                window.append(ii)
            for ii in word_list[i:(i+hk+1)]:
                window.append(ii)
            windows.append(window)
    return windows


In [21]:
k=3

X_train_window=[get_window(v, k) for v in X_train]
X_train_windows =[]
for i in X_train_window:
    for j in i:
        X_train_windows.append(j)

X_train_encode=[]
for s in X_train_windows:
    l=[]
    for i in s:
        if word_index.get(i):
            l.append(word_index.get(i))
        else:
            l.append(int(pad_index))
    X_train_encode.append(l)

In [22]:
print(len(X_train_encode))

350355


In [23]:
X_test_window=[get_window(v, k) for v in X_test]
X_test_windows =[]
for i in X_test_window:
    for j in i:
        X_test_windows.append(j)

X_test_encode=[]
for s in X_test_windows:
    l=[]
    for i in s:
        if word_index.get(i):
            l.append(word_index.get(i))
        else:
            l.append(int(pad_index))
    X_test_encode.append(l)

In [24]:
print(len(X_test_encode))

107228


In [25]:
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, Flatten
from keras.layers.wrappers import TimeDistributed
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding

In [26]:
Y_train_pad = pad_sequences(Y_train_encode, maxlen=15)
X_train_pad = pad_sequences(X_train_encode, maxlen=k)

In [27]:
Y_test_pad = pad_sequences(Y_test_encode, maxlen=15)
X_test_pad = pad_sequences(X_test_encode, maxlen=k)

In [28]:
batch_size = 32
nb_epoch = 6


model = Sequential()
model.add(Embedding(len(word_index) + 1,
                            300,
                            weights=[embedding_matrix],
                            input_length=k,
                            trainable=False))
model.add(Flatten())
model.add(Dense(15, activation = 'softmax'))
#model.add(Dense(15))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_pad, Y_train_pad, epochs=nb_epoch, batch_size=batch_size,  validation_split=0.1)

Train on 315319 samples, validate on 35036 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x2245c898c88>

In [29]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 3, 300)            10524600  
_________________________________________________________________
flatten_1 (Flatten)          (None, 900)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 15)                13515     
Total params: 10,538,115
Trainable params: 13,515
Non-trainable params: 10,524,600
_________________________________________________________________


In [30]:
score = model.evaluate(X_test_pad, Y_test_pad)

print('Test score:', score[0])
print('Test accuracy:', score[1])

Test accuracy: 0.8902711978281349


In [31]:
batch_size = 32
nb_epoch = 6


model = Sequential()
model.add(Embedding(len(word_index) + 1,
                            300,
                            weights=[embedding_matrix],
                            input_length=k,
                            trainable=False))
model.add(Flatten())
model.add(Dense(35, activation = 'softmax'))
model.add(Dense(15))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_pad, Y_train_pad, epochs=nb_epoch, batch_size=batch_size,  validation_split=0.1)

Train on 315319 samples, validate on 35036 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x2245ca4b240>

In [32]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 3, 300)            10524600  
_________________________________________________________________
flatten_2 (Flatten)          (None, 900)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 35)                31535     
_________________________________________________________________
dense_3 (Dense)              (None, 15)                540       
Total params: 10,556,675
Trainable params: 32,075
Non-trainable params: 10,524,600
_________________________________________________________________


In [33]:
score = model.evaluate(X_test_pad, Y_test_pad)

print('Test score:', score[0])
print('Test accuracy:', score[1])

Test accuracy: 0.04050248069534077


In [34]:
batch_size = 32
nb_epoch = 10


model = Sequential()
model.add(Embedding(len(word_index) + 1,
                            300,
                            weights=[embedding_matrix],
                            input_length=k,
                            trainable=False))
model.add(Flatten())
model.add(Dense(15, activation = 'relu'))
#model.add(Dense(15))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_pad, Y_train_pad, epochs=nb_epoch, batch_size=batch_size,  validation_split=0.1)

Train on 315319 samples, validate on 35036 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2245df39828>

In [35]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 3, 300)            10524600  
_________________________________________________________________
flatten_3 (Flatten)          (None, 900)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 15)                13515     
Total params: 10,538,115
Trainable params: 13,515
Non-trainable params: 10,524,600
_________________________________________________________________


In [36]:
score = model.evaluate(X_test_pad, Y_test_pad)

print('Test score:', score[0])
print('Test accuracy:', score[1])

Test accuracy: 0.0
