# Imports

In [17]:
import os
import sys
import zipfile
import copy

import seaborn as sns
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm

In [18]:
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


from keras.layers import Input, Dense, Flatten,Activation
from keras.layers import Conv1D, MaxPooling1D,Dropout,LSTM
from keras.layers import Embedding
from keras.models import Model

from numpy import zeros
from numpy import asarray

from keras.preprocessing.text import Tokenizer

from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential

from keras.layers import Dense
from keras.layers import Flatten

from keras.layers.embeddings import Embedding
import tensorflow

In [2]:
PROJECT_PATH = '/Users/alexanderbaranof/Documents/dailydialog-topic-recommendations/'

In [3]:
sys.path.append(os.path.join(PROJECT_PATH, 'src'))

In [4]:
from data.utils import download

# Load data

In [13]:
df = pd.read_csv('../data/processed/lda_results.csv', index_col=0)

In [14]:
df.head(2)

Unnamed: 0,corpus,label
10,What kind of food do you like ? I like Chine...,3
69,"I passed all the tests , Mom . Well done !",3


In [11]:
# load stop list for chunks phrases

bdf_nc = pd.read_csv('../data/processed/stoplist.csv', index_col=0)

In [12]:
bdf_nc.head(2)

Unnamed: 0,index,word_count
0,i,52433
1,you,47824


The original task is "про топик рекомендейшон
Взять датасет daily dailogs (http://yanran.li/dailydialog.html), выделить в нем с помощью spacy noun chunks фразы-объекты, отфильтровать слишком частотные (не характеризующие тему диалога). Далее обучить модельку, которая предсказывает следующую сущность в диалоге, которую можно обсудить. В связи с большой вариативностью возможных сущностей, рекомендуется взять только часть диалогов по определенной теме (например, выделить с помощью ключевых слов)."

let's perform the decomposition of the problem
- cluster dialogs and highlight one topic - done
- to build a generative model for predicting the trace of an entity in a conversation

how do I understand the task:
there are several noun chunks in our text and I should to predict the following


на русском:
в нашем тексте есть несколько noun chunks и я должен предсказать следующую

# Build model

let's try to train the simplest model that will teach sequences of nouns from texts

let's extract all the sequences from the texts

In [20]:
parser = spacy.load('en_core_web_sm')

In [42]:
corpus_nc = dict()

for i, dialog in tqdm(enumerate(df.corpus.tolist())):
    document = parser(dialog)
    corpus_nc[i] = list()
    for nc in document.noun_chunks:
        corpus_nc[i].append(nc)
    
    corpus_nc[i] = [word.text for word in corpus_nc[i]]
    corpus_nc[i] = ['BEGIN'] + corpus_nc[i] + ['END']

2242it [00:43, 51.59it/s]


In [43]:
corpus_nc[0]

['BEGIN',
 'What kind',
 'food',
 'you',
 'I',
 'Chinese food',
 'your American',
 'We',
 'a lot',
 'Chinese restaurants',
 'America',
 'END']

In [44]:
max_len = 0
for sent in corpus_nc:
    if len(corpus_nc[sent]) > max_len:
        max_len = len(corpus_nc[sent])

In [45]:
max_len

228

In [46]:
new_sentences = list()

for sent in corpus_nc:
    new_sentences.append(corpus_nc[sent])

In [47]:
generated_sentences = list()
target_of_generated_sentences = list()
for i, sent in enumerate(new_sentences):
    for j in range(len(sent)-1):
        if j == 0:
            generated_sentences.append(sent[j])
        else:
            generated_sentences.append(sent[:j+1])
        target_of_generated_sentences.append(sent[j+1])

In [63]:
generated_sentences[:5]

['BEGIN',
 ['BEGIN', 'What kind'],
 ['BEGIN', 'What kind', 'food'],
 ['BEGIN', 'What kind', 'food', 'you'],
 ['BEGIN', 'What kind', 'food', 'you', 'I']]

In [49]:
# nice

In [50]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(generated_sentences)

vocab_size = len(tokenizer.word_index) + 1
word_index = tokenizer.word_index

In [51]:
encoded_docs = tokenizer.texts_to_sequences(generated_sentences)

In [57]:
encoded_docs[:5]

[[4], [4, 18], [4, 18, 148], [4, 18, 148, 2], [4, 18, 148, 2, 1]]

In [64]:
padded_docs = pad_sequences(encoded_docs, maxlen=max_len, padding='post')

In [65]:
len(padded_docs)

73806

In [66]:
le = LabelEncoder()
target_of_generated_sentences = le.fit_transform(target_of_generated_sentences)

In [67]:
different_labels = len(pd.unique(target_of_generated_sentences))

In [68]:
different_labels

18911

In [69]:
model = Sequential()
embedding_layer = Embedding(vocab_size, 50, input_length=max_len)
sequence_input = Input(shape=(max_len,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(1024, 10, activation='relu')(embedded_sequences)
x = MaxPooling1D(2)(x)
x = Conv1D(512, 5, activation='relu')(x)
x = MaxPooling1D(2)(x)
x = Conv1D(256, 3, activation='relu')(x)
x = MaxPooling1D(4)(x)
x = Flatten()(x)
x = Dense(different_labels*2, activation='relu')(x)
preds = Dense(different_labels, activation='softmax')(x)

model = Model(sequence_input, preds)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

print(model.summary())

model.fit(padded_docs, target_of_generated_sentences, epochs=500, verbose=1, batch_size=2048)


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 228)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 228, 50)           900900    
_________________________________________________________________
conv1d (Conv1D)              (None, 219, 1024)         513024    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 109, 1024)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 105, 512)          2621952   
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 52, 512)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 50, 256)           393472

KeyboardInterrupt: 