# Imports

In [15]:
import os
import sys
import zipfile
from string import punctuation

import seaborn as sns
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import gensim
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer

morph = MorphAnalyzer()


PROJECT_PATH = '/Users/alexanderbaranof/Documents/dailydialog-topic-recommendations/'
sys.path.append(os.path.join(PROJECT_PATH, 'src'))

from data.utils import download

# Constants

In [2]:
CORPUS_NAME = 'ijcnlp_dailydialog'
CORPUS_ZIP = 'ijcnlp_dailydialog.zip'

# Methods

In [146]:
def preprocessing(text: str, nlp) -> str:
    doc = nlp.make_doc(text)
    normalized_text = [word.text.strip(punctuation) for word in doc]
    normalized_text = [word.lower() for word in normalized_text]
    normalized_text = [word.replace(' ', '') for word in normalized_text]
    normalized_text = [word.replace('\t', '') for word in normalized_text]
    
    nlp.Defaults.stop_words.add('yes')
    nlp.Defaults.stop_words.add('like')
    nlp.Defaults.stop_words.add('know')
    nlp.Defaults.stop_words.add('think')
    nlp.Defaults.stop_words.add('right')
    nlp.Defaults.stop_words.add('oh')
    nlp.Defaults.stop_words.add('thank')
    nlp.Defaults.stop_words.add('good')
    nlp.Defaults.stop_words.add('ok')
    nlp.Defaults.stop_words.add('yeah')
    

    result_normalized_text = list()
    for word in normalized_text:
        if word not in nlp.Defaults.stop_words:
            result_normalized_text.append(word)
    result_normalized_text = ' '.join(result_normalized_text)
    
    return result_normalized_text.split()

In [185]:
def max_lda_topics(lda_topics):
    return pd.DataFrame(lda_topics).sort_values([1], ascending=False)[0].iloc[0]

# Checking data

In [147]:
# checking that directory exist 
corpus_dir = os.path.join(PROJECT_PATH, os.path.join('data/raw/', CORPUS_NAME))
print(corpus_dir)

corpus_zip_dir = os.path.join(PROJECT_PATH, os.path.join('data/raw/', CORPUS_ZIP))
print(corpus_zip_dir)

if os.path.exists(corpus_dir):
    print('Corpus is aviable!')
else:
    if os.path.exists(corpus_zip_dir):
        print('Only zip file aviable!')
        with zipfile.ZipFile(corpus_zip_dir, 'r') as zip_ref:
            zip_ref.extractall('../data/raw/')
        print('Unziped!')
    else:
        download(url='http://yanran.li/files/ijcnlp_dailydialog.zip',
                 file_name='../data/raw/ijcnlp_dailydialog.zip')
        with zipfile.ZipFile(corpus_zip_dir, 'r') as zip_ref:
            zip_ref.extractall('../data/raw/')
        print('Unziped!')

/Users/alexanderbaranof/Documents/dailydialog-topic-recommendations/data/raw/ijcnlp_dailydialog
/Users/alexanderbaranof/Documents/dailydialog-topic-recommendations/data/raw/ijcnlp_dailydialog.zip
Corpus is aviable!


# Lets build LDA model for select one topic

In [167]:
corpus = open(os.path.join(corpus_dir, 'dialogues_text.txt'), 'r')
corpus = corpus.read()
corpus = corpus.split('\n')

corpus = [dialog.split('__eou__') for dialog in corpus]

In [168]:
# load stop list for chunks phrases

bdf_nc = pd.read_csv('../data/processed/stoplist.csv', index_col=0)

In [169]:
corpus = [' '.join(doc) for doc in corpus]

In [170]:
corpus[0]

"The kitchen stinks .   I'll throw out the garbage .  "

In [171]:
nlp = spacy.load('en_core_web_sm') 

In [172]:
clear_corpus = list()
for doc in tqdm(corpus):
    clear_corpus.append(preprocessing(doc, nlp))

100%|██████████| 13119/13119 [00:04<00:00, 3235.35it/s]


In [173]:
clear_corpus[0]

['kitchen', 'stinks', 'll', 'throw', 'garbage']

In [174]:
# nice :)

In [175]:
ph = gensim.models.Phrases(clear_corpus, scoring='npmi', threshold=0.1)
#p = gensim.models.phrases.Phraser(ph)
ngrammed_texts = ph[clear_corpus]

In [176]:
ngrammed_texts[0]

['kitchen', 'stinks', 'll_throw', 'garbage']

In [177]:
dictinary = gensim.corpora.Dictionary(ngrammed_texts)

In [178]:
dictinary.filter_extremes(no_above=0.85, no_below=10)
dictinary.compactify()

In [179]:
print(dictinary)

Dictionary(5333 unique tokens: ['garbage', 'kitchen', 'cigarette', 'coffee', 'come']...)


In [180]:
gensim_corpus = [dictinary.doc2bow(text) for text in ngrammed_texts]

In [181]:
len(gensim_corpus)

13119

In [182]:
lda = gensim.models.LdaMulticore(gensim_corpus, 5, id2word=dictinary, passes=10)

In [183]:
lda.print_topics()

[(0,
  '0.046*"s" + 0.008*"need" + 0.008*"going" + 0.008*"time" + 0.007*"want" + 0.007*"ll" + 0.005*"sure" + 0.005*"let_s" + 0.004*"m" + 0.004*"look"'),
 (1,
  '0.044*"’_s" + 0.020*"’_m" + 0.020*"t" + 0.015*"don_’" + 0.011*"’_ll" + 0.008*"s" + 0.008*"’" + 0.007*"’_ve" + 0.007*"’_t" + 0.007*"sure"'),
 (2,
  '0.035*"s" + 0.010*"sir" + 0.009*"want" + 0.008*"m" + 0.006*"ll" + 0.006*"sure" + 0.006*"thanks" + 0.004*"time" + 0.004*"d" + 0.004*"way"'),
 (3,
  '0.019*"s" + 0.008*"company" + 0.007*"work" + 0.007*"job" + 0.005*"people" + 0.005*"time" + 0.005*"course" + 0.004*"tell" + 0.004*"sure" + 0.004*"business"'),
 (4,
  '0.016*"s" + 0.011*"time" + 0.008*"’_s" + 0.008*"ll" + 0.006*"m" + 0.006*"t" + 0.006*"work" + 0.005*"morning" + 0.005*"sorry" + 0.004*"look"')]

Topic 3 is good. Lets use it.

In [189]:
labels = list()
for text in tqdm(gensim_corpus):
    labels.append(max_lda_topics(lda.get_document_topics(text, minimum_probability=0.0)))

100%|██████████| 13119/13119 [00:13<00:00, 948.94it/s] 


In [191]:
pd.Series(labels).value_counts()

0    3253
2    2947
1    2558
3    2242
4    2119
dtype: int64

In [192]:
result_data = pd.DataFrame()
result_data['corpus'] = corpus
result_data['label'] = labels

In [193]:
result_data

Unnamed: 0,corpus,label
0,The kitchen stinks . I'll throw out the garb...,0
1,"So Dick , how about getting some coffee for to...",1
2,Are things still going badly with your housegu...,1
3,"Would you mind waiting a while ? Well , how ...",4
4,Are you going to the annual party ? I can give...,0
...,...,...
13114,Lindsay Tipping gave me your name as a referen...,3
13115,The consignments of chemical instruments have ...,3
13116,good morning . How can I help you ? I'd like...,1
13117,May I ask you a few questions about insurance ...,1


In [195]:
result_data[result_data.label == 3].to_csv('../data/processed/lda_results.csv')