### Requests before dialogs

In [1]:
import glob, os
import re
import functools
import pickle

import pandas as pd
import numpy as np

import gensim
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.models import Phrases

import natasha
from natasha import NamesExtractor

import pymorphy2
morph = pymorphy2.MorphAnalyzer()

import nltk
from nltk.tokenize import RegexpTokenizer
nltk.download('wordnet')
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/alisa.khoroshavina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/alisa.khoroshavina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def detectLang(string): # TODO: remove ','
    stringRu = "".join(re.sub('([A-Za-z]*)', ' ', string).split())
    stringEn = "".join(re.sub('([А-Яа-я]*)', ' ', string).split())
    if stringRu >= stringEn:
        return 'RU'
    else:
        return 'EN'

In [3]:
#lemmatization
def lemm(word):
    if detectLang(word) == 'EN':
        return nltk.stem.WordNetLemmatizer().lemmatize(word.lower())
    else:
        return morph.parse(word.lower())[0].normal_form

In [4]:
def extractNames(text):
    extractor = NamesExtractor()
    matches = extractor(text)
    names = []
    for match in matches:
        start, stop = match.span
        names += [lemm(name) for name in text[start:stop].split()]
    names = list(set(names)) # remove duplicates
    return names

In [5]:
def removeNum(string):
    return ''.join(re.sub('[0-9]*', '', string))

In [6]:
def nouns(word):
    p = morph.parse(word)[0]
    if 'NOUN' in p.tag:
        return True
    else:
        return False

In [7]:
noise = stopwords.words('russian') + stopwords.words('english')

with open('/home/alisa.khoroshavina/sw.txt', 'r', encoding='utf-8') as f:
    sw = f.readlines()
    
noise += sw

print(len(noise))

483


In [8]:
files = []
os.chdir("/home/alisa.khoroshavina/year.csv/")
for file in glob.glob("*.csv"):
    files.append(file)

In [9]:
len(files)

16

In [10]:
df = pd.read_csv(files[0], sep=',', names = list(range(0,5)), encoding='utf-8')
for i in range(1, len(files)):
    df = df.append(pd.read_csv(files[i], sep=',', names = list(range(0,5)), encoding='utf-8'))

In [11]:
#df.head()

In [12]:
#df.tail()

In [13]:
df.shape

(4042325, 5)

In [14]:
texts = list(df[3])

In [15]:
# preprocessing
tokenizer = RegexpTokenizer(r'\w+')
for i in range(len(texts)):
    texts[i] = removeNum(str(texts[i])).lower() 
    texts[i] = tokenizer.tokenize(texts[i]) 

In [16]:
txt = [[lemm(token) for token in doc if (nouns(token)) and (len(token) > 3)] for doc in texts]

docs = [[token for token in doc if token not in noise] for doc in txt]

In [17]:
"""
# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)
"""

"\n# Add bigrams and trigrams to docs (only ones that appear 20 times or more).\nbigram = Phrases(docs, min_count=20)\nfor idx in range(len(docs)):\n    for token in bigram[docs[idx]]:\n        if '_' in token:\n            # Token is a bigram, add to document.\n            docs[idx].append(token)\n"

In [18]:

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

# BOW representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]
#gensim.corpora.Dictionary.save(dictionary, 'dictionary.dict')

print('file %d' %i)                            
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))     



file 4042324
Number of unique tokens: 7748
Number of documents: 4042325


In [19]:
# Train LDA model.
num_topics = 42
chunksize = 2000
passes = 40
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics

CPU times: user 4d 4h 3min 30s, sys: 6h 39min 28s, total: 4d 10h 42min 59s
Wall time: 7h 56min 12s


In [20]:
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)         



Average topic coherence: -14.4681.
[([(0.3378162, 'группа'),
   (0.13179298, 'вопрос'),
   (0.1046485, 'видео'),
   (0.09667732, 'музыка'),
   (0.08376844, 'тема'),
   (0.05710413, 'администрация'),
   (0.04365888, 'администратор'),
   (0.036828127, 'взлом'),
   (0.029506786, 'минута'),
   (0.014823727, 'описание'),
   (0.014069037, 'создание'),
   (0.0111717405, 'загрузка'),
   (0.00858618, 'ник'),
   (0.008196409, 'язык'),
   (0.0050249365, 'вконтакте'),
   (0.0032898774, 'свет'),
   (0.0031977184, 'опрос'),
   (0.0030309225, 'простой'),
   (0.0018451993, 'обработка'),
   (0.0016048258, 'рамочка')],
  -8.134058484859032),
 ([(0.18112387, 'фамилия'),
   (0.10755301, 'паспорт'),
   (0.086504675, 'фон'),
   (0.08298527, 'уведомление'),
   (0.065878116, 'личность'),
   (0.060616896, 'ирина'),
   (0.051816683, 'кошелёк'),
   (0.048441235, 'яндекс'),
   (0.04125461, 'начало'),
   (0.039864883, 'память'),
   (0.031057535, 'категория'),
   (0.027767194, 'ребята'),
   (0.027520483, 'олег'),
 

In [21]:
# Train LDA model.
num_topics = 6
chunksize = 2000
passes = 40
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model6 = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

top_topics6 = model6.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence6 = sum([t[1] for t in top_topics6]) / num_topics

CPU times: user 8h 28min 21s, sys: 12min 40s, total: 8h 41min 2s
Wall time: 4h 48min 45s


In [22]:
print('Average topic coherence: %.4f.' % avg_topic_coherence6)

from pprint import pprint
pprint(top_topics6)         



Average topic coherence: -4.8613.
[([(0.07301616, 'аккаунт'),
   (0.06279059, 'друг'),
   (0.0618271, 'фото'),
   (0.039905153, 'сообщение'),
   (0.037847664, 'человек'),
   (0.026218574, 'время'),
   (0.02507362, 'фотография'),
   (0.018758155, 'ссылка'),
   (0.018532624, 'просьба'),
   (0.018103696, 'пользователь'),
   (0.016074993, 'имя'),
   (0.014568619, 'информация'),
   (0.013612544, 'вопрос'),
   (0.012187711, 'поддержка'),
   (0.011232305, 'ответ'),
   (0.011033092, 'служба'),
   (0.010284146, 'сайт'),
   (0.00944199, 'уважение'),
   (0.00906602, 'сутки'),
   (0.008298538, 'доставка')],
  -3.6059850919225034),
 ([(0.11447515, 'деньга'),
   (0.105017744, 'заказ'),
   (0.081133194, 'карта'),
   (0.07012546, 'подарок'),
   (0.022755109, 'месяц'),
   (0.020185603, 'счёт'),
   (0.018130176, 'статус'),
   (0.017891807, 'услуга'),
   (0.017869743, 'лента'),
   (0.015558283, 'рубль'),
   (0.014855487, 'рождение'),
   (0.014492383, 'дата'),
   (0.014484533, 'окова'),
   (0.013714912, '

In [23]:
# Train LDA model.
num_topics = 7
chunksize = 2000
passes = 40
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model7 = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

top_topics7 = model7.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence7 = sum([t[1] for t in top_topics7]) / num_topics

CPU times: user 11h 15min 42s, sys: 19min 21s, total: 11h 35min 3s
Wall time: 4h 3min 53s


In [24]:
print('Average topic coherence: %.4f.' % avg_topic_coherence7)

from pprint import pprint
pprint(top_topics7)         



Average topic coherence: -5.1921.
[([(0.10347928, 'сайт'),
   (0.095140375, 'друг'),
   (0.09368055, 'фото'),
   (0.052869245, 'группа'),
   (0.044020597, 'проблема'),
   (0.027430715, 'пользователь'),
   (0.022871621, 'игра'),
   (0.021430962, 'причина'),
   (0.018466907, 'поддержка'),
   (0.016717417, 'служба'),
   (0.015384427, 'однокласник'),
   (0.015077662, 'лента'),
   (0.014306467, 'уважение'),
   (0.013109927, 'тема'),
   (0.012364006, 'покупка'),
   (0.011958632, 'список'),
   (0.011905642, 'дело'),
   (0.010164698, 'пост'),
   (0.010144973, 'раздел'),
   (0.009570879, 'оповещение')],
  -4.11909563266866),
 ([(0.3621485, 'страница'),
   (0.08855813, 'день'),
   (0.062538706, 'товар'),
   (0.02628546, 'адрес'),
   (0.023879247, 'год'),
   (0.023524137, 'фотография'),
   (0.022403449, 'вечер'),
   (0.017598957, 'ссылка'),
   (0.017387396, 'просьба'),
   (0.016881738, 'помощь'),
   (0.015081527, 'имя'),
   (0.014005476, 'возможность'),
   (0.013668314, 'информация'),
   (0.01053