In [1]:
import json
import re
from pprint import pprint
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

In [2]:
# Load conversations

In [3]:
with open("data/dialogs.json", mode="r") as file:
    dialogs_json = json.loads(file.read())

In [4]:
# Load conversation themes

In [5]:
er = {}
with open("data/dialogs_table_employee_remarks.sql", mode="r") as file:
    for line in file:
        if line[0] == '(':
            rec = line[1:-3].split(',')
            er[int(rec[0])] = rec[1].strip(" '").replace("\\","").replace("\"","")
pprint(er)

{1: 'нет оценки',
 236486: 'Заявка на подключение Интернет',
 236487: 'Консультация по подключению Интернет',
 236488: 'Консультация по КТВ',
 236489: 'Финансовая консультация',
 236490: 'Консультация по интерактивному телевидению',
 236491: 'Не работает ЛК приложение – рекомендация по работе',
 245949: 'ТТ Финансы',
 245950: 'ТТ Клиентские',
 245951: 'Общая проблема - информирование клиента',
 245952: 'Не работает интернет – рекомендация по настройке',
 245953: 'Смена тарифного плана',
 245954: 'Инсталляция',
 245955: 'Консультация по доп.услугам',
 245956: 'Прочее',
 246249: 'ТТ Телевидение',
 246250: 'Не работает КТВ – рекомендация по настройке',
 246251: 'Не работает Интерактивное ТВ – рекомендация по настройке',
 246252: 'Не работают прочие услуги – рекомендация по работе',
 247714: 'Возврат ДС/оборудования // Переоформление /расторжение договора'}


In [6]:
# Grouping conversation topics to group related topics

In [7]:
categories_to_er = {
    "connect_to_inet": [236486, 236487],
    "install": [245954],
    "finance": [236489, 245949],
    "no_LK": [236491],
    "no_ITV": [246251],
    "no_KTV": [246250],
    "low_speed": [245950],
    "no_internet": [245952],
    "termination_return": [247714],
    "tariff_changes": [245953],
    "TV": [246249],
    "KTV": [236488],
    "ITV": [236490],
    "additional": [245955],
#     "other": [245953, 246249, 236488, 236490, 245955]
}
er_to_categories = {}
for category, value in categories_to_er.items():
    er_to_categories = {**er_to_categories, **{er: category for er in value}}
pprint(er_to_categories)

{236486: 'connect_to_inet',
 236487: 'connect_to_inet',
 236488: 'KTV',
 236489: 'finance',
 236490: 'ITV',
 236491: 'no_LK',
 245949: 'finance',
 245950: 'low_speed',
 245952: 'no_internet',
 245953: 'tariff_changes',
 245954: 'install',
 245955: 'additional',
 246249: 'TV',
 246250: 'no_KTV',
 246251: 'no_ITV',
 247714: 'termination_return'}


In [8]:
# Create files with conversations by themes

In [9]:
datasets_dict = {}
for _, value in dialogs_json.items():
    if value["er"] in er_to_categories:
        category = er_to_categories[value["er"]]
        with open(f"data/{category}.csv", mode="a") as file:
            phrases = " ".join(value["visitors_phrases"][:2])
            print(f"1;{phrases};", file=file)

In [18]:
# Conversation example. visitors_phrases - customer phrases, agents_phrases - employee phrases

In [14]:
dialogs_json["50988"]

{'er': 236488,
 'site': 159859,
 'created_at': '2020-04-30 21:57:43',
 'visitors_phrases': ['Доьрый вечер',
  'Мне нужно отключить кабельное телевидение. Я пользуюсь только интернетом',
  'Г. Одинцово',
  'Подъезд 1',
  'Андреева Маргарита Николаевна',
  'Хорошо. Эту справку нужно отнести в ук?',
  'Нет'],
 'agents_phrases': ['Добрый вечер!',
  'Какой у вас вопрос?',
  'Уточните адрес (улица',
  'Для составления заявки',
  'Информацию зафиксировали. К вам будет направлен сотрудник',
  'Да можете сами направить',
  'Возможно чем-то еще могу вам помочь?',
  'Спасибо за Ваше обращение в нашу компанию. Всегда рады Вам помочь. Хорошего вечера!']}

In [19]:
# Write all phrases to one file.

In [16]:
all_phrases = []
for _, value in dialogs_json.items():
    phrases = value["visitors_phrases"] + value["agents_phrases"]
    all_phrases += phrases

In [17]:
with open(f"data/all_phrases.txt", mode="w") as file:
    for phrase in all_phrases:
        print(phrase, file=file)

# Preprocessing pipeline

In [20]:
# Bring the words associated with wi-fi to the same form

In [35]:
wifi_union = [("wi", "fi"), ("вай", "фай")]
wifi_naming = {"wifi": "вайфай", "wi-fi": "вайфай", "вайфая": "вайфай"}
def wifi_preprocessing(phrase: str) -> str:   
    words = [word.replace("фая", "фай") for word in phrase.split()]
    i = 0
    united_words = []
    while i<len(words):
        for pair in wifi_union:
            if i < len(words)-1 and words[i] == pair[0] and words[i+1] == pair[1]:
                united_words.append(words[i] + words[i+1])
                i += 2
                break
        else:
            united_words.append(words[i])
            i += 1
    words = [word if word not in wifi_naming else wifi_naming[word] for word in united_words]
    result_phrase = " ".join(words)
    return result_phrase

In [21]:
# Remove links

In [22]:
def remove_links(text: str) -> str:  
    return re.sub(r'(www|http:|https:)+[^\s]+[\w]', '', text)

In [23]:
# Remove all expected digits, russian letters, space, ., ! and ? 

In [24]:
def remove_punct_and_latin(text: str) -> str:  
    return re.sub(r'[^0-9А-ЯЁа-яё\s.!?]', '', text)

In [25]:
# Replace digits by symbol #

In [26]:
def replace_digits(text: str) -> str:
    for length in range(10,0,-1):
        pattern = r'\d{' + str(length) + '}'
        match = re.search(pattern, text)
        if match:
            text = re.sub(pattern, '#'*length, text)
    return text

In [27]:
# Converting words to normal form

In [28]:
exception_words = ['ловит', 'ловита', 'ловиту']
def normalize(text: str) -> str:
    normilized_sentence = []
    for word in text.split():
        if word not in exception_words:
            word_forms = morph.parse(word)
            if word_forms:
                normilized_sentence.append(word_forms[0].normal_form)
        else:
            normilized_sentence.append(word)
    return " ".join(normilized_sentence)

In [29]:
# Split phrase by sentences

In [30]:
def split_by_sentence(separators, phrases):   
    tmp = phrases
    for sep in separators:
        result = []
        for phrase in list(map(lambda x: x.split(sep), tmp)):
            result += phrase
        tmp = result
    return [phrase.strip() for phrase in result if phrase.strip() != ""]

In [31]:
# Remove stop words

In [32]:
def remove_stop_words(text: str) -> str:
    stop_words = ['здравствуйте', 'добрый день', 'доброе утро', 'доброй ночи', 'добрый вечер', 'привет', 'спасибо',
                  'добрый утро', 'добрый ночи', 'досвидание', 'до свидание', 'всего хорошего', 'всего хороший']
    for word in stop_words:
        pattern = r'' + word + ''
        text = re.sub(pattern, '', text)
    return text    

In [33]:
# Make transformation

In [36]:
all_processing_phrases = []
for _, value in dialogs_json.items():
    phrases = value["visitors_phrases"] + value["agents_phrases"]
    phrases = list(map(lambda x: remove_links(x), phrases))
    phrases = list(map(lambda x: wifi_preprocessing(x), phrases))
    phrases = list(map(lambda x: remove_punct_and_latin(x), phrases))
    phrases = list(map(lambda x: replace_digits(x), phrases))
    phrases = list(map(lambda x: x.lower(), phrases))
    phrases = split_by_sentence(".!?", phrases)
    phrases = list(map(lambda x: normalize(x), phrases))
    phrases = list(map(lambda x: remove_stop_words(x), phrases))
    phrases = [phrase.strip() for phrase in phrases if phrase.strip() !=""]
    all_processing_phrases += phrases

In [37]:
len(all_processing_phrases)

983896

In [38]:
with open(f"data/all_processing_phrases.txt", mode="w") as file:
    for phrase in all_processing_phrases:
        print(phrase, file=file)

# Stats

In [39]:
from statistics import mean, quantiles
import numpy as np

In [40]:
lens = []
for phrase in all_processing_phrases:
    lens.append(len(phrase.split()))

In [41]:
mean(lens)

3.9290799027539496

In [42]:
quantiles(lens)

[2.0, 3.0, 6.0]

In [None]:
# See to the sentences length destribution 

In [43]:
np.bincount(lens)

array([     0, 234766, 178843, 111804, 123173,  88056,  83683,  52917,
        39364,  21908,  13754,  11007,   6875,   4174,   4267,   2335,
         2560,   1090,   1133,    496,    331,    211,    203,    221,
          232,     69,     74,     94,     84,     30,     29,     15,
           18,      5,      8,      9,      4,     13,      6,      1,
            0,      6,      2,      5,      4,      0,      1,      1,
            1,      0,      2,      0,      1,      0,      1,      0,
            0,      0,      0,      0,      0,      0,      1,      1,
            1,      0,      0,      0,      2,      0,      0,      0,
            0,      0,      0,      1,      1,      1,      0,      0,
            0,      0,      0,      2])

In [None]:
# Most single word sentences ! This sentences meaningless

# Remove phrase with length 1

In [44]:
processing_phrases_more_one_word = []
for phrase in all_processing_phrases:
    if len(phrase.split()) > 1:
        processing_phrases_more_one_word.append(phrase)

In [45]:
len(processing_phrases_more_one_word)

749130

In [46]:
with open(f"data/processing_phrases_more_one_word.txt", mode="w") as file:
    for phrase in processing_phrases_more_one_word:
        print(phrase, file=file)