In [1]:
import numpy as np
import pandas as pd

import itertools
import math

import matplotlib.pyplot as plt
from matplotlib import gridspec

from tqdm.notebook import tqdm
import pymorphy2

import nltk
from nltk.tokenize import RegexpTokenizer

from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel

from sklearn.feature_extraction.text import CountVectorizer

import tqdm

from pymystem3 import Mystem

from Levenshtein import distance as lev

import yaml

%load_ext autoreload
%autoreload

In [2]:
with open('topics.yaml', 'r') as file:
    topics_signature = yaml.safe_load(file)

In [3]:
FL = '../dat/glue_all_but_owner.txt'

topics_signature

{'QUESTION': [['?'], ['подскажите'], ['есть'], ['условия']],
 'BUY': [['хочу', 'купить'], ['рассрочк'], ['сколько', 'стоит'], ['подарок']],
 'REVIEW': [['спасибо']],
 'ADVERTISEMENT': [['реклама']],
 'IPHONE': [['iphone'], ['айфон'], ['apple'], ['эппл']]}

In [4]:
def only_alpha_from_string(x):
    return ''.join(a for a in x if (a.isalpha() or a == ' '))


def get_min_dist(msg, tag):
    min_lev = 1e6
    for word in msg:
        min_lev = min(lev(word, tag), min_lev)
    return min_lev


def check_tag(message_text, tag_content, lev_threshold=3):
    fail_flg = False
    for tag in tag_content:
        if get_min_dist(message_text, tag) > lev_threshold:
            fail_flg = True
            break
    return not fail_flg

def check_topic(message_text, tags):
    has_topic = False
    relevant_tags = []
    for tag in tags:
        if check_tag(message_text, tag):
            has_topic = True
            relevant_tags.append(tag)
    return has_topic, relevant_tags


def check_message(message_text, config):
    message_text = only_alpha_from_string(message_text).lower().split()
    result = {}
    for topic_name in config:
        check_result, tags = check_topic(message_text, config[topic_name])
        if check_result:
            result[topic_name] = tags
    return result



In [5]:
with open(FL, 'r') as fl:
    messages = fl.readlines()
    messages = [m.replace('\n', '') for m in messages]


In [1]:
ehehe = ['сколько стоят наушники в рассрочку?']

In [7]:
topic_separation = []

for ms in tqdm.tqdm(messages):
    topic_separation.append(check_message(ms, topics_signature))

100%|██████████| 31772/31772 [00:05<00:00, 6191.14it/s]


In [8]:
topic_separation

[{'QUESTION': [['?'], ['есть']],
  'BUY': [['хочу', 'купить'], ['сколько', 'стоит']],
  'IPHONE': [['айфон'], ['эппл']]},
 {'QUESTION': [['?'], ['есть']],
  'BUY': [['рассрочк']],
  'IPHONE': [['айфон'], ['эппл']]},
 {'QUESTION': [['?'], ['подскажите'], ['есть']]},
 {'QUESTION': [['?'], ['есть'], ['условия']],
  'BUY': [['рассрочк'], ['сколько', 'стоит']]},
 {'QUESTION': [['?'], ['есть']],
  'BUY': [['рассрочк'], ['сколько', 'стоит']],
  'IPHONE': [['айфон']]},
 {'QUESTION': [['?'], ['есть'], ['условия']],
  'BUY': [['рассрочк']],
  'IPHONE': [['iphone']]},
 {'QUESTION': [['?'], ['есть']]},
 {'QUESTION': [['?']]},
 {'QUESTION': [['?'], ['подскажите'], ['есть']],
  'BUY': [['хочу', 'купить'], ['рассрочк'], ['сколько', 'стоит']],
  'IPHONE': [['айфон'], ['эппл']]},
 {'QUESTION': [['?'], ['есть']], 'BUY': [['рассрочк']]},
 {'QUESTION': [['?'], ['есть'], ['условия']], 'BUY': [['рассрочк']]},
 {'QUESTION': [['?']], 'IPHONE': [['iphone'], ['apple']]},
 {'QUESTION': [['?'], ['есть']]},
 {'QUE