Код токенизации фраз

In [None]:
import json
import re
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.feature_extraction import text
import nltk
from nltk.corpus import stopwords
import pymorphy2
from IPython.display import Markdown, display, HTML

# Make sure the NLTK stop-word corpus is available locally.
nltk.download("stopwords")

# Minimum gap since a sender's previous message for a new message to be
# considered (half of 86400; presumably seconds, matching 'date_unixtime').
freq_limit = 86400 / 2
# Cut-off as a Unix timestamp: 2020-01-01 00:00 interpreted in local time.
# .timestamp() replaces strftime("%s"), which is a platform-specific
# extension that is not available everywhere (e.g. on Windows).
time_start = int(datetime.datetime.strptime('01-01-2020', '%d-%m-%Y').timestamp())

morph = pymorphy2.MorphAnalyzer(lang='ru')

# Read the export inside a context manager so the handle is always closed,
# and decode it as UTF-8 instead of the platform default encoding.
with open('vacancy.json', encoding='utf-8') as vacancy_file:
    data = json.load(vacancy_file)

# Sender ids used as an exclusion list; only key membership is tested,
# the True values are placeholders.
_excluded_sender_ids = (
    'user127586289', 'user68513872', 'user1471788', 'user125042168', 'user92008355',
    'user125418312', 'user100782243', 'user82639411', 'user153030146', 'user291176400',
    'user73173481', 'user551102', 'user188075071', 'user53629404', 'user1032579',
    'user194201144', 'user162658742', 'user158187166', 'user116682853', 'user82230092',
    'user113976012', 'user3797167', 'user94299908', 'user230762864', 'user119852384',
    'user202423560', 'user407081565', 'user115002256', 'user543084133', 'user163171491',
)
locals_dict = {sender_id: True for sender_id in _excluded_sender_ids}

# NLTK's Russian stop words followed by extra filler words typical for
# skill descriptions (repeated entries are harmless: they collapse in the
# lookup dict below).
stop_words = [
    *stopwords.words("russian"),
    'умение', 'навык', 'уверенное', 'владение', 'знание', 'опыт', 'умение', 'работать', 'навык', 'работы',
    'понимание', 'глубокое', 'владение', 'осведомленность', 'уровень', 'знаний', 'практический', 'опыт',
    'профессиональное', 'владение', 'применять', 'основ', 'анализировать', 'уверенность',
]

# Dict keyed by stop word for constant-time membership tests.
stop_words_dict = {word: True for word in stop_words}

def printmd(string):
    """Show ``string`` in the notebook output as rendered Markdown."""
    rendered = Markdown(string)
    display(rendered)

def preprocess(text):
    """Normalise a phrase into space-separated lemmas without stop words.

    Runs of the punctuation characters ``. , - ! ? : ( ) [ ]`` are replaced
    by a space. Each remaining whitespace-separated token is lower-cased and
    reduced to the normal form of its first pymorphy2 parse.

    A token is dropped when either its lower-cased surface form or its lemma
    is a key of ``stop_words_dict``. The stop list contains inflected entries
    (e.g. 'работы', 'знаний') which a lemma-only check would typically not
    match, so the surface form is checked as well.

    Args:
        text: The phrase to normalise.

    Returns:
        The kept lemmas joined by single spaces; '' when nothing is kept.
    """
    # Raw string: "\-", "\[" and "\]" are invalid escape sequences in a
    # regular string literal. The resulting pattern is unchanged.
    text = re.sub(r"[.,\-!?:()\[\]]+", " ", text)

    lemmas = []
    for token in text.split():
        token = token.lower()

        # Surface-form hit: skip without paying for morphological analysis.
        if token in stop_words_dict:
            continue

        lemma = morph.parse(token)[0].normal_form
        if lemma not in stop_words_dict:
            lemmas.append(lemma)

    return ' '.join(lemmas)

# Timestamp of the most recent processed message per sender.
member_time = {}
# Cleaned plain-text messages that passed every filter below.
messages = []

for m in data['messages']:
    t = int(m['date_unixtime'])

    # Skip entries before the cut-off, without a sender, or with a zero time.
    if t < time_start or 'from_id' not in m or not t:
        continue

    sender = m['from_id']
    if sender in locals_dict:
        continue

    # Accept a message only for a sender not seen before, or when more than
    # freq_limit has elapsed since that sender's previous message.
    previous = member_time.get(sender)
    if previous is None or t - previous > freq_limit:
        msg = ''.join(
            item['text'] for item in m['text_entities'] if item['type'] == 'plain'
        )

        if len(msg) > 40:
            # Newlines become spaces, the ends are trimmed, and runs of
            # spaces are collapsed into one.
            msg = re.sub("\n{1,}", " ", msg)
            msg = msg.strip("\n ")
            msg = re.sub(" {1,}", " ", msg)
            messages.append(msg)

    # Recorded for every processed message, accepted or not.
    member_time[sender] = t

print(len(messages))

Обучение модели

In [None]:
# Lemmatise every collected message and turn the corpus into a TF-IDF matrix.
processed_messages = [preprocess(raw_message) for raw_message in messages]

vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(processed_messages)

# Number of clusters to fit.
true_k = 30
model = KMeans(
    n_clusters=true_k,
    init='k-means++',
    max_iter=2000,
    n_init=50,
    tol=1e-1,
)
model.fit(x)

Визуализируем результат

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# Assign every message to a cluster with one batched transform/predict,
# reusing the texts already lemmatised for training instead of calling
# preprocess() and predict() again for each message.
labels = model.predict(vectorizer.transform(processed_messages))

# Cluster index -> original (unprocessed) messages assigned to it.
clustered = {}
for m, p in zip(messages, labels):
    clustered.setdefault(p, []).append(m)

# For each cluster, feature indices sorted by descending centroid weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

for i in range(true_k):
    # A cluster can end up with no messages; show it as empty instead of
    # failing with KeyError.
    members = clustered.get(i, [])
    printmd("**Кластер {n} ({l})**".format(n=i, l=len(members)))

    # The ten highest-weighted terms of the centroid, rendered as grey tags.
    w = ''.join(
        '<span style="background-color: #EEEEEE; margin: 3px">%s</span>' % terms[c]
        for c in order_centroids[i, :10]
    )
    display(HTML(w))

    # Show up to 15 sample messages with the first letter capitalised;
    # slicing with [:1] keeps an empty message from raising IndexError.
    for m in members[0:15]:
        printmd('* ' + m[:1].upper() + m[1:])
    print()
    print()

Смотрим размерность векторов в данной модели

In [None]:
# NOTE: this rebinds the name `vectorizer` to a fresh TF-IDF model that is
# fitted on the single phrase below.
message = 'Умение отлаживать пайплайны обработки данных в эирфлоу'
vectorizer = TfidfVectorizer()

single_phrase_tfidf = vectorizer.fit_transform([message])
print(single_phrase_tfidf)

А теперь посмотрим размерность векторов при использовании spaCy

In [None]:
import spacy
# Load the "ru_core_news_lg" spaCy pipeline.
nlp = spacy.load("ru_core_news_lg")

message = 'Умение отлаживать пайплайны обработки данных в эирфлоу'
# Run the pipeline on the phrase and print the shape of its document vector.
doc = nlp(message)
print(doc.vector.shape)