In [709]:
import re
import pandas as pd
from pymystem3 import Mystem
from gensim.models import Word2Vec
from sklearn.neighbors import NearestNeighbors
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import random
from tqdm import tqdm

In [710]:
def stemming(tokens: list) -> pd.Series:
    stem = Mystem()
    tokens = [word_tokenize("".join(stem.lemmatize(sentence))) for sentence in tokens]
    tokens = [[word for word in sentence if len(word) > 2] for sentence in tokens]
#     tokens = [
#         [
#             word
#             for word in sentence
#             if morph.parse(word)[0].tag.POS == "NOUN"
#             or morph.parse(word)[0].tag.POS == "ADJF"
#         ]
#         for sentence in tokens
#     ]
    return tokens

In [711]:
def check_stopwords(text: pd.Series) -> pd.Series:
    result = [
        [word for word in sentence if word not in ru_stopwords and word != " "]
        for sentence in text
    ]
    return result

In [712]:
ru_stopwords = stopwords.words("russian")
final_df = pd.read_csv('dataset.csv')

final_df['clean_text'] = final_df.text.apply(lambda x: re.sub('[0-9]+', '', x))
final_df['clean_text'] = final_df.clean_text.apply(lambda x: re.sub('[^а-яА-Я]+', ' ', x))
final_df['clean_text'] = final_df.clean_text.apply(lambda x: x.lower())

In [713]:
%%time
lemmatized_tokens = stemming(final_df.clean_text)
lemmatized_tokens = check_stopwords(lemmatized_tokens)
final_df['clean_text'] = lemmatized_tokens

CPU times: user 1min 18s, sys: 3.04 s, total: 1min 21s
Wall time: 3min 10s


In [714]:
model = Word2Vec(sentences=lemmatized_tokens, size=50, window=10, min_count=3, workers=-1, seed=42)

In [715]:
outbox_df = pd.DataFrame(final_df[final_df['type'] == 'outbox']).reset_index(drop=True)
outbox_vectors = []

for text in outbox_df.clean_text:
    tmp = []
    for word in text:
        try:
            tmp.append(model.wv.get_vector(word))
        except:
#             print(word)
            pass
    if len(tmp) != 0:
        outbox_vectors.append(sum(tmp) / len(tmp))
    else:

        outbox_vectors.append([None for _ in range(50)])

In [716]:
clean_emails = []
for i in outbox_df['to']:
    try:
        clean_emails.append(re.search(r'[0-9A-Za-z\.-]+@[\.a-z]+', i).group(0))
    except AttributeError:
        pass

In [717]:
outbox_df['to_clean'] = clean_emails
modelling = pd.DataFrame(outbox_vectors, index=outbox_df.index)
modelling['email'] = outbox_df.to_clean
to_drop = modelling[modelling[0].isnull()].index
modelling = modelling.drop(index = to_drop ).reset_index(drop=True)
outbox_df = outbox_df.drop(index = to_drop ).reset_index(drop=True)
X = modelling.copy()#.drop(columns=['email'])
clean_emails = pd.DataFrame({'emails': clean_emails})
test_index = X[X.email.isin(clean_emails.emails.value_counts()[:3].keys().tolist())].index
random.seed(65)
test_index = random.sample(list(test_index), k=30)
X.drop(columns=['email'], inplace=True)

In [718]:
test = X.loc[test_index, :]
train = X[~X.index.isin(test_index)]
neigh = NearestNeighbors(n_neighbors=5, radius=0.01)
neigh.fit(train)
preds = neigh.kneighbors(test)

In [719]:
filtered_rad = []
filtered_ind = []

for row in range(len(preds[0])):
    tmp = []
    tmp1 = []
    for ind in range(len(preds[0][row])):
        if preds[0][row][ind] < 0.11:
            tmp.append(preds[0][row][ind])
            tmp1.append(preds[1][row][ind])
    filtered_rad.append(tmp)
    filtered_ind.append(tmp1)

In [720]:
acc = []
for i, ind in enumerate(test.index):
    email = outbox_df.at[ind, 'to_clean']
    if email in modelling.loc[filtered_ind[i], 'email'].tolist():
        acc.append(1)
    else:
        acc.append(0)

In [708]:
sum(acc)/len(acc)

0.6

In [629]:
len(acc)

30

In [630]:
sum(acc)

16

In [692]:
res = []
for rand in tqdm(range(100)):     
    outbox_df = pd.DataFrame(final_df[final_df['type'] == 'outbox']).reset_index(drop=True)
    outbox_vectors = []

    for text in outbox_df.clean_text:
        tmp = []
        for word in text:
            try:
                tmp.append(model.wv.get_vector(word))
            except:
    #             print(word)
                pass
        if len(tmp) != 0:
            outbox_vectors.append(sum(tmp) / len(tmp))
        else:
            outbox_vectors.append([None for _ in range(50)])
    clean_emails = []
    for i in outbox_df['to']:
        try:
            clean_emails.append(re.search(r'[0-9A-Za-z\.-]+@[\.a-z]+', i).group(0))
        except AttributeError:
            pass
    outbox_df['to_clean'] = clean_emails
    modelling = pd.DataFrame(outbox_vectors, index=outbox_df.index)
    modelling['email'] = outbox_df.to_clean
    to_drop = modelling[modelling[0].isnull()].index
    modelling = modelling.drop(index = to_drop ).reset_index(drop=True)
    outbox_df = outbox_df.drop(index = to_drop ).reset_index(drop=True)
    X = modelling.copy()#.drop(columns=['email'])
    clean_emails = pd.DataFrame({'emails': clean_emails})
    test_index = X[X.email.isin(clean_emails.emails.value_counts()[:3].keys().tolist())].index
    random.seed(rand)
    test_index = random.sample(list(test_index), k=30)
    X.drop(columns=['email'], inplace=True)
    test = X.loc[test_index, :]
    train = X[~X.index.isin(test_index)]
    neigh = NearestNeighbors(n_neighbors=10, radius=0.01)
    neigh.fit(train)
    preds = neigh.kneighbors(test)
    filtered_rad = []
    filtered_ind = []

    for row in range(len(preds[0])):
        tmp = []
        tmp1 = []
        for ind in range(len(preds[0][row])):
            if preds[0][row][ind] < 0.11:
                tmp.append(preds[0][row][ind])
                tmp1.append(preds[1][row][ind])
        filtered_rad.append(tmp)
        filtered_ind.append(tmp1)
    acc = []
    for i, ind in enumerate(test.index):
        email = outbox_df.at[ind, 'to_clean']
        if email in modelling.loc[filtered_ind[i], 'email'].tolist():
            acc.append(1)
        else:
            acc.append(0)
    res.append(sum(acc)/len(acc))

100%|██████████| 100/100 [00:46<00:00,  2.14it/s]


In [693]:
max(res)

0.9

In [695]:
np.argmax(res)

65

In [722]:
modelling.to_csv('modelling_dots.csv', index=False)

In [723]:
text = 'Это финал цифрового прорыва, сроки спасибо с уважением'
s = pd.Series(text)

In [724]:
s

0    Это финал цифрового прорыва, сроки спасибо с у...
dtype: object

In [728]:
def text_pr(text):
    s = pd.Series(text)
    s = s.apply(lambda x: re.sub('[0-9]+', '', x))
    s = s.apply(lambda x: re.sub('[^а-яА-Я]+', ' ', x))
    s = s.apply(lambda x: x.lower())
    s = stemming(s)
    s = check_stopwords(s)
    return ' '.join(s)

In [729]:
def dots(text):
    tmp = []
    for word in text:
        try:
            tmp.append(model.wv.get_vector(word))
        except:
#             print(word)
            pass
    if len(tmp) != 0:
        outbox_vectors.append(sum(tmp) / len(tmp))
    else:

        outbox_vectors.append([None for _ in range(50)])
    