In [5]:
import sklearn
sklearn.__version__

'1.4.2'

In [6]:
import pandas as pd
import sys
import glob
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import random
from collections import Counter
from striprtf.striprtf import rtf_to_text
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from sklearn.utils.class_weight import compute_class_weight
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
def remove_russian_stopwords(text):
    russian_stopwords = stopwords.words("russian")
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in russian_stopwords]
    return ' '.join(filtered_words)

In [8]:
MAPPING = {
    "proxy": "доверенность",
    "contract": "договор",
    "act": "акт",
    "application": "заявление",
    "order": "приказ",
    "invoice": "счет",
    "bill": "приложение",
    "arrangement": "соглашение",
    "contract offer": "договор оферты",
    "statute": "устав",
    "determination": "решение",
}
etor = MAPPING
rtoe = {value: key for key, value in MAPPING.items()}

In [9]:
CLASSES_= list(MAPPING.values())

In [10]:
data = pd.read_csv('../data/lemma_sample.csv', index_col=0)
data['text'] = data['text'].apply(lambda x: remove_russian_stopwords(x))

In [11]:
for class_ in CLASSES_:
    data[class_] = data['text'].str.contains(class_)

In [12]:
data['class'].value_counts()

class
proxy             71
contract          70
act               69
application       61
order             50
invoice           43
bill              41
arrangement       40
contract offer    25
statute           21
determination     10
Name: count, dtype: int64

In [13]:
train_data, test_data = train_test_split(
    data,
    test_size=0.25,
    random_state=42,
    stratify=data["class"],
)

--- 
## Фичи - названия классов в тексте

In [14]:
model = RandomForestClassifier(random_state=42)
model.fit(train_data[CLASSES_], train_data['class'])

y_true = test_data['class']
y_pred = model.predict(test_data[CLASSES_])

In [15]:
print(
    f'acc: {accuracy_score(y_true, y_pred): .4f}\n'
    f'f1 : {f1_score(y_true, y_pred, average="macro"): .4f}\n'
)

acc:  0.7937
f1 :  0.7423



--- 
## Статистика по встречаемости

In [16]:
freq = Counter(' '.join(train_data['text'].values.tolist()).split())
stat = train_data.groupby('class', as_index=False).agg(
    {'text': lambda x: ' '.join(x)}
)
stat['words'] = stat['text'].apply(lambda x: Counter(x.split()))
stat['top-10'] = stat['words'].apply(lambda x: x.most_common(10))

metric = {}
for class_ in tqdm(CLASSES_):

    value = {}
    for word, fr in stat[
        stat['class'] == rtoe[class_]
    ]['top-10'].values.tolist()[0]:
        value[word] = fr / freq[word]
        
    metric[class_] = dict(sorted(value.items(), key=lambda item: item[1], reverse=True))
    
value = pd.DataFrame(metric.items(), columns=['class', 'value'])
value['class'] = value['class'].replace(rtoe)
stat = stat.merge(value, on='class')
stat = stat.set_index('class')

100%|██████████| 11/11 [00:00<00:00, 2498.10it/s]


In [17]:
stat.to_csv('../data/stat.csv', index=False)

In [18]:
for class_ in CLASSES_:
#     print(class_, stat.loc[rtoe[class_], 'top-10'])
    print(class_, stat.loc[rtoe[class_], 'value'])
    print()

доверенность {'доверенность': 0.7077625570776256, 'зарегистрировать': 0.6747967479674797, 'выдать': 0.6549019607843137, 'паспорт': 0.5522388059701493, 'право': 0.2463768115942029, 'документ': 0.2159090909090909, 'весь': 0.20649651972157773, 'год': 0.20359281437125748, 'адрес': 0.1806083650190114, 'лицо': 0.10810810810810811}

договор {'заказчик': 0.8075117370892019, 'далее': 0.5704697986577181, 'работа': 0.5390243902439025, 'установить': 0.496, 'оплата': 0.4229934924078091, 'соглашение': 0.4058679706601467, 'сторона': 0.3934142114384749, 'акт': 0.33687002652519893, 'договор': 0.2735166425470333, 'настоящий': 0.17380025940337224}

акт {'комиссия': 0.7045454545454546, 'составить': 0.5769230769230769, 'акт': 0.4270557029177719, 'год': 0.18962075848303392, 'директор': 0.16382252559726962, 'сторона': 0.10918544194107452, 'подпись': 0.09683794466403162, 'настоящий': 0.08430609597924774, 'лицо': 0.07396870554765292, 'договор': 0.06222865412445731}

заявление {'просить': 1.0, 'заявление': 0.45

In [19]:
top_words_ = [
    list(dct.keys()) for dct in stat['value'].values.tolist()
]
top_words = []
for lst in top_words_:
    top_words.extend(lst)
    
top_words = [i for i in set(top_words) if len(i) > 2]

--- 
## Фичи по встречаемости

In [20]:
data = pd.read_csv('../data/lemma_sample.csv', index_col=0)
data['text'] = data['text'].apply(lambda x: remove_russian_stopwords(x))

for top in top_words:
    data[top] = data['text'].str.findall(top).str.len()

train_data, test_data = train_test_split(
    data,
    test_size=0.25,
    random_state=42,
)

In [21]:
model = RandomForestClassifier(random_state=42)
model.fit(train_data[top_words], train_data['class'])

y_true = test_data['class']
y_pred = model.predict(test_data[top_words])

In [22]:
print(
    f'acc: {accuracy_score(y_true, y_pred): .4f}\n'
    f'f1 : {f1_score(y_true, y_pred, average="macro"): .4f}\n'
)

acc:  0.9683
f1 :  0.8706



--- 

## Добавим веса классов

In [23]:
weights = compute_class_weight(
    class_weight="balanced", 
    classes=train_data["class"].unique(), 
    y=train_data["class"],
)

weights_dict = dict(zip(
    train_data["class"].unique(), 
    weights
))

In [24]:
model = RandomForestClassifier(class_weight=weights_dict, random_state=42)
model.fit(train_data[top_words], train_data['class'])

y_true = test_data['class']
y_pred = model.predict(test_data[top_words])

In [25]:
print(
    f'acc: {accuracy_score(y_true, y_pred): .4f}\n'
    f'f1 : {f1_score(y_true, y_pred, average="macro"): .4f}'
)

acc:  0.9444
f1 :  0.8500


---
## Расширенный датасет

In [26]:
dataset_path = '../data/lemma_sample.csv'
external_dataset_path = '../data/lemma_external_with_samplelabel.csv'

In [27]:
data = pd.read_csv(dataset_path, index_col=0)
external_data = pd.read_csv(
    external_dataset_path, index_col=0
)
# external_data['clear_text_in_sample'] = (
#     pd.read_csv('../data/external_with_samplelabel.csv')
#     ['clear_text_in_sample']
# )

external_data = external_data[
    external_data["clear_text_in_sample"] == 0][["class", "text"]]
data = pd.concat([data, external_data])

In [28]:
train_data, test_data = train_test_split(
    data,
    test_size=0.25,
    random_state=42,
)

In [47]:
freq = Counter(' '.join(train_data['text'].values.tolist()).split())
stat = train_data.groupby('class', as_index=False).agg(
    {'text': lambda x: ' '.join(x)}
)
stat['words'] = stat['text'].apply(lambda x: Counter(x.split()))
stat['top-10'] = stat['words'].apply(lambda x: x.most_common(10))

metric = {}
for class_ in tqdm(CLASSES_):

    value = {}
    for word, fr in stat[
        stat['class'] == rtoe[class_]
    ]['top-10'].values.tolist()[0]:
        value[word] = fr / freq[word]
        
    metric[class_] = dict(sorted(value.items(), key=lambda item: item[1], reverse=True))
    
value = pd.DataFrame(metric.items(), columns=['class', 'value'])
value['class'] = value['class'].replace(rtoe)
stat = stat.merge(value, on='class')
stat = stat.set_index('class')

top_words_ = [
    list(dct.keys()) for dct in stat['value'].values.tolist()
]
top_words = []
for lst in top_words_:
    top_words.extend(lst)
    
top_words = sorted([i for i in set(top_words) if len(i) > 2])

100%|██████████| 11/11 [00:00<00:00, 957.15it/s]


In [48]:
for top in top_words:
    data[top] = data['text'].str.findall(top).str.len()
    
train_data, test_data = train_test_split(
    data,
    test_size=0.25,
    random_state=42,
)

In [49]:
model = RandomForestClassifier(
    class_weight=weights_dict, random_state=42
)
model.fit(train_data[top_words], train_data['class'])

y_true = test_data['class']
y_pred = model.predict(test_data[top_words])

In [50]:
train_data[top_words]

Unnamed: 0,адрес,акт,бальмонт,валюта,весь,выдать,год,город,груз,дата,...,товар,требовать,указать,условие,услуга,устав,уставный,участник,часть,электронный
104,0,4,0,0,0,0,1,0,0,0,...,0,1,1,3,0,1,0,0,0,0
163,4,3,0,0,5,5,2,0,0,1,...,0,0,0,1,0,0,0,0,0,0
199,1,2,0,0,0,0,1,0,0,0,...,0,2,0,0,0,1,0,0,0,0
325,2,0,0,0,0,0,1,0,0,0,...,0,3,1,2,0,0,0,0,0,0
266,1,5,0,0,0,0,0,0,0,0,...,0,0,1,0,0,2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
657,2,2,0,0,1,2,4,4,0,0,...,0,2,1,0,0,0,0,0,0,0
876,1,4,0,0,0,0,0,0,0,0,...,0,1,0,0,0,2,0,0,0,0
387,1,0,0,0,0,0,1,1,0,0,...,0,3,0,0,2,0,0,0,0,0
1041,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [56]:
print(
    f'acc: {accuracy_score(y_true, y_pred): .4f}\n'
    f'f1 : {f1_score(y_true, y_pred, average="macro"): .4f}'
)

acc:  0.9733
f1 :  0.9039


In [57]:
import joblib
filename = '../data/model.sav'
joblib.dump(model, filename)


['../data/model.sav']

---
## Инференс

In [58]:
stat.to_csv('../data/stat.csv', index=False)

In [59]:
import pandas as pd
import joblib

def predict(
    test_df: pd.DataFrame, 
    model_path: str = '../data/model.sav'
) -> list[str]:
    """
    Inference
    
    :param test_df: columns - [['text']]. Should be processed!
    
    :return: list[class_]
    """

    loaded_model = joblib.load(model_path)
    
    top_words = [
        "адрес", "акт", "бальмонт", "валюта", "весь", "выдать", "год", 
        "город", "груз", "дата", "действовать", "деятельность", "директор", 
        "доверенность", "договор", "документ", "доля", "единственный", "закон", 
        "зарегистрировать", "заявление", "или", "именовать", "имущественный", "код", 
        "комиссия", "контроль", "лицо", "масса", "место", "мой", "наименование", 
        "настоящий", "номер", "общество", "ограничить", "ознакомить", "оплата", 
        "ответственность", "отдел", "оферта", "паспорт", "подпись", "покупатель", 
        "право", "при", "приказ", "приказывать", "просить", "работа", "работник", 
        "регистрация", "решение", "руб", "случай", "соглашение", "составить", 
        "составлять", "срок", "сторона", "сумма", "товар", "требовать", "указать", 
        "условие", "услуга", "устав", "уставный", "участник", "часть", "электронный",
    ]
    
    for top in top_words:
        test_df[top] = test_df['text'].str.findall(top).str.len()
    
    from IPython.display import display

    test_df = test_df.drop(columns=['text'])
    display(test_df)
    preds = loaded_model.predict(test_df[top_words]).tolist()
    return preds


In [60]:
test_text = "Директору ООО «Кошкин дом» Денисову Олегу Викторовичу \n От оператора горячей линии Бальмонт Сергея Валерьевича \n Заявление о переводе на полную ставку\n По условиям трудового договора от 01.03.2025 г. я работаю на 0.5 ставки, по 4 часа в день. В связи с появлением большого количества рабочего времени, прошу с 01.10.2025 г. перевести меня на полную ставку.28 сентября 2025 г. Бальмонт Сергей Валерьевич  "
test_df = pd.DataFrame({'text': [test_text]})
predict(test_df)

Unnamed: 0,адрес,акт,бальмонт,валюта,весь,выдать,год,город,груз,дата,...,товар,требовать,указать,условие,услуга,устав,уставный,участник,часть,электронный
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


['application']