In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
nltk.download("stopwords")
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yuriy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yuriy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [56]:
input_data = pd.read_csv(r"C:\Users\yuriy\Downloads\ecommerce_data_set.csv", sep=",", header=None)

In [57]:
input_data.columns = ['class_type', 'naming']
input_data = input_data.dropna(subset=['naming'])
input_data.head(10)

Unnamed: 0,class_type,naming
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...
5,Household,Pitaara Box Romantic Venice Canvas Painting 6m...
6,Household,Paper Plane Design Starry Night Vangoh Wall Ar...
7,Household,Pitaara Box Romantic Venice Canvas Painting 6m...
8,Household,SAF 'Ganesh Modern Art Print' Painting (Synthe...
9,Household,Paintings Villa UV Textured Modern Art Print F...


In [58]:
x_train, x_test, y_train, y_test = train_test_split(input_data['naming'], input_data['class_type'], test_size=0.3, random_state=1)

In [59]:
def tokenizer(sentence: str, lang: str = 'english'):
    stop_words = stopwords.words(lang)
    snowball = SnowballStemmer(language=lang)
    tokens = word_tokenize(sentence, language=lang)
    tokens = [i for i in tokens if i not in string.punctuation]
    tokens = [i for i in tokens if i not in stop_words]
    tokens = [snowball.stem(i) for i in tokens]
    return tokens

In [60]:
model_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=lambda x: tokenizer(x))),
    ('model', DecisionTreeClassifier(
        criterion='gini',  # Выберите нужный критерий (gini или entropy)
        max_depth=5,       # Настройте максимальную глубину
        min_samples_split=2,
        min_samples_leaf=1,
        max_features=None,
        splitter='best',
        min_impurity_decrease=0.0
    ))
]
)

In [61]:
model_pipeline.fit(x_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x000002711CB4F310>)),
                ('model', DecisionTreeClassifier(max_depth=5))])

In [62]:
model_pipeline.predict(['Asus tuf gaming motherboard'])

array(['Household'], dtype=object)

In [63]:
precision_score(y_true=y_test, y_pred=model_pipeline.predict(x_test), average='micro', pos_label='positive')



0.6849550502379693

In [64]:
print(classification_report(y_test, model_pipeline.predict(x_test)))

                        precision    recall  f1-score   support

                 Books       0.96      0.50      0.65      3460
Clothing & Accessories       0.92      0.64      0.76      2562
           Electronics       0.94      0.39      0.55      3168
             Household       0.56      0.97      0.71      5938

              accuracy                           0.68     15128
             macro avg       0.85      0.62      0.67     15128
          weighted avg       0.79      0.68      0.67     15128



In [65]:
cv_result =  cross_val_score(model_pipeline, x_train, y_train, cv=5 )

In [66]:
print(cv_result)

[0.67988669 0.67431648 0.67360816 0.68437456 0.6680833 ]


### Обучаем модель на наших данных

In [67]:
input_data2 = pd.read_csv(r"C:\Users\yuriy\Downloads\duty_stats_raw_data.csv", sep=",")

In [68]:
input_data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   index          235 non-null    int64 
 1   user_name      233 non-null    object
 2   user_group_id  235 non-null    object
 3   message        235 non-null    object
 4   Task type      235 non-null    object
 5   Task subtype   235 non-null    object
 6   Resolution     235 non-null    object
 7   event_date     235 non-null    object
 8   task_status    235 non-null    object
dtypes: int64(1), object(8)
memory usage: 16.6+ KB


In [69]:
input_data2.head(10)

Unnamed: 0,index,user_name,user_group_id,message,Task type,Task subtype,Resolution,event_date,task_status
0,241,rostepanov,ASD Project,Привет!\n в отчете [Аквизишн Овервью](https://...,Вопрос по данным,Вопрос по логике отчета,Ответ,2023-06-19,done
1,408,epkuznetsova,ASD Project,Привет!\n Сейчас тестируем в команде Хантеры c...,Запрос на корректировку отчета / некорректная ...,Вопрос о возможности доработки отчета,Доработка не реальна на момент обращения,2023-08-17,done
2,237,assementsov,Analytic,"Гриша, привет\n подскажи пожалуйста, считаем л...",Вопрос по данным,Вопрос по данным битрикса,Ответ,2023-06-16,done
3,390,rostepanov,ASD Project,"Коллеги, привет!\n Отчет [Аквизишн овервью](ht...",Вопрос по данным,Вопрос по логике отчета,Ответ,2023-08-14,done
4,240,aadenisova,Analytic,"Коллеги, привет! \n Подскажите, где в витринах...",Вопрос по данным,Вопрос по логике витрины,Ответ,2023-06-16,done
5,300,gvponomarev,Analytic,"@iashmelev , @inkaraivanova\n \n > У нас с Иро...",Вопрос по данным,Вопрос по логике витрины,Ответ,2023-07-10,done
6,281,vagarkavaya,Analytic,Привет! Помогите пожалуйста разобраться с флаг...,Вопрос по данным,Вопрос по логике витрины,Ответ,2023-06-30,done
7,434,aazakharov,ASD KAM,"Коллеги, привет.\n У нашего клиента - [6168966...",Вопрос по отчету,Вопрос по логике отчета,Ответ,2023-08-28,done
8,298,asazarin,ASD KAM,"Коллеги, доброе утро.\n \n Сводный отчёт - htt...",Вопрос по отчету,Вопрос по логике отчета,Ответ,2023-07-10,done
9,372,vvdergilev,Analytic,Привет! Подскажите откуда лучше всего взять те...,Вопрос по данным,Вопрос по наличию данных,Ответ,2023-08-07,done


In [70]:
input_data2 = input_data2[['Task type', 'message']]

In [71]:
x_train, x_test, y_train, y_test = train_test_split(input_data2['message'], input_data2['Task type'], test_size=0.3, random_state=1)

In [72]:
model_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=lambda x: tokenizer(x))),
    ('model', DecisionTreeClassifier(
        criterion='entropy',  # Выберите нужный критерий (gini или entropy)
        max_depth=5,       # Настройте максимальную глубину
        min_samples_split=2,
        min_samples_leaf=1,
        max_features=None,
        splitter='best',
        min_impurity_decrease=0.0
    ))
]
)

In [73]:
model_pipeline.fit(x_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x000002711CCC38B0>)),
                ('model',
                 DecisionTreeClassifier(criterion='entropy', max_depth=5))])

In [74]:
print(classification_report(y_test, model_pipeline.predict(x_test)))

                                                             precision    recall  f1-score   support

                                                    Forward       0.67      0.27      0.38        15
                                           Вопрос по данным       0.23      0.92      0.37        12
                                           Вопрос по отчету       0.00      0.00      0.00        15
                                      Вопрос по план/фактам       0.33      0.11      0.17         9
                                  Вопрос по составу команды       0.00      0.00      0.00         5
                                                   Выгрузка       0.00      0.00      0.00         1
                                                    Доступы       0.00      0.00      0.00         3
Запрос на корректировку отчета / некорректная работа отчета       0.20      0.18      0.19        11

                                                   accuracy                           0.2

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [75]:
cv_result =  cross_val_score(model_pipeline, x_train, y_train, cv=5 )

In [76]:
print(cv_result)

[0.24242424 0.27272727 0.12121212 0.18181818 0.25      ]


In [77]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3,4,5,6,7,8],
    'min_samples_split' : [2],
    'min_samples_leaf' : [1]
}

# Создайте конвейер с Grid Search для модели SVC
grid_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=lambda x: tokenizer(x))),
    ("model",
     GridSearchCV(
        DecisionTreeClassifier(),
        param_grid=param_grid,
        cv=5,
        verbose=4
        )
    )
])

In [78]:
grid_pipeline.fit(x_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.242 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.242 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.182 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.212 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.281 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=4, min_samples_leaf=1, min_samples_split=2;, score=0.273 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=4, min_samples_leaf=1, min_samples_split=2;, score=0.242 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=4, min_samples_leaf=1, min_samples_split=2;, score=0.152 total time=   0.0s
[CV 4/5] END criter

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x0000027120204310>)),
                ('model',
                 GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
                              param_grid={'criterion': ['gini', 'entropy'],
                                          'max_depth': [3, 4, 5, 6, 7, 8],
                                          'min_samples_leaf': [1],
                                          'min_samples_split': [2]},
                              verbose=4))])