In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
nltk.download("stopwords")
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import svm

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yuriy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yuriy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
input_data = pd.read_csv(r"C:\Users\yuriy\Downloads\ecommerce_data_set.csv", sep=",", header=None)

In [3]:
input_data.columns = ['class_type', 'naming']
input_data = input_data.dropna(subset=['naming'])
input_data.head(10)

Unnamed: 0,class_type,naming
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...
5,Household,Pitaara Box Romantic Venice Canvas Painting 6m...
6,Household,Paper Plane Design Starry Night Vangoh Wall Ar...
7,Household,Pitaara Box Romantic Venice Canvas Painting 6m...
8,Household,SAF 'Ganesh Modern Art Print' Painting (Synthe...
9,Household,Paintings Villa UV Textured Modern Art Print F...


In [4]:
x_train, x_test, y_train, y_test = train_test_split(input_data['naming'], input_data['class_type'], test_size=0.3, random_state=1)

In [5]:
def tokenizer(sentence: str, lang: str = 'english'):
    stop_words = stopwords.words(lang)
    snowball = SnowballStemmer(language=lang)
    tokens = word_tokenize(sentence, language=lang)
    tokens = [i for i in tokens if i not in string.punctuation]
    tokens = [i for i in tokens if i not in stop_words]
    tokens = [snowball.stem(i) for i in tokens]
    return tokens

In [7]:
model_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=lambda x: tokenizer(x))),
    ('model', svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto'))
]
)

In [8]:
model_pipeline.fit(x_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x000002388131EF70>)),
                ('model', SVC(gamma='auto', kernel='linear'))])

In [9]:
model_pipeline.predict(['Asus tuf gaming motherboard'])

array(['Electronics'], dtype=object)

In [10]:
precision_score(y_true=y_test, y_pred=model_pipeline.predict(x_test), average='micro', pos_label='positive')



0.9742199894235855

In [11]:
print(classification_report(y_test, model_pipeline.predict(x_test)))

                        precision    recall  f1-score   support

                 Books       0.98      0.96      0.97      3460
Clothing & Accessories       0.98      0.98      0.98      2562
           Electronics       0.97      0.96      0.97      3168
             Household       0.97      0.98      0.98      5938

              accuracy                           0.97     15128
             macro avg       0.98      0.97      0.97     15128
          weighted avg       0.97      0.97      0.97     15128



In [12]:
cv_result =  cross_val_score(model_pipeline, x_train, y_train, cv=5 )

In [13]:
print(cv_result)

[0.97422096 0.97407565 0.97138405 0.97294234 0.96883411]


### Обучаем модель на наших данных

In [14]:
input_data2 = pd.read_csv(r"C:\Users\yuriy\Downloads\duty_stats_raw_data.csv", sep=",")

In [15]:
input_data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   index          235 non-null    int64 
 1   user_name      233 non-null    object
 2   user_group_id  235 non-null    object
 3   message        235 non-null    object
 4   Task type      235 non-null    object
 5   Task subtype   235 non-null    object
 6   Resolution     235 non-null    object
 7   event_date     235 non-null    object
 8   task_status    235 non-null    object
dtypes: int64(1), object(8)
memory usage: 16.6+ KB


In [16]:
input_data2.head(10)

Unnamed: 0,index,user_name,user_group_id,message,Task type,Task subtype,Resolution,event_date,task_status
0,241,rostepanov,ASD Project,Привет!\n в отчете [Аквизишн Овервью](https://...,Вопрос по данным,Вопрос по логике отчета,Ответ,2023-06-19,done
1,408,epkuznetsova,ASD Project,Привет!\n Сейчас тестируем в команде Хантеры c...,Запрос на корректировку отчета / некорректная ...,Вопрос о возможности доработки отчета,Доработка не реальна на момент обращения,2023-08-17,done
2,237,assementsov,Analytic,"Гриша, привет\n подскажи пожалуйста, считаем л...",Вопрос по данным,Вопрос по данным битрикса,Ответ,2023-06-16,done
3,390,rostepanov,ASD Project,"Коллеги, привет!\n Отчет [Аквизишн овервью](ht...",Вопрос по данным,Вопрос по логике отчета,Ответ,2023-08-14,done
4,240,aadenisova,Analytic,"Коллеги, привет! \n Подскажите, где в витринах...",Вопрос по данным,Вопрос по логике витрины,Ответ,2023-06-16,done
5,300,gvponomarev,Analytic,"@iashmelev , @inkaraivanova\n \n > У нас с Иро...",Вопрос по данным,Вопрос по логике витрины,Ответ,2023-07-10,done
6,281,vagarkavaya,Analytic,Привет! Помогите пожалуйста разобраться с флаг...,Вопрос по данным,Вопрос по логике витрины,Ответ,2023-06-30,done
7,434,aazakharov,ASD KAM,"Коллеги, привет.\n У нашего клиента - [6168966...",Вопрос по отчету,Вопрос по логике отчета,Ответ,2023-08-28,done
8,298,asazarin,ASD KAM,"Коллеги, доброе утро.\n \n Сводный отчёт - htt...",Вопрос по отчету,Вопрос по логике отчета,Ответ,2023-07-10,done
9,372,vvdergilev,Analytic,Привет! Подскажите откуда лучше всего взять те...,Вопрос по данным,Вопрос по наличию данных,Ответ,2023-08-07,done


In [17]:
input_data2 = input_data2[['Task type', 'message']]

In [18]:
x_train, x_test, y_train, y_test = train_test_split(input_data2['message'], input_data2['Task type'], test_size=0.3, random_state=1)

In [68]:
model_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=lambda x: tokenizer(x))),
    ('model', svm.SVC(C=1.0, kernel='sigmoid', degree=3, gamma=10))
]
)

In [69]:
model_pipeline.fit(x_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x000002388341D430>)),
                ('model', SVC(gamma=10, kernel='sigmoid'))])

In [70]:
print(classification_report(y_test, model_pipeline.predict(x_test)))

                                                             precision    recall  f1-score   support

                                                    Forward       0.21      0.33      0.26        15
                                           Вопрос по данным       0.41      0.58      0.48        12
                                           Вопрос по отчету       0.50      0.13      0.21        15
                                      Вопрос по план/фактам       0.33      0.33      0.33         9
                                  Вопрос по составу команды       0.67      0.40      0.50         5
                                                   Выгрузка       0.00      0.00      0.00         1
                                                    Доступы       0.12      0.33      0.18         3
Запрос на корректировку отчета / некорректная работа отчета       0.17      0.09      0.12        11

                                                   accuracy                           0.3

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [71]:
cv_result =  cross_val_score(model_pipeline, x_train, y_train, cv=5 )

In [72]:
print(cv_result)

[0.36363636 0.33333333 0.36363636 0.24242424 0.3125    ]


In [66]:
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': [0.1, 1, 10]
}

# Создайте конвейер с Grid Search для модели SVC
grid_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=lambda x: tokenizer(x))),
    ("model",
     GridSearchCV(
        svm.SVC(),
        param_grid=param_grid,
        cv=3,
        verbose=4
        )
    )
])

In [67]:
grid_pipeline.fit(x_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV 1/3] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.200 total time=   0.0s
[CV 2/3] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.200 total time=   0.0s
[CV 3/3] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.204 total time=   0.0s
[CV 1/3] END .....C=0.1, gamma=0.1, kernel=poly;, score=0.200 total time=   0.0s
[CV 2/3] END .....C=0.1, gamma=0.1, kernel=poly;, score=0.200 total time=   0.0s
[CV 3/3] END .....C=0.1, gamma=0.1, kernel=poly;, score=0.204 total time=   0.0s
[CV 1/3] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.200 total time=   0.0s
[CV 2/3] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.200 total time=   0.0s
[CV 3/3] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.204 total time=   0.0s
[CV 1/3] END ..C=0.1, gamma=0.1, kernel=sigmoid;, score=0.200 total time=   0.0s
[CV 2/3] END ..C=0.1, gamma=0.1, kernel=sigmoid;, score=0.200 total time=   0.0s
[CV 3/3] END ..C=0.1, gamma=0.1, kernel=sigmoid

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x000002388341D160>)),
                ('model',
                 GridSearchCV(cv=3, estimator=SVC(),
                              param_grid={'C': [0.1, 1, 10],
                                          'gamma': [0.1, 1, 10],
                                          'kernel': ['linear', 'poly', 'rbf',
                                                     'sigmoid']},
                              verbose=4))])