In [1]:
import pandas as pd
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt


In [18]:
from joblib import Memory

# Считываем данные

In [2]:
# Load the raw table and normalise its header: trimmed, lower-case, snake_case.
df = pd.read_csv('pricerunner_aggregate.csv')
df.columns = [name.strip().lower().replace(' ', '_') for name in df.columns]
# Keep only the columns used in the modelling cells below.
data = df.loc[:, ['product_title', 'merchant_id', 'cluster_label', 'category_id']]

In [20]:
# Trim surrounding whitespace from every column name of the working frame.
data.columns = data.columns.map(str.strip)

In [21]:
# Count the null entries in each column of the full frame and display the result.
missing_values = df.isna().sum()
missing_values

product_id        0
product_title     0
merchant_id       0
cluster_id        0
cluster_label     0
category_id       0
category_label    0
dtype: int64

In [22]:
# Preview the first five rows (five is the default for head()).
data.head()

Unnamed: 0,product_title,merchant_id,cluster_label,category_id
0,apple iphone 8 plus 64gb silver,1,Apple iPhone 8 Plus 64GB,2612
1,apple iphone 8 plus 64 gb spacegrau,2,Apple iPhone 8 Plus 64GB,2612
2,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,3,Apple iPhone 8 Plus 64GB,2612
3,apple iphone 8 plus 64gb space grey,4,Apple iPhone 8 Plus 64GB,2612
4,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,5,Apple iPhone 8 Plus 64GB,2612


In [28]:
# Number of rows per merchant, most frequent first.
data.loc[:, 'merchant_id'].value_counts()

3      2547
6      1591
298    1523
31     1350
119    1239
       ... 
37        1
200       1
296       1
127       1
371       1
Name: merchant_id, Length: 306, dtype: int64

# Обучаем модель catboost

In [23]:
# Columns that CatBoost should treat as free text.
text_features = ['product_title', 'cluster_label']

# Features are every column except the target; the target is `category_id`.
X = data.drop('category_id', axis=1)
y = data['category_id']

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Wrap both splits in Pool objects so the text columns are declared explicitly.
train_pool = Pool(data=X_train, label=y_train, text_features=text_features)
test_pool = Pool(data=X_test, label=y_test, text_features=text_features)

# Multi-class CatBoost classifier.
model_cat_boost = CatBoostClassifier(iterations=500, depth=10, learning_rate=0.05, loss_function='MultiClass', random_seed=42)

# Train, logging progress every 50 iterations.
# NOTE(review): the test split is also passed as eval_set; if it influences
# which iteration is kept, the test metrics below are optimistic — consider a
# separate validation split.
model_cat_boost.fit(train_pool, eval_set=test_pool, verbose=50)

# Predict with the model trained above. This cell previously never assigned
# `y_pred`, so on a fresh kernel it raised NameError, and in an out-of-order
# session it silently scored predictions left over from another model.
# ravel() flattens the result in case predict() returns a column-shaped array.
y_pred = model_cat_boost.predict(test_pool).ravel()

# Accuracy plus weighted-average precision / recall / F1 on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Test Accuracy: {accuracy}')
print(f'Test Precision: {precision}')
print(f'Test Recall: {recall}')
print(f'Test F1 Score: {f1}')

0:	learn: 1.9294575	test: 1.9245807	best: 1.9245807 (0)	total: 16.5s	remaining: 2h 17m 4s
50:	learn: 0.2052650	test: 0.1922149	best: 0.1922149 (50)	total: 15m 39s	remaining: 2h 17m 52s
100:	learn: 0.1148578	test: 0.1117710	best: 0.1117710 (100)	total: 31m 33s	remaining: 2h 4m 40s
150:	learn: 0.0952440	test: 0.0957286	best: 0.0957286 (150)	total: 46m 12s	remaining: 1h 46m 48s
200:	learn: 0.0857362	test: 0.0890736	best: 0.0890736 (200)	total: 1h 2m 16s	remaining: 1h 32m 37s
250:	learn: 0.0783711	test: 0.0843476	best: 0.0843476 (250)	total: 1h 15m 38s	remaining: 1h 15m 2s
300:	learn: 0.0727188	test: 0.0816074	best: 0.0816074 (300)	total: 1h 27m 40s	remaining: 57m 57s
350:	learn: 0.0679188	test: 0.0798171	best: 0.0798145 (349)	total: 1h 40m 9s	remaining: 42m 31s
400:	learn: 0.0644566	test: 0.0784980	best: 0.0784980 (400)	total: 1h 52m 46s	remaining: 27m 50s
450:	learn: 0.0613968	test: 0.0776256	best: 0.0776256 (450)	total: 2h 5m 27s	remaining: 13m 37s
499:	learn: 0.0588266	test: 0.0769733	

In [24]:
# Persist the trained CatBoost model to disk.
# `pickle` is imported here because no earlier cell imports it; without this
# the cell raises NameError on a fresh kernel (Restart & Run All).
import pickle

with open('model_cat_boost.pkl', 'wb') as model_file:
    pickle.dump(model_cat_boost, model_file)

# Подготовка данных для других моделей

In [3]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

Предобработка текстовых данных


In [6]:
# Fetch the NLTK 'punkt' tokenizer data only when it is not already installed,
# so re-running the notebook does not hit the network on every execution.
# NOTE(review): the only use of word_tokenize in the visible cells is commented
# out — confirm whether this resource is still needed at all.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
# nltk.download('stopwords')

# Stop-word loading is currently disabled:
# stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lower-case ``text`` and delete every stand-alone run of digits.

    Only whole digit tokens (delimited by word boundaries) are removed, so
    "8" disappears while "64gb" is kept. Whitespace around a removed token is
    left untouched. Stop-word removal is not performed.
    """
    lowered = text.lower()
    return re.sub(r'\b\d+\b', '', lowered)

# Work on a copy so the original `data` frame stays untouched.
data_processed = data.copy()

# Run the text cleaner over both free-text columns.
for text_col in ('product_title', 'cluster_label'):
    data_processed[text_col] = data_processed[text_col].apply(preprocess_text)
data_processed


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1129)>


Unnamed: 0,product_title,merchant_id,cluster_label,category_id
0,apple iphone plus 64gb silver,1,apple iphone plus 64gb,2612
1,apple iphone plus gb spacegrau,2,apple iphone plus 64gb,2612
2,apple mq8n2b/a iphone plus 64gb . 12mp sim fr...,3,apple iphone plus 64gb,2612
3,apple iphone plus 64gb space grey,4,apple iphone plus 64gb,2612
4,apple iphone plus gold . 64gb 4g unlocked sim...,5,apple iphone plus 64gb,2612
...,...,...,...,...
35306,smeg fab28 60cm retro style right hand hinge f...,59,smeg fab28 cream,2623
35307,smeg fab28 60cm retro style left hand hinge fr...,59,smeg fab28 red,2623
35308,smeg fab28 60cm retro style left hand hinge fr...,59,smeg fab28 pink,2623
35309,candy 60cm built under larder fridge cru160nek,125,candy cru16.,2623


In [7]:
# Columns holding free text.
text_features = ['product_title', 'cluster_label']
# Join the text columns of each row into one string, separated by a space.
data_processed['text_features'] = data_processed[text_features].apply(
    lambda row: ' '.join(row), axis=1
)

# TF-IDF over the combined text, capped at 5000 terms, materialised as a dense array.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so held-out rows influence the vocabulary and IDF weights — confirm
# this is acceptable for the reported metrics.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(data_processed['text_features'])
data_product = tfidf_matrix.toarray()

In [11]:
import numpy as np

In [12]:
# One-hot encode the merchant identifier.
dummies = pd.get_dummies(data_processed['merchant_id'])

# Append the one-hot columns to the right of the dense TF-IDF array
# (`data_product`), giving one combined feature matrix.
combined_data = np.concatenate((data_product, dummies.to_numpy()), axis=1)

In [13]:
# The `category_id` column is the classification target.
y_label = data_processed.loc[:, 'category_id']

In [14]:
# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    combined_data,
    y_label,
    random_state=42,
    test_size=0.2,
)

# Build the logistic-regression classifier and fit it on the training portion.
logreg_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Predict categories for the held-out rows.
y_pred = logreg_model.predict(X_test)


In [15]:
# Weighted-average precision, recall and F1 for the current `y_pred`.
precision, recall, f1 = (
    score_fn(y_test, y_pred, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

# Report each score to four decimal places.
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1:.4f}')

Precision: 0.9695
Recall: 0.9694
F1-Score: 0.9694


In [60]:
# from joblib import Memory
import pickle

In [22]:
# Serialise the fitted logistic-regression model to a pickle file.
with open('logreg_model.pkl', 'wb') as out_file:
    pickle.dump(logreg_model, out_file)

# Модель SVM


In [16]:
from sklearn.svm import SVC

In [17]:
# Build the support-vector classifier (seeded) and fit it on the training split.
svm_model = SVC(random_state=42).fit(X_train, y_train)

# Predict categories for the held-out rows; this overwrites the earlier `y_pred`.
y_pred = svm_model.predict(X_test)


In [18]:

# All three scores use the same weighted averaging across classes.
weighted_avg = {'average': 'weighted'}
precision = precision_score(y_test, y_pred, **weighted_avg)
recall = recall_score(y_test, y_pred, **weighted_avg)
f1 = f1_score(y_test, y_pred, **weighted_avg)

# Report each score to four decimal places.
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1:.4f}')

Precision: 0.9637
Recall: 0.9633
F1-Score: 0.9634


In [20]:
import pickle

In [21]:
# Serialise the fitted SVM model to a pickle file.
with open('svm_model.pkl', 'wb') as out_file:
    pickle.dump(svm_model, out_file)