# Классификатор купит / не купит в течение трех дней

In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta

# Загрузка данных
def load_data():
    catalog = pd.read_parquet('stokman_catalog_preprocessed.pq')
    actions = pd.read_parquet('train_actions.pq')
    vector_mapping = pd.read_parquet('catalog_vector_mapping.pq')
    vectors = np.load('vectors.npz')['arr_0']  # Извлечение эмбеддингов товаров

    return catalog, actions, vector_mapping, vectors

# Предобработка данных
def preprocess_data(actions, catalog, vector_mapping):
    # Преобразование даты
    actions['date'] = pd.to_datetime(actions['date'])
    
    # Присоединение каталога товаров
    actions = actions.explode('products')  # Распаковка массива products
    actions = actions.rename(columns={'products': 'product_id'})
    actions = actions.merge(catalog[['product_id', 'price', 'category_id']], on='product_id', how='left')
    
    # Присоединение векторов товаров
    actions = actions.merge(vector_mapping, on='product_id', how='left')
    
    return actions

if __name__ == "__main__":
    catalog, actions, vector_mapping, vectors = load_data()
    actions = preprocess_data(actions, catalog, vector_mapping)
    actions.to_parquet('actions_preprocessed.pq')

In [2]:
actions.head(5)

Unnamed: 0,user_id,loc_user_id,action,date,product_id,pageId,price,category_id,vector_id
0,6fd49b56-8cc6-11ed-86e0-002590c0647c,c6e357dc-121d-449d-a744-e9a0b56c2380,7,2024-09-07 00:00:04,,2571824865,,,
1,6fd49b56-8cc6-11ed-86e0-002590c0647c,c6e357dc-121d-449d-a744-e9a0b56c2380,7,2024-09-07 00:00:08,,3834364438,,,
2,f9c498ec-5d3b-11ef-86e0-002590c0647c,120c9064-1131-4dc3-8048-44184531b42e,7,2024-09-07 00:00:08,,2448628415,,,
3,59386b5c-e64f-11ec-8086-002590c82437,f0745572-893f-4e50-bc52-5af47badff5a,7,2024-09-07 00:00:11,,3875013967,,,
4,badbd396-6cab-11ef-86e0-002590c0647c,73423d85-d47c-4332-8155-5200615302b5,7,2024-09-07 00:00:11,,3025531174,,,


In [3]:
import pandas as pd

# Генерация временных и взаимодействующих признаков
def generate_features(actions):
    # Время с последнего действия для каждого пользователя
    actions['days_since_last_action'] = actions.groupby('user_id')['date'].diff().dt.days
    
    # Признаки активности за последние 3 дня
    def count_recent_actions(df, days):
        recent = df[df['date'] >= df['date'].max() - pd.Timedelta(days=days)]
        return recent.groupby('user_id')['action'].count()
    
    recent_activity_3d = count_recent_actions(actions, 3)
    
    # Количество покупок, добавлений в корзину и просмотров
    agg_features = actions.groupby('user_id').agg({
        'product_id': 'nunique',  # Количество уникальных товаров
        'price': 'mean',          # Средняя цена товаров
        'action': ['count', lambda x: (x == 5).sum()],  # Количество действий и покупок
        'days_since_last_action': 'min'  # Время с последнего действия
    }).reset_index()
    
    agg_features.columns = ['user_id', 'n_unique_products', 'avg_price', 'n_actions', 'n_orders', 'days_since_last_action']
    
    # Объединение с активностью за 3 дня
    agg_features = agg_features.merge(recent_activity_3d, on='user_id', how='left')
    agg_features = agg_features.rename(columns={'action': 'actions_last_3d'})
    
    return agg_features

if __name__ == "__main__":
    actions = pd.read_parquet('actions_preprocessed.pq')
    user_features = generate_features(actions)
    
    user_features.to_parquet('user_features.pq')

In [4]:
user_features.head(3)

Unnamed: 0,user_id,n_unique_products,avg_price,n_actions,n_orders,days_since_last_action,actions_last_3d
0,/*,6,16620.0,39,0,0.0,
1,0000bdba-5180-11eb-8a53-0cc47a6d2fef,3,13365.777778,29,0,0.0,16.0
2,0000d5dc-78cf-11ef-86e0-002590c0647c,0,,2,0,0.0,


In [5]:
#Сгенерированные фичи для предсказания покупки:
#Средняя цена интересующих товаров - avg_price
#Время с последней активности - days_since_last_action
#Общее количество действий - n_actions
#Количество покупок - n_orders
#Количество уникальных товаров - n_unique_products
#Действия за последние три дня - actions_last_3d

In [6]:
import pandas as pd

#Предобработка отсутствующих значений
def null_data_preprocessing(user_features):
    user_features.fillna(0)
    user_features.to_parquet('user_features.pq')

In [13]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, classification_report, accuracy_score
# import mlflow
# import mlflow.lightgbm

# Загрузка данных
def load_features_data():
    features = pd.read_parquet('user_features.pq')
    return features

# Обучение модели
def train_model(features):
    # Целевая переменная: наличие заказов
    features = features.drop('user_id', axis=1)
    
    X = features.drop(columns=['n_orders'])  # Признаки
    y = (features['n_orders'] > 0).astype(int)  # Купил ли товар
    
    # Разделение на тренировочные и тестовые данные
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_test, label=y_test)
    
    # Параметры модели
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'learning_rate': 0.05,
        'num_leaves': 31,
        'feature_fraction': 0.9
    }
    
    # # Логирование через MLFlow
    # mlflow.lightgbm.autolog()
    
    # with mlflow.start_run():
    model = lgb.train(params, train_data, valid_sets=[valid_data], num_boost_round=100)
    
    # Предсказания и метрики
    y_pred = (model.predict(X_test) > 0.5).astype(int)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    # mlflow.log_metric("precision", precision)
    # mlflow.log_metric("recall", recall)

    print(classification_report(y_test, y_pred))
    print(f'Wirh accuracy: {accuracy_score(y_test, y_pred)}')
    
    return model

if __name__ == "__main__":
    features = load_features_data()
    null_data_preprocessing(features)
    features = load_features_data()

    
    model = train_model(features)
    model.save_model('lgb_model.txt')

[LightGBM] [Info] Number of positive: 2654, number of negative: 372894
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 942
[LightGBM] [Info] Number of data points in the train set: 375548, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.007067 -> initscore=-4.945226
[LightGBM] [Info] Start training from score -4.945226
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     93250
           1       0.51      0.20      0.29       637

    accuracy                           0.99     93887
   macro avg       0.75      0.60      0.64     93887
weighted avg       0.99      0.99      0.99     93887

Wirh accuracy: 0.9932685036267002


# Модель ранжирования

In [17]:
# Загрузка данных
def load_data():
    catalog = pd.read_parquet('stokman_catalog_preprocessed.pq')
    actions = pd.read_parquet('train_actions.pq')
    catalog_mapping  = pd.read_parquet('catalog_vector_mapping.pq')
    
    vectors = np.load('vectors.npz')['arr_0']  # Извлечение эмбеддингов товаров как DataFrame
    vectors = pd.DataFrame(vectors)
    
    return catalog, actions, catalog_mapping , vectors

def preprocess_data(catalog, actions, catalog_mapping , vectors):
    actions['target'] = actions['action'].apply(lambda x: 1 if x == 5 else 0)  # Действие 5 — это заказ
    return actions

if __name__ == "__main__":
    catalog, actions, catalog_mapping, vectors = load_data()
    actions = preprocess_data(catalog, actions, catalog_mapping, vectors)
    actions.to_parquet('actions_preprocessed_ranker.pq')

In [18]:
# Объединение данных о товарах и действиях пользователей
def union(catalog, actions, catalog_mapping , vectors):
    # Соединим эмбеддинги векторов товары и информацию о товарах
    catalog_mapping.set_index('vector_id')
    catalog_mapping = catalog_mapping.merge(vectors, left_index=True, right_index=True)
    catalog_mapping = catalog_mapping.drop('vector_id', axis=1)
    
    # Преобразуем массивы product в train_actions для дальнейшей обработки
    actions = actions.explode('products')
    
    # Присоединим к действиям информацию о товарах
    actions = actions.merge(catalog, left_on='products', right_on='product_id', how='left')

    return actions, catalog_mapping

if __name__ == "__main__":
    actions, catalog_mapping = union(catalog, actions, catalog_mapping, vectors)
    actions.to_parquet('actions_preprocessed_ranker.pq')

In [55]:
# Генерация признаков для каждого пользователя и товара
def future_generation(catalog, actions, catalog_mapping , vectors):
    user_product_features = actions.groupby(['user_id', 'product_id']).agg({
        'action': 'count',  # Количество уникальных действий
        'price': 'mean',  # Средняя цена продукта
        'category_id': 'first'  # Категория продукта
    }).reset_index()
    
    # Примерная генерация фичей на основе действий:
    user_features = actions.groupby('user_id').agg({
        'action': ['count', lambda x: (x == 5).sum()],  # Количество действий и покупок
        'products': 'nunique'  # Количество уникальных просмотренных товаров
    }).reset_index()
    user_features.columns = ['user_id', 'count', 'n_orders', 'nunique']

    train_data = user_product_features.merge(user_features, on='user_id', how='left')

    return train_data

if __name__ == "__main__":
     train_data = future_generation(catalog, actions, catalog_mapping, vectors)
     train_data.to_parquet('ranker_train_data.pq')

In [56]:
train_data = pd.read_parquet('ranker_train_data.pq')
train_data

Unnamed: 0,user_id,product_id,action,price,category_id,count,n_orders,nunique
0,/*,5007406,1,15390.0,1149,39,0,6
1,/*,5342791,1,12590.0,1149,39,0,6
2,/*,5342794,1,7790.0,1149,39,0,6
3,/*,5342797,1,9090.0,1149,39,0,6
4,/*,6778275,2,31290.0,50448,39,0,6
...,...,...,...,...,...,...,...,...
710186,ffff5c80-b082-11ee-86e0-002590c0647c,5460757,2,49990.0,2176,87,0,9
710187,ffff5c80-b082-11ee-86e0-002590c0647c,7067589,1,42600.0,56565,87,0,9
710188,ffff7a16-17f8-11ef-86e0-002590c0647c,867973,1,9020.0,14603,3,0,1
710189,fffff5f6-75e5-11ed-a1c4-002590c82437,1677299,1,3740.0,2094,7,0,2


In [58]:
def user_features_merge(train_data):
    # Указываем целевую переменную (покупка) и признаки
    X = train_data.drop(columns=['n_orders'])
    y = (train_data['n_orders'] > 0).astype(int)
    
    # Разделение данных на обучение и тест
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_val, y_train, y_val
if __name__ == "__main__":
     train_data = pd.read_parquet('ranker_train_data.pq')
     X_train, X_val, y_train, y_val = user_features_merge(train_data)

In [60]:
X_train, X_val, y_train, y_val = user_features_merge(train_data)