In [90]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")   

---
### Произведем объединение первичных данных

In [91]:
# Загрузка данных
def load_data():
    customers = pd.read_csv("./clean_data/customers.csv")
    geolocation = pd.read_csv("./clean_data/geolocation.csv")
    order_pay = pd.read_csv("./clean_data/order_payments.csv")
    reviews = pd.read_csv("./clean_data/order_reviews.csv")
    orders = pd.read_csv("./clean_data/orders.csv")
    item = pd.read_csv("./clean_data/orders_items.csv")
    category_name = pd.read_csv(
        "./clean_data/product_category_name_translation.csv")
    products = pd.read_csv("./clean_data/products.csv")
    sellers = pd.read_csv("./clean_data/sellers.csv")
    return customers, geolocation, order_pay, reviews, orders, item, category_name, products, sellers


In [92]:
# Объединение данных
def merge_data(orders, item, order_pay, reviews, products, customers, sellers, category_name):
    df = orders.merge(item, on='order_id', how='left')
    df = df.merge(order_pay, on='order_id', how='outer', validate='m:m')
    df = df.merge(reviews, on='order_id', how='outer')
    df = df.merge(products, on='product_id', how='outer')
    df = df.merge(customers, on='customer_id', how='outer')
    df = df.merge(sellers, on='seller_id', how='outer')
    df = df.merge(category_name, on="product_category_name", how="left")
    return df

In [93]:
# Очистка данных: удаление строк без customer_unique_id
def filter_customers(df):
    return df[~df["customer_unique_id"].isna()]

In [94]:
# Главная функция пайплайна
def main_pipeline():
    # Шаг 1: Загрузка данных
    customers, geolocation, order_pay, reviews, orders, item, category_name, products, sellers = load_data()

    # Шаг 2: Объединение данных
    df = merge_data(orders, item, order_pay, reviews, products,
                    customers, sellers, category_name)

    # Шаг 3: Фильтрация данных
    df = filter_customers(df)

    final_data = df

    return final_data

In [95]:
data = main_pipeline()

In [96]:
labels = pd.read_csv("./labels/rfm.csv")
labels.head()

Unnamed: 0,customer_unique_id,R_rank,F_rank,M_rank,RFM_Weighted,Churn_Risk
0,0000366f3b9a7992bf8c76cfdf3221e2,4,1,1,2.5,2
1,0000b849f77a49e4a4ce2b2a4ca5be3f,3,1,1,2.0,2
2,0000f46a3911fa3c0805444483337064,1,1,1,1.0,3
3,0000f6ccb0745a6a4b88665a16c9f078,2,1,1,1.5,3
4,0004aac84e0df4da2b147fca70cf8255,2,1,1,1.5,3


In [97]:
labels = labels[["customer_unique_id", "Churn_Risk"]]
labels.head()

Unnamed: 0,customer_unique_id,Churn_Risk
0,0000366f3b9a7992bf8c76cfdf3221e2,2
1,0000b849f77a49e4a4ce2b2a4ca5be3f,2
2,0000f46a3911fa3c0805444483337064,3
3,0000f6ccb0745a6a4b88665a16c9f078,3
4,0004aac84e0df4da2b147fca70cf8255,3


---
### Сформируем портрет клиента на основе подхода который мы использовали для KMeans, но с учетом тех фитов которые мы можем поймать даже при первой покупке клиента, т.е. группа товаров, город, цена товара, оценка товара, что-то, что позволит нам идентифицировать клиента гораздо заранее с определенной долей вероятности

In [98]:
# Сформируем информацию по количеству покупок
orderer_by_client = data.groupby(["customer_unique_id",])[
    ["order_id"]].count().reset_index().rename(columns={"order_id":"num_orders"})

orderer_by_client

Unnamed: 0,customer_unique_id,num_orders
0,0000366f3b9a7992bf8c76cfdf3221e2,1
1,0000b849f77a49e4a4ce2b2a4ca5be3f,1
2,0000f46a3911fa3c0805444483337064,1
3,0000f6ccb0745a6a4b88665a16c9f078,1
4,0004aac84e0df4da2b147fca70cf8255,1
...,...,...
96091,fffcf5a5ff07b0908bd4e2dbc735a684,2
96092,fffea47cd6d3cc0a88bd621562a9d061,1
96093,ffff371b4d645b6ecea244b27531430a,1
96094,ffff5962728ec6157033ef9805bacc48,1


In [99]:
clients_city = data[["customer_unique_id", "customer_city"]]
clients_city = clients_city.drop_duplicates()
clients_city

Unnamed: 0,customer_unique_id,customer_city
0,3c7e305796add66698959fc7ad176f6b,umuarama
1,9de5797cddb92598755a0f76383ddbbb,entre rios de minas
2,9915eb9f74b6c11aaf04833f65b00e93,paracatu
3,dce323533e45e74d215e0fe7fb114118,sao goncalo
5,acfca8c3549ceceba9e125afc0349610,curitiba
...,...,...
118805,1479d41bbd302e37d1316c996c1f55ae,uberlandia
118806,587b326ba3bf8aa4d3e50fb1f38ea79f,cotia
118808,1942b890cee1b55dbf8176e925e79e07,porto alegre
118809,0f21adf44f13a61282678a89f6433c10,salvador


In [100]:
payments_ratings = data.groupby("customer_unique_id")[
    ["payment_value", "payment_installments", "price", "review_score"]].agg(["mean"]).reset_index()
payments_ratings.columns = ['_'.join(col).strip('_') for col in payments_ratings.columns.values]
payments_ratings

Unnamed: 0,customer_unique_id,payment_value_mean,payment_installments_mean,price_mean,review_score_mean
0,0000366f3b9a7992bf8c76cfdf3221e2,141.90,8.0,129.90,5.0
1,0000b849f77a49e4a4ce2b2a4ca5be3f,27.19,1.0,18.90,4.0
2,0000f46a3911fa3c0805444483337064,86.22,8.0,69.00,3.0
3,0000f6ccb0745a6a4b88665a16c9f078,43.62,4.0,25.99,4.0
4,0004aac84e0df4da2b147fca70cf8255,196.89,6.0,180.00,5.0
...,...,...,...,...,...
96091,fffcf5a5ff07b0908bd4e2dbc735a684,2067.42,10.0,785.00,5.0
96092,fffea47cd6d3cc0a88bd621562a9d061,84.58,1.0,64.89,4.0
96093,ffff371b4d645b6ecea244b27531430a,112.46,1.0,89.90,5.0
96094,ffff5962728ec6157033ef9805bacc48,133.69,5.0,115.00,5.0


In [101]:
# Сформируем информацию по модальной категории продуктов в разрезе пользователей
def calculate_mode(series):
    return series.mode().iloc[0] if not series.mode().empty else None

mode_category = (
    data.groupby("customer_unique_id")["product_category_name_english"]
    .agg(calculate_mode)
    .reset_index()
)

mode_category.columns = ["customer_unique_id", "most_frequent_product_category"]

mode_category


Unnamed: 0,customer_unique_id,most_frequent_product_category
0,0000366f3b9a7992bf8c76cfdf3221e2,bed_bath_table
1,0000b849f77a49e4a4ce2b2a4ca5be3f,health_beauty
2,0000f46a3911fa3c0805444483337064,stationery
3,0000f6ccb0745a6a4b88665a16c9f078,telephony
4,0004aac84e0df4da2b147fca70cf8255,telephony
...,...,...
96091,fffcf5a5ff07b0908bd4e2dbc735a684,health_beauty
96092,fffea47cd6d3cc0a88bd621562a9d061,baby
96093,ffff371b4d645b6ecea244b27531430a,auto
96094,ffff5962728ec6157033ef9805bacc48,watches_gifts


In [102]:
clients_data_1 = orderer_by_client.merge(
    payments_ratings, on="customer_unique_id")
clients_data_2 = clients_city.merge(mode_category, on="customer_unique_id")
clients_data = clients_data_1.merge(
    clients_data_2, on="customer_unique_id")

# ОбЪединяем с лейблами
clients_data_labeled = labels.merge(clients_data, on="customer_unique_id")
clients_data_labeled.head()

Unnamed: 0,customer_unique_id,Churn_Risk,num_orders,payment_value_mean,payment_installments_mean,price_mean,review_score_mean,customer_city,most_frequent_product_category
0,0000366f3b9a7992bf8c76cfdf3221e2,2,1,141.9,8.0,129.9,5.0,cajamar,bed_bath_table
1,0000b849f77a49e4a4ce2b2a4ca5be3f,2,1,27.19,1.0,18.9,4.0,osasco,health_beauty
2,0000f46a3911fa3c0805444483337064,3,1,86.22,8.0,69.0,3.0,sao jose,stationery
3,0000f6ccb0745a6a4b88665a16c9f078,3,1,43.62,4.0,25.99,4.0,belem,telephony
4,0004aac84e0df4da2b147fca70cf8255,3,1,196.89,6.0,180.0,5.0,sorocaba,telephony


In [126]:
# Кодируем и центрируем что нужно
label_encoder_city = LabelEncoder()
label_encoder_cat = LabelEncoder()
scaler = StandardScaler()

num_columns = ["num_orders", "payment_value_mean", "payment_installments_mean",
               "price_mean", "review_score_mean"]
cat_columns = ["customer_city", "most_frequent_product_category"]


# Разделение на X и y
X = clients_data_labeled[num_columns + cat_columns]
y = clients_data_labeled["Churn_Risk"]

X_scaled = scaler.fit_transform(X[num_columns])
X_city = label_encoder_city.fit_transform(X["customer_city"])
X_category = label_encoder_cat.fit_transform(
    X["most_frequent_product_category"])

X_joined = pd.concat([pd.DataFrame(X_scaled, columns=num_columns), pd.DataFrame(
    X_city, columns=["customer_city"]), pd.DataFrame(X_category, columns=["most_frequent_product_category"])], axis=1)
X_joined

Unnamed: 0,num_orders,payment_value_mean,payment_installments_mean,price_mean,review_score_mean,customer_city,most_frequent_product_category
0,-0.238175,-0.073232,1.904613,0.021411,0.661303,655,7
1,-0.238175,-0.601906,-0.710186,-0.561384,-0.119666,2594,43
2,-0.238175,-0.329849,1.904613,-0.298339,-0.900635,3520,66
3,-0.238175,-0.526184,0.410442,-0.524159,-0.119666,448,68
4,-0.238175,0.180205,1.157528,0.284456,0.661303,3758,68
...,...,...,...,...,...,...,...
96214,0.918720,8.801079,2.651699,3.460948,0.661303,3255,43
96215,-0.238175,-0.337408,-0.710186,-0.319918,-0.119666,1324,6
96216,-0.238175,-0.208915,-0.710186,-0.188605,0.661303,3740,5
96217,-0.238175,-0.111070,0.783985,-0.056820,0.661303,516,70


In [127]:
y.value_counts()

Churn_Risk
2    46231
3    42189
1     7799
Name: count, dtype: int64

In [128]:
X_train, X_test, y_train, y_test = train_test_split(
    X_joined, y, test_size=0.18, stratify=y, random_state=42)

---
### Пробуем лес как базу

In [129]:
rf_classif = RandomForestClassifier(n_jobs=-1, random_state=42)
rf_classif.fit(X_train, y_train)

y_pred = rf_classif.predict(X_test)
y_test_pred = rf_classif.predict(X_train)

In [130]:
accuracy_score(y_test, y_pred)

0.6733256351039261

In [131]:
accuracy_score(y_train, y_test_pred)

0.9976172068087048

> Лес переобучился

---

In [132]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# 1. Определение архитектуры нейронной сети
class SimpleClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleClassifier, self).__init__()
        # Первый полносвязный слой
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.2)  # Добавляем Dropout для регуляризации

        # Второй полносвязный слой
        self.fc2 = nn.Linear(hidden_size, hidden_size // 2)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.2)  # Еще один Dropout

        # Третий полносвязный слой
        self.fc3 = nn.Linear(hidden_size // 2, hidden_size // 4)
        self.relu3 = nn.ReLU()

        # Четвертый полносвязный слой (выходной)
        self.fc4 = nn.Linear(hidden_size // 4, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.dropout1(out)

        out = self.fc2(out)
        out = self.relu2(out)
        out = self.dropout2(out)

        out = self.fc3(out)
        out = self.relu3(out)

        out = self.fc4(out)
        return out

In [133]:
# 2. Подготовка данных
input_size = X_joined.shape[1]  # Размер входных данны
hidden_size = 16  # Увеличиваем количество нейронов в скрытом слое
num_classes = 3  # Количество классов для классификации

In [134]:
# Преобразуем данные в тензоры
X_train_tensor = torch.tensor(
    X_train.values, dtype=torch.float32)  # Преобразуем в float32
# Метки классов должны быть целочисленными (long)
y_train_tensor = torch.tensor(y_train.values).long()

# Создание Dataset и DataLoader
dataset = TensorDataset(X_train_tensor, y_train_tensor)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# 3. Перевод модели и данных на устройство mps
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

model = SimpleClassifier(input_size, hidden_size, num_classes).to(device)

# Инициализация функции потерь и оптимизатора

# Вычисление весов для каждого класса
class_counts = [7799, 46231, 42189]
total_samples = sum(class_counts)
weights = torch.tensor([total_samples / count for count in class_counts])

# Нормализация весов
weights = weights / weights.sum()
weights = torch.tensor(weights).to(device)

# Использование весов в функции потерь
criterion = nn.CrossEntropyLoss(weight=weights)

optimizer = optim.Adam(model.parameters(), lr=0.01)  # Оптимизатор Adam

Using device: mps


In [135]:
# 4. Обучение модели
num_epochs = 5
for epoch in range(num_epochs):
    for batch_X, batch_y in dataloader:
        # Перемещение батча на устройство
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass и оптимизация
        optimizer.zero_grad()  # Обнуление градиентов
        loss.backward()  # Вычисление градиентов
        optimizer.step()  # Обновление весов

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

Epoch [1/5], Loss: 0.1256
Epoch [2/5], Loss: 0.1260
Epoch [3/5], Loss: 1.0556
Epoch [4/5], Loss: 0.1230
Epoch [5/5], Loss: 0.1571


In [119]:
# Сохранение модели
torch.save(model.state_dict(), "../services/model_classif.pth")
print("Модель успешно сохранена.")

Модель успешно сохранена.


In [120]:
# Загрузка модели
model = SimpleClassifier(input_size, hidden_size, num_classes).to(device)
model.load_state_dict(torch.load("../services/model_classif.pth"))
print("Модель успешно загружена.")

Модель успешно загружена.


In [136]:
# Подготовка тестовых данных
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(device)

# Предсказание
model.eval()  # Убедитесь, что модель находится в режиме оценки
with torch.no_grad():  # Отключаем вычисление градиентов
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)  # Получаем индексы максимальных значений (классы)

print("Предсказанные классы:", predicted.cpu().numpy())

Предсказанные классы: [2 2 2 ... 2 2 2]


In [137]:
# Истинные метки
y_test_tensor = torch.tensor(y_test.values).long()

# Оценка точности
accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.4805


In [138]:
pd.Series(predicted.cpu().numpy()).value_counts()

2    17320
Name: count, dtype: int64