In [None]:
import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.datasets import make_moons, load_iris, load_diabetes

import matplotlib.pyplot as plt

import joblib
import torch
import kagglehub
import pickle

In [5]:
def train_test_plot(X_train, y_train, X_test, y_test, title1="Train", title2="Test"):
    plt.figure(figsize=(16, 7))

    ax = plt.subplot(121)
    plt.title(title1)
    plt.xlabel("$x_1$")
    plt.ylabel("$x_2$")
    _ = plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap="Set3", ec='black', alpha=0.85, lw=0.5)

    plt.subplot(122, sharex=ax, sharey=ax)
    plt.title(title2)
    plt.xlabel("$x_1$")
    # Если классификатор предсказал вероятность принадлежности первому классу >= 0.5, считаем объект принадлежащим классу 1
    # в противном случае - 0
    _ = plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap="Set3", ec='black', alpha=0.85, lw=0.5)

def plot_decision_boundary_standart(model, X, y):

    margin_x1 = (X[:, 0].max() - X[:, 0].min()) * 0.1
    margin_x2 = (X[:, 1].max() - X[:, 1].min()) * 0.1

    x_min, x_max = X[:, 0].min() - margin_x1, X[:, 0].max() + margin_x1
    y_min, y_max = X[:, 1].min() - margin_x2, X[:, 1].max() + margin_x2
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 501), np.linspace(y_min, y_max, 501))

    X_to_pred_on = torch.from_numpy(np.column_stack((xx.ravel(), yy.ravel()))).float()

    y_pred = model.predict(X_to_pred_on)

    # Reshape preds and plot
    y_pred = y_pred.reshape(xx.shape)
    plt.contourf(xx, yy, y_pred, cmap="Set3", alpha=0.5)
    plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap="Set3", ec='black', alpha=1, lw=0.5)

    plt.xlabel("$x_0$")
    plt.ylabel("$x_1$")

    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())

def plot_decision_boundary_net(model: torch.nn.Module, X: torch.Tensor, y: torch.Tensor):
    """Plots decision boundaries of model predicting on X in comparison to y."""
    model.to("cpu")
    X, y = X.to("cpu"), y.to("cpu")

    margin_x1 = (X[:, 0].max() - X[:, 0].min()) * 0.1
    margin_x2 = (X[:, 1].max() - X[:, 1].min()) * 0.1

    x_min, x_max = X[:, 0].min() - margin_x1, X[:, 0].max() + margin_x1
    y_min, y_max = X[:, 1].min() - margin_x2, X[:, 1].max() + margin_x2
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 501), np.linspace(y_min, y_max, 501))

    X_to_pred_on = torch.from_numpy(np.column_stack((xx.ravel(), yy.ravel()))).float()

    model.eval()
    with torch.inference_mode():
        y_logits = model(X_to_pred_on)

    if len(torch.unique(y)) > 2:
        y_pred = torch.softmax(y_logits, dim=1).argmax(dim=1)
    else:
        y_pred = torch.round(torch.sigmoid(y_logits))

    # Reshape preds and plot
    y_pred = y_pred.reshape(xx.shape).detach().numpy()
    plt.contourf(xx, yy, y_pred, cmap="Set3", alpha=0.5)
    plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap="Set3", ec='black', alpha=1, lw=0.5)

    plt.xlabel("$x_0$")
    plt.ylabel("$x_1$")

    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())

def train_test_model_plot(model, X_train, y_train, X_test, y_test, net=False):
    plt.figure(figsize=(16, 7))

    plt.subplot(1, 2, 1)
    plt.title("Train")
    if net:
        plot_decision_boundary_net(model, X_train, y_train)
    else:
        plot_decision_boundary_standart(model, X_train, y_train)

    plt.subplot(1, 2, 2)
    plt.title("Test")
    if net:
        plot_decision_boundary_net(model, X_test, y_test)
    else:
        plot_decision_boundary_standart(model, X_test, y_test)

In [6]:
with open("Amazon_products.pkl", 'rb') as f:
    dt = pickle.load(f)

dt

Unnamed: 0,asin,title,price,list_price,rating,reviews,sold_past_month,is_bestseller,is_prime,is_amazon_choice,has_sustainability_features,available_offers,amazon_choice_type,brand,free_delivery_date,fastest_delivery_date
0,B0DJK7NW1J,"15.6 Inch Laptops, Windows 11 Laptop Computers...",199.98,679.99,4.4 out of 5 stars,48,100+,False,False,False,False,,,,,
1,B0BS4BP8FB,Acer Aspire 3 A315-24P-R7VH Slim Laptop | 15.6...,279.99,321.99,4.4 out of 5 stars,39243,8K+,False,False,True,False,6,Overall Pick,,,
2,B0DKDY78K3,"Newest Gaming Laptop, Laptop with AMD Ryzen 7 ...",649.99,1699.99,4.8 out of 5 stars,18,100+,False,False,False,False,,,,,
3,B0CPL25J3W,"HP Portable Laptop, Student and Business, 14"" ...",197.35,269.00,4.1 out of 5 stars,1678,1K+,False,False,False,False,25,,,,
4,B0947BJ67M,"HP 14 Laptop, Intel Celeron N4020, 4 GB RAM, 6...",176.00,209.99,4.0 out of 5 stars,1861,4K+,False,False,False,False,50,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1237,B0B5GRGCX5,"Lenovo L15 Portable Monitor, 15.6” Display, Fu...",162.00,180.44,4.3 out of 5 stars,46,500+,False,False,False,False,28,,,,
1238,B0CH9XW8RK,ARZOPA Portable Monitor 15.6'' FHD 1080P - Ult...,85.99,,4.4 out of 5 stars,1092,3K+,False,True,False,False,4,,,"Tue, Nov 19",
1239,B0CY896H5G,"Laptop Screen Extender 14"" 1080P FHD IPS, Port...",299.99,,4.4 out of 5 stars,422,1K+,False,True,False,False,3,,,"Tue, Nov 19",
1240,B0C77WJ6F5,Yodoit Portable Monitor for Laptop 1920×1080 1...,49.99,69.99,4.0 out of 5 stars,416,1K+,False,True,False,False,,,,"Tue, Nov 19",


In [73]:
X = dt[["price", "list_price", "rating", "reviews", "sold_past_month"]]
y = dt["is_bestseller"]
all_dt = dt[["price", "list_price", "rating", "reviews", "sold_past_month", "is_bestseller"]]
#"is_prime", "is_amazon_choice", "has_sustainability_features"

In [74]:
def str_to_float(s):
    if isinstance(s, str):
        s = float("".join(s.split(',')))
    return s

def stars_to_rating(s):
    if isinstance(s, str):
        return float(s.split()[0])
    return s

def spm(s):
    if isinstance(s, str):
        if s[-2] == "K":
            return int(s[:-2]) * 1000
        return int(s[:-1])
    return s

In [75]:
def upd_dt(X):
    #X["is_bestseller"] = X["is_bestseller"].apply(int)
    #X["is_prime"] = X["is_prime"].apply(int)
    #X["is_amazon_choice"] = X["is_amazon_choice"].apply(int)
    #X["has_sustainability_features"] = X["has_sustainability_features"].apply(int)
    X["price"] = X["price"].apply(str_to_float)
    X['list_price'] = X["list_price"].apply(str_to_float)
    X["rating"] = X['rating'].apply(stars_to_rating)
    X['reviews'] = X['reviews'].apply(str_to_float)
    X['sold_past_month'] = X['sold_past_month'].apply(spm)
    return X

In [76]:
all_dt = upd_dt(all_dt)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["price"] = X["price"].apply(str_to_float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['list_price'] = X["list_price"].apply(str_to_float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["rating"] = X['rating'].apply(stars_to_rating)
A value is trying to be set on a copy of a slice from a Da

In [78]:
all_dt1 = all_dt.dropna()
all_dt2 = all_dt.copy()

In [91]:
mean_price = all_dt2['price'].mean()
mean_lprice = all_dt2['list_price'].mean()
mean_rating = all_dt2["rating"].mean()
mean_reviews = all_dt2["reviews"].mean()
mean_spm = all_dt2["sold_past_month"].mean()

all_dt2['price'].interpolate(method='linear', inplace=True)
all_dt2['list_price'].interpolate(method='linear', inplace=True)
all_dt2["rating"].fillna(mean_rating, inplace=True)
all_dt2["reviews"].fillna(mean_reviews, inplace=True)
all_dt2["sold_past_month"].fillna(mean_spm, inplace=True)

all_dt2['list_price'] = all_dt2[['list_price', 'price']].apply(lambda x: x['list_price'] if x['list_price'] > x['price'] else x['price'], axis=1)
all_dt2

Unnamed: 0,price,list_price,rating,reviews,sold_past_month,is_bestseller
0,199.98,679.990000,4.4,48.0,100.0,False
1,279.99,321.990000,4.4,39243.0,8000.0,False
2,649.99,1699.990000,4.8,18.0,100.0,False
3,197.35,269.000000,4.1,1678.0,1000.0,False
4,176.00,209.990000,4.0,1861.0,4000.0,False
...,...,...,...,...,...,...
1237,162.00,180.440000,4.3,46.0,500.0,False
1238,85.99,209.351644,4.4,1092.0,3000.0,False
1239,299.99,299.990000,4.4,422.0,1000.0,False
1240,49.99,69.990000,4.0,416.0,1000.0,False


In [87]:
print(all_dt1[["price", "list_price", "rating", "reviews", "sold_past_month"]].to_numpy())

[[1.99980e+02 6.79990e+02 4.40000e+00 4.80000e+01 1.00000e+02]
 [2.79990e+02 3.21990e+02 4.40000e+00 3.92430e+04 8.00000e+03]
 [6.49990e+02 1.69999e+03 4.80000e+00 1.80000e+01 1.00000e+02]
 ...
 [8.39900e+01 9.99900e+01 4.90000e+00 9.30000e+01 5.00000e+02]
 [1.62000e+02 1.80440e+02 4.30000e+00 4.60000e+01 5.00000e+02]
 [4.99900e+01 6.99900e+01 4.00000e+00 4.16000e+02 1.00000e+03]]


In [92]:
X1 = torch.Tensor(all_dt1[["price", "list_price", "rating", "reviews", "sold_past_month"]].to_numpy())
y1 = torch.Tensor(all_dt1["is_bestseller"].to_numpy())

X2 = torch.Tensor(all_dt2[["price", "list_price", "rating", "reviews", "sold_past_month"]].to_numpy())
y2 = torch.Tensor(all_dt2["is_bestseller"].to_numpy())

In [93]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.25)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.25)

In [94]:
y1_train

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 

In [95]:
class OurNonLinearNet(torch.nn.Module):
    def __init__(self, input_features, out_features, hidden_units):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_features, hidden_units)
        self.relu1 = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_units, hidden_units)
        self.relu2 = torch.nn.ReLU()
        self.fc3 = torch.nn.Linear(hidden_units, out_features)


    def forward(self, x):
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.fc3(x)
        return x


    def predict(self, x):
        return torch.round(torch.sigmoid(self.forward(x)))


    def predict_from_logit(self, x_logit):
        return torch.round(torch.sigmoid(x_logit))

In [112]:
net1 = OurNonLinearNet(5, 1, 800)
net2 = OurNonLinearNet(5, 1, 800)
loss_fun = torch.nn.BCEWithLogitsLoss()
optimizer1 = torch.optim.SGD(net1.parameters(), lr=0.005)#, weight_decay=1.e-2, momentum=0.9
optimizer2 = torch.optim.SGD(net2.parameters(), lr=0.005)

In [113]:
def learn_process(model, optimizer, n_epochs, X_train, y_train):
    for epoch in range(n_epochs):

        model.train()

        y_logits = model(X_train).squeeze()
        #y_pred = torch.softmax(y_logits, dim=1).argmax(dim = 1)
        loss = loss_fun(y_logits, y_train)


        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print(f"Epoch {epoch}: loss = {loss:.5f}")

In [114]:
learn_process(net1, optimizer1, 1000, X1_train, y1_train)
learn_process(net2, optimizer2, 1000, X2_train, y2_train)

Epoch 0: loss = 805.11914
Epoch 10: loss = 40.57550
Epoch 20: loss = 1.53824
Epoch 30: loss = 0.17598
Epoch 40: loss = 0.17598
Epoch 50: loss = 0.17598
Epoch 60: loss = 0.17598
Epoch 70: loss = 0.17598
Epoch 80: loss = 0.17598
Epoch 90: loss = 0.17598
Epoch 100: loss = 0.17598
Epoch 110: loss = 0.17598
Epoch 120: loss = 0.17598
Epoch 130: loss = 0.17598
Epoch 140: loss = 0.17598
Epoch 150: loss = 0.17598
Epoch 160: loss = 0.17598
Epoch 170: loss = 0.17598
Epoch 180: loss = 0.17598
Epoch 190: loss = 0.17598
Epoch 200: loss = 0.17598
Epoch 210: loss = 0.17598
Epoch 220: loss = 0.17598
Epoch 230: loss = 0.17598
Epoch 240: loss = 0.17598
Epoch 250: loss = 0.17598
Epoch 260: loss = 0.17598
Epoch 270: loss = 0.17598
Epoch 280: loss = 0.17598
Epoch 290: loss = 0.17598
Epoch 300: loss = 0.17598
Epoch 310: loss = 0.17598
Epoch 320: loss = 0.17598
Epoch 330: loss = 0.17598
Epoch 340: loss = 0.17598
Epoch 350: loss = 0.17598
Epoch 360: loss = 0.17598
Epoch 370: loss = 0.17598
Epoch 380: loss = 0.

In [115]:
from sklearn.metrics import accuracy_score

In [116]:
print(accuracy_score(y1_test.detach().numpy(), net1.predict(X1_test).detach().numpy()))
print(accuracy_score(y2_test.detach().numpy(), net2.predict(X2_test).detach().numpy()))

0.9426751592356688
0.9646302250803859
