In [1]:
import os
os.chdir("/Users/karol/Desktop/Antwerp/ai_project")
import torch
import torch
import torch.nn as nn
from model import MLP1, TwoTower, TwoTowerBasic
from data_reader import load_data, data_preprocessing, load_data_mf, load_customers_articles, customer_buckets, matrix_representation, create_random_candidates
from helper import validate_softmax,  train_softmax, train_two_tower
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import torch.nn.functional as F
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from recommenders import recommender_softmax, recommender_two_towers
import pickle

# Data Preprocessing

### Assumptions
- Most important outcomes from feature engineering part were applied in data preprocessing function.
- Function split_transaction generates targets (last purchases are considered as targets).
- Function atrix factorization transform transaction to pivot matrix.
- Create specific Dataset classes for specific task.
- Use sparse matrices to handle sparse data (create collate_fn for DataLoaders).

### Preprocess and save data

In [2]:
# transactions, articles, customers, article_encodings, customer_encodings, article_decodings, customer_decodings = data_preprocessing(feature_generation=False, return_encodings=True, save=True)
# transactions_candidates = create_random_candidates(transactions, save_dir=None, num_sample=30_000_000)

### Load preprocessed data and apply one hot encoding for articles and customers

In [2]:
# read data
transactions = pd.read_csv("data/preprocessed/transactions.csv") 
articles = pd.read_csv("data/preprocessed/articles.csv") 
customers = pd.read_csv("data/preprocessed/customers.csv") 

# one hot encoding 
articles = articles.set_index("article_id")
customers = customers.set_index("customer_id")

article_enc = OneHotEncoder(sparse_output=True)
articles = article_enc.fit_transform(articles)

customers_categorical = ["FN",'Active',"club_member_status", "fashion_news_frequency"]
customers_cont = ["age"]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=True), customers_categorical),
        ('cont', 'passthrough', customers_cont)  # 'passthrough' means no transformation for continuous variables
    ],
    remainder='drop'  # Drop any columns not explicitly transformed
)
customers = csr_matrix(preprocessor.fit_transform(customers))

# Softmax DNN

### Assumptions
- The input is the purchase history without last purchase,
- The target is the basket of the last purchase,
- We are interested in whether article was bought not its amount. Therefore, the binary values are allowed.
- In the target basket multiple articles could be bought therefore the sigmoid activation function was used at the last layer. It assumes that decision to buy specific article is independent from other products that customer is buying.
- As I have multi-classification problem and I use sigmoid asa final activation layer I decided to use BCEWithLogitsLoss which supposed to deal with this configuration.


### Load Data & Create Model

In [10]:
train_dataloader, val_dataloader = load_data(transactions, train_test=True, batch_size=1000)

### Create & Train basic Model 

In [12]:
# create model
model = MLP1(input_dim=articles.shape[0], output_dim=articles.shape[0])
# get params
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
save_dir = "AI_project/RQ1/models/MLP1.pt"
# train
val_loss_MLP = train_softmax(model, train_dataloader, val_dataloader, criterion, optimizer, save_dir, num_epochs=10)

 10%|█         | 1/10 [17:28<2:37:15, 1048.37s/it]

Epoch [1/10] - Train Loss: 0.6931, Validation Loss: 0.6931%


 20%|██        | 2/10 [34:39<2:18:26, 1038.34s/it]

Epoch [2/10] - Train Loss: 0.6931, Validation Loss: 0.6931%


 30%|███       | 3/10 [51:37<2:00:02, 1028.93s/it]

Epoch [3/10] - Train Loss: 0.6931, Validation Loss: 0.6931%


 40%|████      | 4/10 [1:08:35<1:42:28, 1024.78s/it]

Epoch [4/10] - Train Loss: 0.6931, Validation Loss: 0.6931%


 50%|█████     | 5/10 [1:25:33<1:25:10, 1022.08s/it]

Epoch [5/10] - Train Loss: 0.6931, Validation Loss: 0.6931%


 60%|██████    | 6/10 [1:42:28<1:07:59, 1019.89s/it]

Epoch [6/10] - Train Loss: 0.6931, Validation Loss: 0.6931%


 70%|███████   | 7/10 [1:59:24<50:56, 1018.67s/it]  

Epoch [7/10] - Train Loss: 0.6931, Validation Loss: 0.6931%


 80%|████████  | 8/10 [2:16:20<33:55, 1017.60s/it]

Epoch [8/10] - Train Loss: 0.6931, Validation Loss: 0.6931%


 90%|█████████ | 9/10 [2:33:17<16:57, 1017.35s/it]

Epoch [9/10] - Train Loss: 0.6931, Validation Loss: 0.6931%


100%|██████████| 10/10 [2:50:11<00:00, 1021.13s/it]

Epoch [10/10] - Train Loss: 0.6931, Validation Loss: 0.6931%





### Load trained model

In [4]:
MLP1 = torch.load("AI_project/RQ1/models/MLP1.pt")

### Generate recommendations for test data

In [5]:
recommendations, accuracy = recommender_softmax(MLP1, val_dataloader, evaluate=True)
accuracy

  i = torch.LongTensor(indices)
  return torch.sparse.FloatTensor(i, v, s)
  nonzero_finite_vals = torch.masked_select(


tensor(0.0028, device='mps:0')

### Ideas for improvements:
- No warm start
- Model should be trained on customers who had at least two purchases
- We take into account baskets which are based on the past 2 years. Maybe we should train model based on customers who bought articles in last month.
- Different way to measure accuracy.
- Customers whou didn't developed any patterns (bought low amount of clothes) should be recommended with current top selling articles.
- Develop more complex (deeper) model.
- More training.
- Distinguish customers who buys specific articles multiple times.
### Potential Issues:
- We need to predict articles for customers who were used to train the model.

**Idea**: recommend things that haven't been bought.



# Matrix Factorization with DNN

### Assumptions
- Firstly, it was decided to use one hot encoding for all categorical features.
- Two tower architecture was used which is compsed from two different classes, which are responsible for encoding customer and article features. 
- These models are used for estimating embeddings for recommendations.
- To estimate the probability of buying article x by customer y the product between corresponding embbedings is calculated and then the sigmoid function is applied. 
- For training purposes the random negative candidates have been generated.
- Weights for recent articles

### Load data

In [13]:
transactions_candidates = pd.read_csv("data/preprocessed/transactions_candidates.csv")
train_dataloader, val_dataloader, test_customers = load_data_mf(transactions_candidates, batch_size=1000)

### Train Model

In [14]:
input_article_dim = articles.shape[1]
input_customer_dim = customers.shape[1]
model = TwoTower(input_article_dim, input_customer_dim, output_dim=3)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
save_dir = "AI_project/RQ1/models/TwoTower1.pt"
val_loss_tower = train_two_tower(model, customers, articles, train_dataloader, val_dataloader, criterion, optimizer, save_dir, num_epochs=10)

100%|██████████| 55595/55595 [12:57<00:00, 71.47it/s]


Epoch [1/10] - Train Loss: 0.1969, Validation Loss: 0.1761%


100%|██████████| 55595/55595 [12:57<00:00, 71.47it/s]


Epoch [2/10] - Train Loss: 0.1975, Validation Loss: 0.1760%


100%|██████████| 55595/55595 [12:58<00:00, 71.44it/s]


Epoch [3/10] - Train Loss: 0.1971, Validation Loss: 0.1760%


100%|██████████| 55595/55595 [12:57<00:00, 71.48it/s]


Epoch [4/10] - Train Loss: 0.1970, Validation Loss: 0.1760%


100%|██████████| 55595/55595 [12:58<00:00, 71.42it/s]


Epoch [5/10] - Train Loss: 0.1969, Validation Loss: 0.1760%


100%|██████████| 55595/55595 [12:58<00:00, 71.45it/s]


Epoch [6/10] - Train Loss: 0.1971, Validation Loss: 0.1759%


100%|██████████| 55595/55595 [12:58<00:00, 71.44it/s]


Epoch [7/10] - Train Loss: 0.1975, Validation Loss: 0.1759%


100%|██████████| 55595/55595 [12:57<00:00, 71.48it/s]


Epoch [8/10] - Train Loss: 0.1976, Validation Loss: 0.1759%


100%|██████████| 55595/55595 [12:58<00:00, 71.43it/s]


Epoch [9/10] - Train Loss: 0.1977, Validation Loss: 0.1759%


100%|██████████| 55595/55595 [12:58<00:00, 71.44it/s]


Epoch [10/10] - Train Loss: 0.1978, Validation Loss: 0.1759%


### Load Model

In [15]:
TwoTower1 = torch.load("AI_project/RQ1/models/TwoTower1.pt")

### Load data required for recommendations

In [16]:
matrix_full = matrix_representation(transactions_candidates, train_test=False)
targets = matrix_full[test_customers]
dataloader_cust, dataloader_art = load_customers_articles(customers, articles, test_customers=test_customers, batch_size=100)

### Evaluate Recommendations

In [17]:
recommendations, accuracy = recommender_two_towers(TwoTower1, dataloader_cust, dataloader_art, targets, evaluate=True, top_k=5)

Generate Customer Embeddings...


100%|██████████| 1363/1363 [00:06<00:00, 222.55it/s]


Generate Articles Embeddings...


100%|██████████| 1056/1056 [00:04<00:00, 249.44it/s]


Get recommendations...


100%|██████████| 137/137 [00:27<00:00,  4.97it/s]


### Ideas for improvements:
- Use embeddings for warm start
- Develop ebedding layers.
- Generate more features.
- More training.
- Distinguish customers who buys specific articles multiple times.
- Use article embeddings for recommendations.
- Mix ways of recommending things
### Potential Issues:
- We need to predict articles for customers who were used to train the model.

**Idea**: recommend things that haven't been bought.


# Get Final Recommendations

### MLP

In [20]:
transactions = pd.read_csv("data/preprocessed/transactions.csv") 
dataloader = load_data(transactions, train_test=False, batch_size=1000)
MLP1 = torch.load("AI_project/RQ1/models/MLP1.pt")
recommendations = recommender_softmax(MLP1, dataloader, evaluate=False, top_k=10).to(torch.int64).to("cpu").numpy()
output = pd.DataFrame(recommendations).reset_index()

In [21]:
with open('data/preprocessed/customers_decoding.pickle', 'rb') as file:
    customer_dict = pickle.load(file)

with open('data/preprocessed/articles_decoding.pickle', 'rb') as file:
    article_dict = pickle.load(file)

In [22]:
for i in np.arange(1,10,1):
    output[i] = output[i].apply(lambda x: article_dict["article_id"][x])
preds = [' '.join(['0' + str(article_dict["article_id"][p]) for p in ps]) for ps in recommendations]
submission = pd.DataFrame(zip(np.arange(len(preds)), preds), columns=["customer_id","prediction"])
submission["customer_id"] = submission["customer_id"].apply(lambda x: customer_dict["customer_id"][x])
submission.to_csv("submission/MLP/MLP1_submission.csv.gz", index=False)


### Two Tower

In [23]:
transactions = pd.read_csv("data/preprocessed/transactions.csv") 
dataloader_cust, dataloader_art = load_customers_articles(customers, articles, batch_size=1000)
TwoTower = torch.load("AI_project/RQ1/models/TwoTower1.pt")
recommendations = recommender_two_towers(TwoTower, dataloader_cust, dataloader_art, targets=None, top_k=10).to(torch.int64).to("cpu").numpy()
output = pd.DataFrame(recommendations).reset_index()

Generate Customer Embeddings...


100%|██████████| 1372/1372 [01:04<00:00, 21.38it/s]


Generate Articles Embeddings...


100%|██████████| 106/106 [00:03<00:00, 28.06it/s]


Get recommendations...


100%|██████████| 1372/1372 [04:49<00:00,  4.74it/s]


In [24]:
for i in np.arange(1,10,1):
    output[i] = output[i].apply(lambda x: article_dict["article_id"][x])
    
preds = [' '.join(['0' + str(article_dict["article_id"][p]) for p in ps]) for ps in recommendations]
submission = pd.DataFrame(zip(np.arange(len(preds)), preds), columns=["customer_id","prediction"])
submission["customer_id"] = submission["customer_id"].apply(lambda x: customer_dict["customer_id"][x])
submission.to_csv("submission/TwoTower/TwoTower2_submission.csv.gz", index=False)