In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

SEED = 1000

In [2]:
data = pd.read_csv('suggest_train.csv')
data = data.drop(columns=['item_id']).drop_duplicates()
# train_data = data

In [3]:
train_data, val_data = train_test_split(data, test_size=0.2, random_state=SEED)

In [4]:
test_data = pd.read_csv('suggest_test.csv')

In [5]:
train_titles = train_data.title
val_titles = val_data.title

y_train = train_data.category_id
y_val = val_data.category_id

In [6]:
import re
from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText
from gensim.models.callbacks import CallbackAny2Vec

In [7]:
class LossLogger(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            print('Loss after epoch {}: {}'.format(self.epoch, loss))
        else:
            print('Loss after epoch {}: {}'.format(self.epoch, loss - self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss
        

class EpochLogger(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        print(f'Epoch {self.epoch}')
        self.epoch += 1

In [8]:
WORD_PATTERN = '(?u)\\b\\w\\w+\\b'  # паттерн для токенизации возьмем из векторайзеров sklearn
reg_exp = re.compile(pattern=WORD_PATTERN)  # скомпилируем регулярное выражение
sentences = [reg_exp.findall(s.lower()) for s in train_data.title]

In [9]:
sentences[:5]

[['свадебные', 'бокалы'],
 ['слон', 'качалка'],
 ['ножки', 'для', 'ванной'],
 ['20', '01', 'на', '12', 'дней', 'вьетнам', 'galaxy', 'hotel'],
 ['продам', 'шкатулку']]

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer

In [11]:
# w2v_model_1 = FastText(window=3)
w2v_model_2 = Word2Vec()

In [12]:
# w2v_model_1.build_vocab(sentences)
# w2v_model_1.train(
#     corpus_iterable=sentences,
#     total_examples=w2v_model_1.corpus_count,
#     epochs=30,
#     compute_loss=True,
#     callbacks=[LossLogger()]
# )

In [13]:
# word_to_vec
w2v_model_2.build_vocab(sentences)
w2v_model_2.train(
    corpus_iterable=sentences,
    total_examples=w2v_model_2.corpus_count,
    epochs=50,
    compute_loss=True,
    callbacks=[LossLogger()]
)

Loss after epoch 0: 44834.48046875
Loss after epoch 1: 45136.95703125
Loss after epoch 2: 43011.890625
Loss after epoch 3: 37886.03125
Loss after epoch 4: 34526.390625
Loss after epoch 5: 32910.09375
Loss after epoch 6: 31464.71875
Loss after epoch 7: 30423.625
Loss after epoch 8: 29196.5
Loss after epoch 9: 28086.28125
Loss after epoch 10: 27185.75
Loss after epoch 11: 26178.53125
Loss after epoch 12: 25263.09375
Loss after epoch 13: 24696.09375
Loss after epoch 14: 23829.875
Loss after epoch 15: 23210.3125
Loss after epoch 16: 22411.0
Loss after epoch 17: 21815.4375
Loss after epoch 18: 21089.6875
Loss after epoch 19: 20655.0
Loss after epoch 20: 20255.5
Loss after epoch 21: 19832.25
Loss after epoch 22: 19399.8125
Loss after epoch 23: 19027.3125
Loss after epoch 24: 18850.25
Loss after epoch 25: 18451.0625
Loss after epoch 26: 18048.1875
Loss after epoch 27: 17857.6875
Loss after epoch 28: 17599.9375
Loss after epoch 29: 17257.0625
Loss after epoch 30: 17009.9375
Loss after epoch 31

(1474494, 2789000)

In [16]:
w2v_model_2.wv.similar_by_word('айфон')

[('iphone6', 0.8824732899665833),
 ('iphone', 0.876761257648468),
 ('5c', 0.8694149255752563),
 ('4s', 0.8631072044372559),
 ('чехлы', 0.8402438163757324),
 ('другие', 0.8169571757316589),
 ('5s', 0.8095609545707703),
 ('силиконовый', 0.8049678206443787),
 ('чёрный', 0.8037092089653015),
 ('пленка', 0.7992513179779053)]

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
# ohe 
ohe_title_encoder = CountVectorizer()
ohe_title_encoder.fit(train_data.title)
ohe_title_encoded_train = ohe_title_encoder.transform(train_data.title)

In [24]:
# tfidf
tfidf = TfidfTransformer(smooth_idf=False)
tfidf.fit(ohe_title_encoded_train)
tfidf_title_encoded_train = tfidf.transform(ohe_title_encoded_train)

In [25]:
import re

In [26]:
class Word2VecTransformer:
    
    def __init__(self, w2v_model,CountVectorizer,tfidf, word_pattern):
        
        self.w2v_model = w2v_model
        
        self.tfidf = tfidf
        self.CountVectorizer = CountVectorizer
        
        self.feature_names = CountVectorizer.get_feature_names_out()
        
        self.word_pattern = word_pattern
        
        self.re = re.compile(pattern=self.word_pattern)
        
    def fit(self, X):
        return self
    
    def transform(self, X):
        
        X_countvec_transformed = self.CountVectorizer.transform(X)
        tfidf_matrix = self.tfidf.transform(X_countvec_transformed)
        
        X_transformed = np.zeros(
            (
                len(X), 
                self.w2v_model.wv.vector_size * 3
            )
        )
        for i, title in enumerate(X):
            
            title_sum_vector_1 = np.zeros((self.w2v_model.wv.vector_size,))
            title_max_vector_1 = np.zeros((self.w2v_model.wv.vector_size,))
            title_avg_vector_1 = np.zeros((self.w2v_model.wv.vector_size,))
            
            tokens = self.re.findall(title.lower())
            
            n = 0
            for token in tokens:
                # if token in self.w2v_model.wv.key_to_index:
                    # get embeding from w2v
                embeding_1 = self.w2v_model.wv.get_vector(token)

                # get weight from tfidf
                if token in self.feature_names:
                    ind = np.where(self.feature_names == token)[0][0]
                    weight = tfidf_matrix[i, ind]
                else:
                    weight = 1

                title_sum_vector_1 += weight * embeding_1
                title_max_vector_1 = np.max(np.c_[title_max_vector_1, embeding_1], axis=1)
                title_avg_vector_1 += embeding_1
                    
                n += 1
                    
                    
            if n!=0:
                title_avg_vector_1 = title_avg_vector_1 / n
                    
                            
            X_transformed[i] = np.hstack(
                (title_sum_vector_1, 
                title_max_vector_1, 
                title_avg_vector_1)
            )
        
        return X_transformed

In [29]:
len(w2v_model_2.wv.key_to_index)

1872

In [30]:
from sklearn.linear_model import LogisticRegression

In [31]:
def accuracy_score_top3(y_pred: np.array, y_true: np.array):
    return np.mean((y_true.reshape(-1, 1) == y_pred).any(axis=1))

In [33]:
w2v_transformer = Word2VecTransformer(
    w2v_model=w2v_model_2, 
    CountVectorizer = ohe_title_encoder, 
    tfidf=tfidf,
    word_pattern=WORD_PATTERN
)

In [34]:
train_w2v = w2v_transformer.transform(train_titles.values)
val_w2v = w2v_transformer.transform(val_titles.values)

KeyError: "Key 'слон' not present"

In [497]:
train_w2v

array([[-0.38800251, -0.04533114, -0.01970021, ..., -0.1230011 ,
         0.02525388, -0.16313565],
       [-0.44136335,  0.03456653,  0.22971979, ..., -0.00945737,
        -0.05280595,  0.04317823],
       [-0.76579437,  0.02022166,  0.05134596, ...,  0.00675785,
        -0.11255291, -0.13456968],
       ...,
       [-0.55980463,  0.01737561,  0.06687669, ..., -0.0305613 ,
         0.11434018, -0.05742053],
       [-0.33899556, -0.09315239, -0.09609361, ..., -0.24829175,
         0.07249144, -0.32100859],
       [-0.58394822, -0.07218426,  0.21554952, ..., -0.09008254,
         0.08304159, -0.06869227]])

In [498]:
from lightgbm import LGBMClassifier

In [499]:
model_on_w2v = LGBMClassifier()
# LogisticRegression(solver='liblinear')
model_on_w2v.fit(train_w2v, y_train)

In [500]:
y_val_hat = model_on_w2v.predict_proba(val_w2v)

In [501]:
y_val_top3 = np.argsort(-y_val_hat, axis=1)[:, :3]

In [502]:
accuracy_score_top3(y_val_top3, y_val.values)

0.6922017084596308

# Test

In [392]:
test_w2v = w2v_transformer.transform(test_data.title.values)

In [393]:
y_val_hat = model_on_w2v.predict_proba(test_w2v)
y_val_top3 = np.argsort(-y_val_hat, axis=1)[:, :3]

In [394]:
y_val_top3

array([[ 5,  2, 26],
       [30, 31, 15],
       [21, 19, 37],
       ...,
       [34, 43, 32],
       [26, 17, 44],
       [52, 15, 53]])

In [395]:
pd.DataFrame(y_val_top3).to_csv('solution.csv', header=['top1', 'top2', 'top3'], index=False)