In [None]:
import pandas as pd
import joblib
import numpy as np

train = pd.read_csv('train_data.csv', sep=';', parse_dates = ['min_publish_date'])
test = pd.read_csv('test_data.csv', sep=';', parse_dates = ['min_publish_date'])
labels = pd.read_csv('train_labels.csv', sep=';')
word2inx_map = joblib.load('word2inx_map.joblib')

cat_cols = ['fz','region_code','okpd2_code','additional_code','month','has_lot']
col_cols = ['lot_price']

def process_df(train):
    train['month'] = train['min_publish_date'].dt.month
    train['okpd2_code'] = train['okpd2_code'].fillna('null_okpd2_code')
    train['additional_code'] = train['additional_code'].fillna('null_additional_code')
    train['lot_price'] = np.log1p(train['lot_price']) / 25.0
    train['region_code'] = train['region_code'].astype(str)
    train['has_lot'] = 'lot_'+train.lot_name.isnull().astype(str)
    return train

train = process_df(train)
test = process_df(test)
full_df = pd.concat([train,test], ignore_index=True).reset_index(drop=True)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [None]:
def make_text_field(train):
    train['text'] = train['purchase_name'].fillna('') + ' ' + train['item_descriptions'].fillna('')
    train['text'] = train['text'].fillna('').str.lower()\
.str.replace(r'[^а-яА-Я0-9\s]', ' ',regex=True).str.replace(r'\s',' ',regex=True)\
.str.replace('  ', ' ',regex=False).str.replace('  ', ' ',regex=False).str.replace('  ', ' ',regex=False).str.strip()
    return train

full_df = make_text_field(full_df)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import FactorAnalysis

print('tfidf')
vectorizer = TfidfVectorizer(min_df=10)
vectors = vectorizer.fit_transform(full_df['text'].values)

print('svd')
svd = TruncatedSVD(n_components=64)
comps = svd.fit_transform(vectors)

for i in range(comps.shape[1]):
    full_df['comp_'+str(i)] = comps[:,i]
    train['comp_'+str(i)] = comps[:train.shape[0],i]
    test['comp_'+str(i)] = comps[-test.shape[0]:,i]
    col_cols.append('comp_'+str(i))

tfidf
svd


In [None]:
import multiprocessing
import os
import pickle
import random
import math
from typing import Any, Dict, List, Optional, Tuple, Union

import nmslib
import numpy as np
import pandas as pd
import requests
import scipy
from IPython.display import Image, display
from lightfm import LightFM
from lightfm import evaluation
from lightfm.data import Dataset
from sklearn import preprocessing
from tqdm.auto import tqdm

In [None]:
print(test.min_publish_date.min(),
      test.min_publish_date.max())

2020-05-01 00:00:00 2020-12-02 00:00:00


In [None]:
train_df = train.loc[train.min_publish_date < pd.to_datetime('2020-03-20')].reset_index(drop=True)
valid_df = train.loc[train.min_publish_date >= pd.to_datetime('2020-03-20')].reset_index(drop=True)
print(valid_df.shape)

train_lots = train_df.pn_lot_anon.unique().tolist()
train_labels_df = labels.loc[labels.pn_lot_anon.isin(train_lots)].reset_index(drop=True)
train_users = train_labels_df.participant_inn_kpp_anon.unique().tolist()
valid_labels_df = labels.loc[(~labels.pn_lot_anon.isin(train_lots)) &\
                             (labels.participant_inn_kpp_anon.isin(train_users))].reset_index(drop=True)

valid_users = valid_labels_df.participant_inn_kpp_anon.unique().tolist()
train_labels_df = labels.loc[labels.participant_inn_kpp_anon.isin(valid_users)].reset_index(drop=True)
print(train_labels_df.shape, valid_labels_df.shape)

(81455, 78)
(860817, 4) (99730, 4)


In [None]:
cols = ['pn_lot_anon'] + cat_cols + col_cols

train_labels_df = train_labels_df.drop(['fz'], axis=1).merge(train[cols], on='pn_lot_anon', how='left')
valid_labels_df = valid_labels_df.drop(['fz'], axis=1).merge(train[cols], on='pn_lot_anon', how='left')
train_labels_df.head()

Unnamed: 0,pn_lot_anon,participant_inn_kpp_anon,is_winner,fz,region_code,okpd2_code,additional_code,month,has_lot,lot_price,...,comp_54,comp_55,comp_56,comp_57,comp_58,comp_59,comp_60,comp_61,comp_62,comp_63
0,pn_lot_18161,inn_kpp_977300,0,223fz,71,22.2,22.2,1,lot_False,0.550997,...,-0.009,-0.000842,-0.00125,0.001655,0.001888,-8.1e-05,0.000814,-0.004965,-0.002322,0.002304
1,pn_lot_9818628,inn_kpp_977300,1,223fz,30,32.9,46.9,2,lot_False,0.492426,...,0.010582,-0.002633,-0.00527,0.006629,0.013253,0.002532,0.013538,-0.005477,-0.005204,-0.0042
2,pn_lot_396434,inn_kpp_977300,1,223fz,71,22.2,22.2,3,lot_False,0.54113,...,-0.009,-0.000842,-0.00125,0.001655,0.001888,-8.1e-05,0.000814,-0.004965,-0.002322,0.002304
3,pn_lot_8216428,inn_kpp_977300,0,223fz,66,25.9,25.9,4,lot_False,0.41469,...,0.015902,-0.007585,0.026906,0.007619,-0.005795,0.00978,0.014034,-0.011962,0.003909,0.01451
4,pn_lot_434055,inn_kpp_977300,0,223fz,69,46.1,46.1,11,lot_False,0.580878,...,0.01324,0.028761,0.084496,0.000643,-0.06304,-0.020718,-0.027965,0.022147,-0.014662,0.086964


In [None]:
dataset = Dataset()

cat_features = cat_cols
cont_features = col_cols

item_features = []

for ft in cat_features:
    for fstr in full_df[ft].unique():
        item_features.append(fstr)

for ft in cont_features:
    item_features.append(ft)

num_unique_user_id = train_labels_df.participant_inn_kpp_anon.nunique()
num_unique_item_id = train_labels_df.pn_lot_anon.nunique() + \
valid_labels_df.pn_lot_anon.nunique() + test.pn_lot_anon.nunique()

inx2user_map = train_labels_df.participant_inn_kpp_anon.unique()
inx2item_map = train_labels_df.pn_lot_anon.unique().tolist() + \
valid_labels_df.pn_lot_anon.unique().tolist() + test.pn_lot_anon.unique().tolist()

user2inx_map = dict(zip(inx2user_map, np.arange(num_unique_user_id)))
item2inx_map = dict(zip(inx2item_map, np.arange(num_unique_item_id)))

dataset = Dataset()
dataset.fit(users=inx2user_map, items=inx2item_map, item_features=item_features)

In [None]:
item_df = pd.DataFrame({'pn_lot_anon': inx2item_map})
item_df = item_df.merge(full_df[cols], on='pn_lot_anon', how='left')

In [None]:
def item_features_iter(user_df, cat_features, cont_features):
    for index, row in user_df.iterrows():
        yield((row["pn_lot_anon"], [row[ft] for ft in cat_features]))
        for ft in cont_features:
            yield((row["pn_lot_anon"], {ft: float(row[ft])}))

train_gen = item_features_iter(item_df, cat_features, cont_features)
train_item_features = dataset.build_item_features(train_gen)
train_item_features.shape

(811751, 812521)

In [None]:
train_item_participations = train_labels_df.groupby('pn_lot_anon').\
agg({'participant_inn_kpp_anon':'count'}).to_dict()['participant_inn_kpp_anon']
train_item_participations['pn_lot_100485']

3

In [None]:
train_user_participations = train_labels_df.groupby('participant_inn_kpp_anon').\
agg({'pn_lot_anon':'count'}).to_dict()['pn_lot_anon']

In [None]:
train_labels_df.groupby('pn_lot_anon').\
agg({'participant_inn_kpp_anon':'count'}).median()

participant_inn_kpp_anon    1.0
dtype: float64

In [None]:
train_labels_df.groupby('participant_inn_kpp_anon').\
agg({'pn_lot_anon':'count'}).median()

pn_lot_anon    31.0
dtype: float64

In [None]:
interactions = dataset.build_interactions([(row["participant_inn_kpp_anon"], 
                                            row["pn_lot_anon"], 
                                            1.0 + row['is_winner'] * train_item_participations[row["pn_lot_anon"]]) 
                                           for index, row in train_labels_df.iterrows()])

In [None]:
user_mappings1, _, item_mappings1, _ = dataset.mapping()
item_mappings = {k: d for d,k in item_mappings1.items()}
user_mappings = {k: d for d,k in user_mappings1.items()}

In [None]:
inn = user_mappings[0]
valid_interactions = dataset.build_interactions([(inn, x) for x in valid_labels_df.pn_lot_anon.unique()])
valid_users, valid_items = valid_interactions[0].nonzero()

In [None]:
import tqdm
from scipy.special import softmax

for num_epoch in [100]:
    model = LightFM(loss='warp', no_components=256, random_state=239)
    model.fit(interactions=interactions[0], 
              item_features=train_item_features, 
              sample_weight=interactions[1],
              epochs=num_epoch, num_threads=8, verbose=True)

    r,g = 0,0
    pred_lots = []
    true_lots = set(valid_labels_df.pn_lot_anon.unique())
    preds = []
    for inn_idx in tqdm.notebook.tqdm(range(len(user_mappings))): 
        inn = user_mappings[inn_idx]
        pred = model.predict(inn_idx, valid_items, item_features=train_item_features, num_threads=8)
        pred = softmax(pred)
        idxs = np.argsort(pred)[-35:]
        lots = [(inn, item_mappings[valid_items[x]], pred[x]*1000.0) for x in idxs]
        pred_lots.extend(lots)
    result = pd.DataFrame(pred_lots)
    result.columns = ['participant_inn_kpp_anon','pn_lot_anon','similarity_score']
    c1,c2 = 0,0
    for inn,df in tqdm.notebook.tqdm(result.groupby('participant_inn_kpp_anon')):
        true_lots = set(valid_labels_df.loc[valid_labels_df.participant_inn_kpp_anon == inn].pn_lot_anon.unique())
        pred_lots = set(df.pn_lot_anon.unique())
        c1 += len((true_lots & pred_lots))
        c2 += len(true_lots)
    print(num_epoch,c1/c2)

Epoch: 100%|██████████| 100/100 [19:50<00:00, 11.90s/it]


  0%|          | 0/10608 [00:00<?, ?it/s]

  0%|          | 0/10608 [00:00<?, ?it/s]

100 0.3982047939023167
