# Notebook to test a ranker based on scikit-learn models

In [1]:
%load_ext autoreload
%autoreload 2

## Load data

In [2]:
from data import load_data

train, val, test = load_data()

Load proposals

In [3]:
from postprocessing import read_proposals_files

In [4]:
files = [
    "BART-base-submission-23.csv", 
    "BART-large-submission-21.csv",
    "pegasus-large-submission-30.csv",
    "t5-base-submission-40.csv",
    "t5-small-submission-36.csv",
]

In [5]:
train["name_proposals"] = read_proposals_files([f"train_{file}" for file in files])
train

Unnamed: 0,name,description,name_proposals
25291,fabric tote bag,tote bag in a combination of colours. braided ...,"[braided tote bag with pockets, tote bag with ..."
5328,knit cardigan with ruffle trims,knit cardigan with a round neck. featuring lon...,"[cable-knit cardigan, knit cardigan with frill..."
28974,mercurised glass soap dish,mercurised glass soap dish.,"[, mercurised glass bar soap dish, merised gla..."
10697,joggers,relaxed fit trousers made of a linen blend. fe...,"[essentials linen blend trousers, striped rust..."
8656,ribbed knit cardigan,"cardigan with round neckline, long sleeves and...","[cable-knit cardigan, jacquilted knit cardigan..."
...,...,...,...
7820,ripped skinny jeans,five-pocket skinny jeans with an adjustable in...,"[black indigo skinny jeans, ripped jeans, two-..."
15507,flannel shirt with stand-up collar,regular fit shirt with a flannel finish. stand...,"[paisley print shirt, short sleeve shirt, text..."
17874,basic denim jacket,collared denim jacket featuring long buttoned ...,"[faux shearling jacket, denim jacket with coll..."
28245,low rattan basket with lid,seagrass basket in two tones with lid and hand...,"[double seagrass basket, matte rattan basket, ..."


In [6]:
val["name_proposals"] = read_proposals_files([f"val_{file}" for file in files])
val

Unnamed: 0,name,description,name_proposals
1190,t-shirt with polka dot flocking,round neck t-shirt featuring long sleeves with...,"[polka dot t-shirt, frilled t-shirt, t-shirt w..."
2774,double strap wrap dress trf,short v-neck dress featuring double straps and...,"[dress trf, , limited edition dress, poplin dr..."
32709,cloud design bedspread with metallic thread,bedspread with a cloud design and metallic thr...,"[cloud knit bedspread, cloud bedspread with me..."
32403,ceramic door knob with transfer (pack of 2),round white ceramic door knob featuring a blue...,"[ceramic door knob with black transfer, cerami..."
14237,puffer jacket,long sleeve puffer jacket featuring a detachab...,"[faux fur puffer jacket, faux fur jacket trf, ..."
...,...,...,...
31096,water lily voile dress,children's dress featuring a water lily print ...,"[water lily print fabric dress, water lily bab..."
9682,striped sweatshirt,long sleeve hoodie. button fastening on the yo...,"[slogan hoodie, midi sweatshirt, check hoodie,..."
2355,ruffled t-shirt trf,round neck t-shirt with short sleeves and a ru...,"[plain ruffled t-shirt, polka dot t-shirt, fri..."
31358,pine cone and sleigh bells napkin holders (pac...,"napkin holder with faux twigs, pine cone, slei...",[faux fur napkin holder with bells (pack of 2)...


## Data preprocessing

In [7]:
from itertools import chain

all_train_texts = list(train["name"].values) + list(train["description"].values) + list(chain(*train["name_proposals"].values))
all_train_texts = set(all_train_texts)

## Unroll training and validation data

In [8]:
from postprocessing import unroll_data, unroll_test_data

In [18]:
unrolled_train = unroll_data(train["name"], train["description"], train["name_proposals"])
unrolled_train = unrolled_train[unrolled_train["label"] == 1].append(unrolled_train[unrolled_train["label"] == 0].sample(frac=0.1)).sample(frac=1)
unrolled_train

Unnamed: 0,name,description,label
1531367,voluminous printed top,"straight neck top with long puff sleeves, elas...",0
996682,flared trousers - limited edition,semi-sheer high-waist trousers featuring an el...,0
3149747,scalloped design fruit bowl,round fruit bowl in gold metal with scalloped ...,0
815910,houndstooth shorts,shorts made of a wool blend. high waist with a...,0
1846676,flared jeans,jeans with an adjustable inner waistband and f...,1
...,...,...,...
2831401,leaf print sleepsuit,round neck sleepsuit with long sleeves and tur...,0
295014,belt bag with flap,nylon belt bag. main compartment with zip clos...,0
100348,round teak table,"round tray with a simple design and a stand, m...",0
3013191,mules with mouse motif,mug with a lid and a mouse motif.,0


In [19]:
unrolled_val = unroll_data(val["name"], val["description"], val["name_proposals"])
unrolled_val

Unnamed: 0,name,description,label
0,t-shirt with polka dot flocking,round neck t-shirt featuring long sleeves with...,1
1,polka dot t-shirt,round neck t-shirt featuring long sleeves with...,0
2,t-shirt with frills,round neck t-shirt featuring long sleeves with...,0
3,frilled t-shirt,round neck t-shirt featuring long sleeves with...,0
4,t-shirt with ruffles,round neck t-shirt featuring long sleeves with...,0
...,...,...,...
357528,sequinverted textured sweatshirt,"long sleeve sweatshirt with a round neckline, ...",0
357529,sequin-trimmed sweatshirt,"long sleeve sweatshirt with a round neckline, ...",0
357530,twill sweatshirt with sequins,"long sleeve sweatshirt with a round neckline, ...",0
357531,Sequinned sweatshirt,"long sleeve sweatshirt with a round neckline, ...",0


## Model

In [20]:
from sklearn.base import TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import PredefinedSplit, RandomizedSearchCV
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier

In [21]:
from scipy.sparse import hstack

class NgramJoinTransformer(TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        assert X.shape[0] > 0
        assert X.shape[1] % 2 == 0
        n_features = X.shape[1] // 2
        new_features = X[:, :n_features].multiply(X[:, n_features:])
        #return hstack([X, new_features])
        return new_features

In [22]:
vectorizer = HashingVectorizer(analyzer="char", ngram_range=(1, 7), binary=True, alternate_sign=False)

feature_generator = Pipeline([
    ('vectorizer', ColumnTransformer([
        ("name_vectorizer", vectorizer, "name"),
        ("description_vectorizer", vectorizer, "description")
        ])
    ),
    ('ngram_join', NgramJoinTransformer())
])

In [23]:
%%time
XF_train = feature_generator.fit_transform(unrolled_train[["name", "description"]])

CPU times: user 2min 57s, sys: 19.6 s, total: 3min 16s
Wall time: 3min 16s


In [24]:
%%time
XF_val = feature_generator.transform(unrolled_val[["name", "description"]])

CPU times: user 2min 50s, sys: 18.8 s, total: 3min 9s
Wall time: 3min 9s


In [25]:
%%time
from scipy.sparse import vstack

base_estimator = XGBClassifier(n_jobs=4, use_label_encoder=False)
param_grid = {
    'n_estimators': [10, 20, 50, 100, 200, 500, 1000],
    'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'gamma': [0, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3],
    'lambda': [0, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3],
    'alpha': [0, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3],
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'min_child_weight': [1, 5, 10, 20, 50, 100],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}
model = RandomizedSearchCV(
    base_estimator, 
    param_grid, 
    n_iter=50, 
    scoring=make_scorer(roc_auc_score), 
    verbose=10, 
    n_jobs=5,
    cv=PredefinedSplit([-1] * len(unrolled_train) + [0] * len(unrolled_val)),
    refit=False
)
model.fit(vstack([XF_train, XF_val]), list(unrolled_train["label"].values) + list(unrolled_val["label"].values))

Fitting 1 folds for each of 50 candidates, totalling 50 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 tasks      | elapsed:  3.9min
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed: 22.4min
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed: 37.5min
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed: 45.1min
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed: 115.8min
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed: 127.6min
[Parallel(n_jobs=5)]: Done  47 out of  50 | elapsed: 147.0min remaining:  9.4min
[Parallel(n_jobs=5)]: Done  50 out of  50 | elapsed: 234.7min finished


CPU times: user 2min 40s, sys: 7.04 s, total: 2min 47s
Wall time: 3h 54min 41s


RandomizedSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing...
                                        'colsample_bytree': [0.6, 0.8, 1.0],
                                        'gamma': [0, 0.001, 0.01, 0.1, 1, 10.0,
                                                  100.0, 1000.0],
                                        

In [26]:
print(model.best_params_)

{'subsample': 1.0, 'n_estimators': 1000, 'min_child_weight': 10, 'max_depth': 10, 'learning_rate': 0.8, 'lambda': 0, 'gamma': 1, 'colsample_bytree': 0.8, 'alpha': 0.001}


In [27]:
best_estimator = XGBClassifier(n_jobs=20, use_label_encoder=False, **model.best_params_).fit(XF_train, unrolled_train["label"].values)



In [28]:
import pickle

with open("ranker.pkl", "wb") as f:
    pickle.dump(best_estimator, f)

In [29]:
preds = best_estimator.predict_proba(XF_val)
print(roc_auc_score(unrolled_val["label"], preds[:, 1]))

0.5966983830439138


In [30]:
preds = best_estimator.predict_proba(XF_train)
print(roc_auc_score(unrolled_train["label"], preds[:, 1]))

0.9624394327550008


## Sort predictions

In [31]:
submissions = ["submission_21.csv", "submission_23.csv", "submission_30.csv", "submission_40.csv", "submission_36.csv"]
test["name_proposals"] = read_proposals_files(submissions, remove_duplicates=True)
test

Unnamed: 0,description,name_proposals
0,"knit midi dress with a v-neckline, straps and ...","[lace dress trf, ribbed dress trf, lace knit d..."
1,"loose-fitting dress with a round neckline, lon...","[floral print dress, oversized dress with plea..."
2,nautical cap with peak.this item must be retur...,"[long nautical cap, navy cap, nautical cap, fa..."
3,nautical cap with peak. adjustable inner strap...,"[long nautical cap, combined nautical cap, nau..."
4,nautical cap with side button detail.this item...,"[seamed nautical cap, nautical cap with side b..."
...,...,...
1436,striped print cotton cushion cover. cushion fi...,"[striped textured cushion cover, striped cotto..."
1437,rectangular cushion featuring a gnome print.,"[noggin cushion, geometric gnome cushion, limi..."
1438,cotton jersey eye mask featuring an elastic ba...,"[c ocular mask, jersey eye mask, cotton jersey..."
1439,padded chipboard hanger featuring an iron hook...,"[padded chipboard hanger (set of 3), paisley j..."


In [32]:
unrolled_test = unroll_test_data(test["description"], test["name_proposals"])
unrolled_test

Unnamed: 0,name,description,original_row
0,lace dress trf,"knit midi dress with a v-neckline, straps and ...",0
1,ribbed dress trf,"knit midi dress with a v-neckline, straps and ...",0
2,lace knit dress,"knit midi dress with a v-neckline, straps and ...",0
3,contrast knit dress,"knit midi dress with a v-neckline, straps and ...",0
4,knit dress with matching detail,"knit midi dress with a v-neckline, straps and ...",0
...,...,...,...
49949,iron and wood clothes hanger (pack of 5),iron hanger suitable for hanging all kinds of ...,1440
49950,textured iron hanger (set of 5),iron hanger suitable for hanging all kinds of ...,1440
49951,slim fit iron hanger (pack of 5),iron hanger suitable for hanging all kinds of ...,1440
49952,super thin iron hanger (pack of 5),iron hanger suitable for hanging all kinds of ...,1440


In [33]:
%%time
XF_test = feature_generator.transform(unrolled_test[["name", "description"]])

CPU times: user 58.2 s, sys: 3.56 s, total: 1min 1s
Wall time: 1min 3s


In [34]:
%%time
unrolled_test["score"] = best_estimator.predict_proba(XF_test)[:, 1]
unrolled_test

CPU times: user 20.5 s, sys: 1.81 s, total: 22.4 s
Wall time: 5.76 s


Unnamed: 0,name,description,original_row,score
0,lace dress trf,"knit midi dress with a v-neckline, straps and ...",0,0.593053
1,ribbed dress trf,"knit midi dress with a v-neckline, straps and ...",0,0.054891
2,lace knit dress,"knit midi dress with a v-neckline, straps and ...",0,0.122756
3,contrast knit dress,"knit midi dress with a v-neckline, straps and ...",0,0.020826
4,knit dress with matching detail,"knit midi dress with a v-neckline, straps and ...",0,0.519997
...,...,...,...,...
49949,iron and wood clothes hanger (pack of 5),iron hanger suitable for hanging all kinds of ...,1440,0.001595
49950,textured iron hanger (set of 5),iron hanger suitable for hanging all kinds of ...,1440,0.069920
49951,slim fit iron hanger (pack of 5),iron hanger suitable for hanging all kinds of ...,1440,0.037505
49952,super thin iron hanger (pack of 5),iron hanger suitable for hanging all kinds of ...,1440,0.003563


In [35]:
from postprocessing import reroll_score_test_data

proposals = reroll_score_test_data(unrolled_test)
proposals

0       [lace dress trf, knit dress with matching lace...
1       [long pleated dress trf, animal print dress tr...
2       [limited edition nautical cap, asymmetric naut...
3       [n nautical cap, c nautical cap, cap with inne...
4       [nautical cap with flap, seashell nautical cap...
                              ...                        
1436    [cushion cover with striped print, striped sof...
1437    [nome cushion, noggy cushion, accesories 05, a...
1438    [cotton jersey eye mask, botton jersey eye mas...
1439    [paisley jacquard padded hanger (pack of 3), p...
1440    [iron hanger (set of 5), slim hanger (pack of ...
Name: proposals, Length: 1441, dtype: object

In [36]:
from postprocessing import save_submission

save_submission(proposals.values, "submission_44")
save_submission(proposals.values, "submission_44", zip=False)

## Results

|Model|Train AUC|Val AUC|
|-----|---------|-------|
|XGB 10 trees|0.576601653865533|0.5696578534991624|
|XGB 100 trees|0.6027082266547432|0.6885317734719824|
|XGB 100 trees|0.7342786432389221|0.605825065246392|