### Requirements
```
python3.6  
jupyter == 1.0.0  
pytorch == 1.7.0  
torchvision  
scikit-learn == 0.23.2  
lightgbm == 3.1.0  
numpy == 1.19.4  
pandas == 1.1.4  
fasttext  
textdistance[extras]  
fuzzywuzzy  
```

In [1]:
from utils import *
from joblib import dump, load

import pandas as pd
import fasttext as ft

# == Directories ==================================================================================== #
# Input data, generated artefacts, and persisted models (models live under the output folder).
data_path = './data'
output_path = './output'
model_path = f'{output_path}/model'

# == Script Parameters ============================================================================== #
# Image embedding settings: input size, hub model name, and batch size for feature extraction
IMG_SIZE = 300
IMG_EMB_NAME = 'resnet34'
IMG_BATCH = 110

# Histogram settings (not referenced by the cells visible in this notebook)
HIST_BIN_SIZE = 15

# Text embedding settings: fastText vector dimension and number of training epochs
TXT_N_FEATURES = 100
TXT_N_EPOCH = 1000

In [8]:
# == Testing Parameters ============================================================================= #
# Switches. For images and the text model, True loads the saved artefact and False recomputes it.
LOAD_IMG_FEATURES = False
LOAD_HIST_FEATURES = False
LOAD_TEXT_MODEL = False
# True adds a second preprocessing pass with func=remove_stopwords
USE_CLEAN_TEXT = False

# Artefact locations, all under model_path; the two feature caches carry IMG_SIZE in their name
img_features_path = f'{model_path}/test_img_feats_{IMG_SIZE}.joblib'
hist_features_path = f'{model_path}/test_hist_feats_{IMG_SIZE}.joblib'
txt_emb_model_path = f'{model_path}/txt_emb.bin'
lgbm_model_path = f'{model_path}/lgbm_hyped.joblib'
lgbm_final_features = f'{model_path}/lgbm_final_features.joblib'

In [3]:
# == Loading DataFrames ============================================================================= #
# The train, sample and test tables live under data_path and use their first CSV column as index.
train_df, sample_df, test_df = (
    pd.read_csv(f'{data_path}/{csv_name}', index_col=0)
    for csv_name in ('new_training_set.csv', 'new_test_sample.csv', 'new_test_set.csv')
)
# The extra titles are read from the working directory and keep pandas' default index.
extra_df = pd.read_csv('preprocessed_extra_titles.csv')

In [6]:
# Echo the configured image size as the cell output.
IMG_SIZE

300

In [7]:
# == Getting Image Features ========================================================================= #
# Image folder handed to the feature extractor, and its nested 'img' folder
# (img_dir is not referenced by the cells visible in this notebook).
img_subdir = f'{data_path}/img'
img_dir = f'{img_subdir}/img'

if not LOAD_IMG_FEATURES:
    # Recompute the features with the hub model, then cache them for later runs.
    print("Recalculating test image features")
    img_model = torch_load_hub_model(IMG_EMB_NAME, cut=-1)
    test_img_feats = get_batch_image_feature_vectors(
        img_model,
        img_subdir,
        img_size=IMG_SIZE,
        batch_size=IMG_BATCH,
    )
    print("Saving test image features to %s" % img_features_path)
    dump(test_img_feats, img_features_path)
else:
    # Reuse the cached features written by a previous run.
    print("Loading image features from %s" % img_features_path)
    test_img_feats = load(img_features_path)

Recalculating test image features


Using cache found in /home/budiryan/.cache/torch/hub/pytorch_vision_v0.6.0


Saving test image features to ./output/model/test_img_feats_300.joblib


In [9]:
# == Getting FT Embedding Model ===================================================================== #
# Either reuse a saved fastText model or train a new unsupervised one on every available title.
if LOAD_TEXT_MODEL:
    print("Loading text model from %s" % txt_emb_model_path)
    text_model = ft.load_model(txt_emb_model_path)
else:
    print("Preprocessing titles")
    # Stack all title columns (train, sample, extra, test) into one single-column frame.
    texts = pd.concat([train_df['title_1'], train_df['title_2'], sample_df['title_1'],
                       sample_df['title_2'], extra_df['Title'], test_df['title_1'], test_df['title_2']], axis=0)
    # Discard the original row labels and name the remaining (unnamed) column 'Title'.
    texts = texts.reset_index().drop('index', axis=1)
    texts = texts.rename(columns={0: 'Title'})
    texts.index.name = 'pair_index'
    texts = preprocess_text_df(texts, txt_cols=['Title'])
    if USE_CLEAN_TEXT:
        # The column list must be passed as the `txt_cols` keyword, as in the call above.
        # Writing `txt_cols['Title']` subscripts an undefined name and raises NameError.
        texts = preprocess_text_df(texts, txt_cols=['Title'], func=remove_stopwords)
    # fastText trains from a plain text file, so write one title per row without header or index.
    texts.to_csv(f'{output_path}/titles.txt', header=False, index=False)
    print("Unsupervised training for text model for test data")
    text_model = ft.train_unsupervised(f'{output_path}/titles.txt', minn=3, maxn=6, dim=TXT_N_FEATURES,
                                       epoch=TXT_N_EPOCH)
    print("Saving text model to %s" % txt_emb_model_path)
    text_model.save_model(txt_emb_model_path)

Loading text model from ./output/model/txt_emb.bin




In [10]:
# == Building Features DataFrame ==================================================================== #
def _prepare_text_frame(source_df):
    """Preprocess a copy of `source_df`; run a second remove_stopwords pass if USE_CLEAN_TEXT is set."""
    prepared = preprocess_text_df(source_df.copy())
    if USE_CLEAN_TEXT:
        prepared = preprocess_text_df(prepared, func=remove_stopwords)
    return prepared


# Train and test tables go through the same preprocessing, each on its own copy.
train_text_df = _prepare_text_frame(train_df)
test_text_df = _prepare_text_frame(test_df)


In [13]:
# Assemble the training feature matrix: text features first, image features second.
# NOTE(review): train image features are built from `test_img_feats`; presumably that object
# covers every image under img_subdir rather than only test images — confirm.
img_train_df = build_img_feats(train_df, test_img_feats)
text_train_df = build_text_feats(train_text_df, text_model)
X_train = pd.concat([text_train_df, img_train_df], axis='columns')

Build handcraft text feats..


In [14]:
# Target column of the training table.
y_train = train_df.loc[:, 'Label']

In [15]:
# Assemble the test feature matrix with the same column layout as X_train: text, then image.
img_test_df = build_img_feats(test_df, test_img_feats)
text_test_df = build_text_feats(test_text_df, text_model)
X_test = pd.concat([text_test_df, img_test_df], axis='columns')

Build handcraft text feats..


In [18]:
# Load the persisted row labels that define the train/validation split.
# joblib files are pickle-based: only load files from a trusted source.
train_idx = load(filename='train_val_index/train_idx.joblib')
val_idx = load(filename='train_val_index/val_idx.joblib')

In [19]:
# Select the rows of each split by label, keeping features and targets aligned.
X_train_train, y_train_train = X_train.loc[train_idx], y_train.loc[train_idx]
X_train_val, y_train_val = X_train.loc[val_idx], y_train.loc[val_idx]

In [20]:
# Previously saved v7 model; its hyper-parameters are the starting point below.
v7_model = load(filename=f'{model_path}/lgbm_hyped_v7.joblib')

In [21]:
# Feature selection saved alongside the v7 model; used to index X_train / X_test below.
v7_final_feats = load(filename=f'{model_path}/final_features_v7.joblib')

In [121]:
# Start from the hyper-parameters of the saved v7 model.
params = v7_model.get_params(deep=True)

In [122]:
# Display the parameter dict retrieved from the v7 model.
params

{'boosting_type': 'gbdt',
 'class_weight': 'balanced',
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': 14,
 'min_child_samples': 20,
 'min_child_weight': 1,
 'min_split_gain': 0.0,
 'n_estimators': 450,
 'n_jobs': -1,
 'num_leaves': 64,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 0.8,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'boosting': 'gbdt',
 'min_sum_hessian_in_leaf': 0.01,
 'min_data_in_leaf': 80,
 'lambda_l2': 0,
 'lambda_l1': 0.01,
 'feature_fraction': 0.3,
 'bagging_freq': 5,
 'bagging_fraction': 0.9}

In [123]:
# Overrides relative to v7: shallower trees, plus the extra_trees and path_smooth options.
params.update(max_depth=10, extra_trees=True, path_smooth=0.5)

In [124]:
# Build a fresh, unfitted classifier from the adjusted parameter dict.
# NOTE(review): LGBMClassifier is not imported explicitly in this notebook; presumably it is
# re-exported by `from utils import *` — confirm.
lgbm_hyped = LGBMClassifier(**params)

In [125]:
# Fit on the training split only, so the validation split stays unseen.
# NOTE(review): v7_final_feats is presumably a list of column names — confirm.
lgbm_hyped.fit(X_train_train[v7_final_feats], y_train_train)

LGBMClassifier(bagging_fraction=0.9, bagging_freq=5, boosting='gbdt',
               boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=1.0, extra_trees=True, feature_fraction=0.3,
               importance_type='split', lambda_l1=0.01, lambda_l2=0,
               learning_rate=0.1, max_depth=10, min_child_samples=20,
               min_child_weight=1, min_data_in_leaf=80, min_split_gain=0.0,
               min_sum_hessian_in_leaf=0.01, n_estimators=450, n_jobs=-1,
               num_leaves=64, objective=None, path_smooth=0.5,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=0.8, subsample_for_bin=200000, subsample_freq=0)

In [126]:
from sklearn.metrics import f1_score

# Macro-averaged F1 of the predictions on the validation split (shown as the cell output).
val_preds = lgbm_hyped.predict(X_train_val[v7_final_feats])
f1_score(y_true=y_train_val, y_pred=val_preds, average='macro')

0.8722815130781754

### Submission

In [127]:
# Display the parameter dict that is persisted and used for the submission model below.
params

{'boosting_type': 'gbdt',
 'class_weight': 'balanced',
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': 10,
 'min_child_samples': 20,
 'min_child_weight': 1,
 'min_split_gain': 0.0,
 'n_estimators': 450,
 'n_jobs': -1,
 'num_leaves': 64,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 0.8,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'boosting': 'gbdt',
 'min_sum_hessian_in_leaf': 0.01,
 'min_data_in_leaf': 80,
 'lambda_l2': 0,
 'lambda_l1': 0.01,
 'feature_fraction': 0.3,
 'bagging_freq': 5,
 'bagging_fraction': 0.9,
 'extra_trees': True,
 'path_smooth': 0.5}

In [128]:
# Persist the final hyper-parameters next to the other model artefacts.
dump(value=params, filename=f'{model_path}/final_params.joblib')

['./output/model/final_params.joblib']

In [129]:
# Second, unfitted classifier with the same parameters, reserved for the submission fit.
# NOTE(review): LGBMClassifier is not imported explicitly in this notebook; presumably it is
# re-exported by `from utils import *` — confirm.
lgbm_hyped_submission = LGBMClassifier(**params)

In [130]:
# Refit on the complete training set (train + validation rows) for the submission.
# NOTE(review): v7_final_feats is presumably a list of column names — confirm.
lgbm_hyped_submission.fit(X_train[v7_final_feats], y_train)

LGBMClassifier(bagging_fraction=0.9, bagging_freq=5, boosting='gbdt',
               boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=1.0, extra_trees=True, feature_fraction=0.3,
               importance_type='split', lambda_l1=0.01, lambda_l2=0,
               learning_rate=0.1, max_depth=10, min_child_samples=20,
               min_child_weight=1, min_data_in_leaf=80, min_split_gain=0.0,
               min_sum_hessian_in_leaf=0.01, n_estimators=450, n_jobs=-1,
               num_leaves=64, objective=None, path_smooth=0.5,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=0.8, subsample_for_bin=200000, subsample_freq=0)

In [131]:
# == Generate Submission File ======================================================================= #
# Predict with the classifier that was refit on the full training set.
predictions = lgbm_hyped_submission.predict(X_test[v7_final_feats])

# Single 'Label' column; the frame gets pandas' default integer index, written as 'pair_index'.
# NOTE(review): the index is positional, not test_df's index — confirm the two coincide.
# NOTE(review): the file is tagged v5 while the model/features are v7 — confirm the name.
result = pd.DataFrame({'Label': predictions}).rename_axis('pair_index')
result.to_csv(f'{output_path}/submission_v5.csv')