In [1]:
import deepmatcher as dm
import torch

In [2]:
# Load and preprocess the three CSV splits found under `sample_data`.
# Word vectors are downloaded if necessary, so this step can take several minutes.
# The `left_id` / `right_id` columns are passed as `ignore_columns`.
split_files = {
    'train': 'amz_goog_train.csv',
    'validation': 'amz_goog_validation.csv',
    'test': 'amz_goog_test.csv',
}
train, validation, test = dm.process(
    path='sample_data',
    ignore_columns=('left_id', 'right_id'),
    **split_files)

In [3]:
# Custom attribute comparator, used in place of the built-in options.
def my_attr_comparator(left, right):
    """Build a comparison feature tensor from two attribute tensors.

    Concatenates, along the last dimension of ``left``, four pieces:
    ``left``, ``right``, the element-wise absolute difference
    ``|left - right|``, and the element-wise product ``left * right``.

    Args:
        left: torch tensor for the left-hand attribute.
        right: torch tensor for the right-hand attribute.

    Returns:
        A tensor whose last dimension is four times that of the inputs
        (when both inputs share the same shape).
    """
    last_axis = left.dim() - 1
    abs_difference = (left - right).abs()
    elementwise_product = left * right
    pieces = (left, right, abs_difference, elementwise_product)
    return torch.cat(pieces, last_axis)

# Assemble a customized deep-learning model for EM, one component at a time.

# Word contextualizer: 2-layer LSTM with dropout and a highway bypass.
word_contextualizer = dm.word_contextualizers.RNN(
    unit_type='LSTM',
    layers=2,
    dropout=0.2,
    bypass_network='highway')

# Word comparator: attention with a bilinear alignment network and a
# 3-layer highway comparison network.
alignment_network = dm.modules.AlignmentNetwork(
    style='bilinear',
    transform_network='2-layer-residual-tanh')
comparison_network = dm.modules.Transform(
    '3-layer-highway',
    hidden_size=400,
    output_size=200)
word_comparator = dm.word_comparators.Attention(
    input_dropout=0.3,
    alignment_network=alignment_network,
    comparison_network=comparison_network)

# Word aggregator: attention combined with a 3-layer GRU, max-pooled.
aggregator_rnn = dm.modules.RNN(
    unit_type='GRU',
    layers=3,
    dropout=0.1,
    bypass_network='residual')
word_aggregator = dm.word_aggregators.AttentionWithRNN(
    input_dropout=0.2,
    rnn=aggregator_rnn,
    rnn_pool_style='max')

# Hybrid attribute summarizer built from the three word-level pieces above.
attr_summarizer = dm.attr_summarizers.Hybrid(
    word_contextualizer=word_contextualizer,
    word_comparator=word_comparator,
    word_aggregator=word_aggregator)

# The attribute comparator is supplied as a zero-argument callable that
# wraps `my_attr_comparator` in a `dm.modules.Lambda`.
custom_model = dm.MatchingModel(
    attr_summarizer=attr_summarizer,
    attr_comparator=lambda: dm.modules.Lambda(my_attr_comparator),
    classifier=dm.Classifier(transform_network='1-layer-residual-glu',
                             hidden_size=300),
    finetune_embeddings=True)

# Fit the custom model on the training split, evaluating on the validation
# split; `best_save_path` names the file used for the best model.
train_options = dict(
    batch_size=16,
    epochs=10,
    best_save_path='custom_model.pth',
    pos_weight=1.3,
)
custom_model.run_train(train, validation, **train_options)

* Number of trainable parameters: 15838055
===>  TRAIN Epoch 1 :


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:52


Finished Epoch 1 || Run Time:   50.0 | Load Time:    2.5 || F1:  12.08 | Prec:  43.75 | Rec:   7.01 || Ex/s: 130.86

===>  EVAL Epoch 1 :


0% [████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:07


Finished Epoch 1 || Run Time:    6.8 | Load Time:    0.8 || F1:  27.40 | Prec:  38.17 | Rec:  21.37 || Ex/s: 302.11

* Best F1: 27.397260273972602
Saving best model...


OSError: [Errno 122] Disk quota exceeded

In [None]:
# Restore the model state saved at 'custom_model.pth', then evaluate it on
# the test split.
checkpoint_path = 'custom_model.pth'
custom_model.load_state(checkpoint_path)
custom_model.run_eval(test)