In [1]:
# Switch the kernel's working directory to the fastText dataset root.
# NOTE(review): the path echoed below differs from the one requested here;
# presumably /workspace/datasets is a symlink -- confirm.
%cd /workspace/datasets/fasttext

/home/jupyter/.kaggle/datasets/fasttext


## Function to run category training experiment

Each run is kept in its own directory, named after its model parameters.

In [2]:
def run_exp(run_subdir=None, epochs=5, lr=0.1, ngrams=1, transform=False, min_products=0, max_depth=0):
    """Prepare data, train a fastText classifier, and evaluate it on train and test.

    Runs four external commands in order, echoing each one (prefixed with
    ``+``) before it starts:

    1. ``createContentTrainingData.py prepare-input`` for this run.
    2. ``fasttext supervised`` on ``products.train``, writing ``model.*``.
    3. ``fasttext test`` of the model against ``products.train``.
    4. ``fasttext test`` of the model against ``products.test``.

    Steps 2-4 run with the run directory as their working directory.

    Args:
        run_subdir: Directory name under ``/workspace/datasets/fasttext/``
            used for this run. When ``None``, a name encoding all the other
            parameters is generated so that each configuration gets its own
            directory.
        epochs: Passed to fastText as ``-epoch``.
        lr: Passed to fastText as ``-lr``.
        ngrams: Passed to fastText as ``-wordNgrams``.
        transform: Forwarded to ``prepare-input`` as ``--transform``.
        min_products: Forwarded to ``prepare-input`` as ``--min_products``.
        max_depth: Forwarded to ``prepare-input`` as ``--max_depth``.

    Returns:
        ``pathlib.Path`` of the run directory.

    Raises:
        subprocess.CalledProcessError: If any step exits with a non-zero
            status. Later steps are not attempted, so a failed data
            preparation can no longer be followed by training on stale files.
        FileNotFoundError: If ``python`` or ``fasttext`` cannot be found on
            ``PATH`` (or the run directory does not exist when it is needed).
    """
    import subprocess
    from pathlib import Path

    if run_subdir is None:
        run_subdir = f"run.e{epochs}.lr{lr}.ng{ngrams}.tr{int(transform)}.mp{min_products}.md{max_depth}"
    run_dir = Path("/workspace/datasets/fasttext/") / run_subdir

    def _run(args, cwd=None):
        # Echo the command, then execute it without a shell so that argument
        # values are never re-parsed; check=True turns a failure into an error.
        argv = [str(arg) for arg in args]
        print("\n+ " + " ".join(argv))
        subprocess.run(argv, cwd=cwd, check=True)

    script = "/workspace/search_with_machine_learning_course/week3/createContentTrainingData.py"
    _run([
        "python", script, "prepare-input",
        "--run_subdir", run_subdir,
        "--transform", transform,
        "--min_products", min_products,
        "--max_depth", max_depth,
    ])

    _run(
        [
            "fasttext", "supervised",
            "-input", "products.train",
            "-output", "model",
            "-epoch", epochs,
            "-lr", lr,
            "-wordNgrams", ngrams,
        ],
        cwd=run_dir,
    )

    # Score the model on the data it was trained on, then on the held-out split.
    _run(["fasttext", "test", "model.bin", "products.train"], cwd=run_dir)
    _run(["fasttext", "test", "model.bin", "products.test"], cwd=run_dir)

    return run_dir


### Base run with default parameters and no text preprocessing

Precision on the training data itself is < 15%, which suggests severe undertraining.

In [3]:
# Baseline: every hyperparameter is left at its default (5 epochs, lr 0.1,
# unigrams, no transform, no pruning). run_subdir="." puts the data and model
# directly in the dataset root instead of a parameter-named run directory.
run_exp(run_subdir=".")


+ python /workspace/search_with_machine_learning_course/week3/createContentTrainingData.py prepare-input --run_subdir . --transform False --min_products 0 --max_depth 0
Reading df from /workspace/datasets/fasttext/pruned_products_df.pk
min_products = 0: keeping 1952/1952 categories and 115358/115358 rows.
Writing 10000 rows to /workspace/datasets/fasttext/products.train
Writing 10000 rows to /workspace/datasets/fasttext/products.test

+ fasttext supervised -input products.train -output model -epoch 5 -lr 0.1 -wordNgrams 1


Read 0M words
Number of words:  11212
Number of labels: 1358
Progress: 100.0% words/sec/thread:   13726 lr:  0.000000 avg.loss: 13.185384 ETA:   0h 0m 0s



+ fasttext test model.bin products.train
N	10000
P@1	0.135
R@1	0.135

+ fasttext test model.bin products.test
N	9705
P@1	0.141
R@1	0.141


PosixPath('/workspace/datasets/fasttext')

## Experiments

#### Epochs = 25

In [4]:
# Only the epoch count changes (5 -> 25). run_subdir is omitted, so the run
# directory name is derived from the parameters.
run_exp(epochs=25)


+ python /workspace/search_with_machine_learning_course/week3/createContentTrainingData.py prepare-input --run_subdir run.e25.lr0.1.ng1.tr0.mp0.md0 --transform False --min_products 0 --max_depth 0
Reading df from /workspace/datasets/fasttext/pruned_products_df.pk
min_products = 0: keeping 1952/1952 categories and 115358/115358 rows.
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr0.1.ng1.tr0.mp0.md0/products.train
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr0.1.ng1.tr0.mp0.md0/products.test

+ fasttext supervised -input products.train -output model -epoch 25 -lr 0.1 -wordNgrams 1


Read 0M words
Number of words:  11212
Number of labels: 1358
Progress: 100.0% words/sec/thread:   13633 lr:  0.000000 avg.loss:  6.268315 ETA:   0h 0m 0s



+ fasttext test model.bin products.train
N	10000
P@1	0.524
R@1	0.524

+ fasttext test model.bin products.test
N	9705
P@1	0.458
R@1	0.458


PosixPath('/workspace/datasets/fasttext/run.e25.lr0.1.ng1.tr0.mp0.md0')

#### Epochs = 25, LR = 1

In [5]:
# Same 25 epochs, with the learning rate raised from the default 0.1 to 1.0.
run_exp(epochs=25, lr=1.0)


+ python /workspace/search_with_machine_learning_course/week3/createContentTrainingData.py prepare-input --run_subdir run.e25.lr1.0.ng1.tr0.mp0.md0 --transform False --min_products 0 --max_depth 0
Reading df from /workspace/datasets/fasttext/pruned_products_df.pk
min_products = 0: keeping 1952/1952 categories and 115358/115358 rows.
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng1.tr0.mp0.md0/products.train
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng1.tr0.mp0.md0/products.test

+ fasttext supervised -input products.train -output model -epoch 25 -lr 1.0 -wordNgrams 1


Read 0M words
Number of words:  11212
Number of labels: 1358
Progress: 100.0% words/sec/thread:   13722 lr:  0.000000 avg.loss:  1.017911 ETA:   0h 0m 0s  2.3% words/sec/thread:   13559 lr:  0.976711 avg.loss: 13.188928 ETA:   0h 0m16s



+ fasttext test model.bin products.train
N	10000
P@1	0.995
R@1	0.995

+ fasttext test model.bin products.test
N	9705
P@1	0.629
R@1	0.629


PosixPath('/workspace/datasets/fasttext/run.e25.lr1.0.ng1.tr0.mp0.md0')

#### Epochs=25, LR=1.0, Bigrams

In [6]:
# Adds word bigrams on top of the previous run (passed to fastText as -wordNgrams 2).
run_exp(epochs=25, lr=1.0, ngrams=2)


+ python /workspace/search_with_machine_learning_course/week3/createContentTrainingData.py prepare-input --run_subdir run.e25.lr1.0.ng2.tr0.mp0.md0 --transform False --min_products 0 --max_depth 0
Reading df from /workspace/datasets/fasttext/pruned_products_df.pk
min_products = 0: keeping 1952/1952 categories and 115358/115358 rows.
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr0.mp0.md0/products.train
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr0.mp0.md0/products.test

+ fasttext supervised -input products.train -output model -epoch 25 -lr 1.0 -wordNgrams 2


Read 0M words
Number of words:  11212
Number of labels: 1358
Progress: 100.0% words/sec/thread:   13796 lr:  0.000000 avg.loss:  1.294458 ETA:   0h 0m 0s



+ fasttext test model.bin products.train
N	10000
P@1	0.995
R@1	0.995

+ fasttext test model.bin products.test
N	9705
P@1	0.613
R@1	0.613


PosixPath('/workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr0.mp0.md0')

#### Epochs=25, LR=1.0, Bigrams, Analyzer

In [7]:
# transform=True is forwarded to the prepare-input step as --transform True;
# this is the "Analyzer" variant named in the heading.
run_exp(epochs=25, lr=1.0, ngrams=2, transform=True)


+ python /workspace/search_with_machine_learning_course/week3/createContentTrainingData.py prepare-input --run_subdir run.e25.lr1.0.ng2.tr1.mp0.md0 --transform True --min_products 0 --max_depth 0
Reading df from /workspace/datasets/fasttext/pruned_products_df.pk
min_products = 0: keeping 1952/1952 categories and 115358/115358 rows.
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp0.md0/products.train
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp0.md0/products.test

+ fasttext supervised -input products.train -output model -epoch 25 -lr 1.0 -wordNgrams 2


Read 0M words
Number of words:  9370
Number of labels: 1358
Progress: 100.0% words/sec/thread:   12011 lr:  0.000000 avg.loss:  1.140114 ETA:   0h 0m 0s



+ fasttext test model.bin products.train
N	10000
P@1	0.995
R@1	0.995

+ fasttext test model.bin products.test
N	9705
P@1	0.623
R@1	0.623


PosixPath('/workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp0.md0')

#### Epochs=25, LR=1.0, Bigrams, Analyzer, Min Products={50, 100, 150, 200}

In [8]:
# min_products sweep, step 1 of 4: min_products=50 (forwarded as --min_products).
run_exp(epochs=25, lr=1.0, ngrams=2, transform=True, min_products=50)


+ python /workspace/search_with_machine_learning_course/week3/createContentTrainingData.py prepare-input --run_subdir run.e25.lr1.0.ng2.tr1.mp50.md0 --transform True --min_products 50 --max_depth 0
Reading df from /workspace/datasets/fasttext/pruned_products_df.pk
min_products = 50: keeping 520/1952 categories and 93126/115358 rows.
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp50.md0/products.train
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp50.md0/products.test

+ fasttext supervised -input products.train -output model -epoch 25 -lr 1.0 -wordNgrams 2


Read 0M words
Number of words:  8962
Number of labels: 520
Progress: 100.0% words/sec/thread:   29461 lr:  0.000000 avg.loss:  0.618399 ETA:   0h 0m 0s



+ fasttext test model.bin products.train
N	10000
P@1	0.994
R@1	0.994

+ fasttext test model.bin products.test
N	10000
P@1	0.737
R@1	0.737


PosixPath('/workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp50.md0')

In [9]:
# min_products sweep, step 2 of 4: min_products=100.
run_exp(epochs=25, lr=1.0, ngrams=2, transform=True, min_products=100)


+ python /workspace/search_with_machine_learning_course/week3/createContentTrainingData.py prepare-input --run_subdir run.e25.lr1.0.ng2.tr1.mp100.md0 --transform True --min_products 100 --max_depth 0
Reading df from /workspace/datasets/fasttext/pruned_products_df.pk
min_products = 100: keeping 269/1952 categories and 75501/115358 rows.
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp100.md0/products.train
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp100.md0/products.test

+ fasttext supervised -input products.train -output model -epoch 25 -lr 1.0 -wordNgrams 2


Read 0M words
Number of words:  8360
Number of labels: 269
Progress: 100.0% words/sec/thread:   55138 lr:  0.000000 avg.loss:  0.306822 ETA:   0h 0m 0s



+ fasttext test model.bin products.train
N	10000
P@1	0.994
R@1	0.994

+ fasttext test model.bin products.test
N	10000
P@1	0.814
R@1	0.814


PosixPath('/workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp100.md0')

In [10]:
# min_products sweep, step 3 of 4: min_products=150.
run_exp(epochs=25, lr=1.0, ngrams=2, transform=True, min_products=150)


+ python /workspace/search_with_machine_learning_course/week3/createContentTrainingData.py prepare-input --run_subdir run.e25.lr1.0.ng2.tr1.mp150.md0 --transform True --min_products 150 --max_depth 0
Reading df from /workspace/datasets/fasttext/pruned_products_df.pk
min_products = 150: keeping 168/1952 categories and 63188/115358 rows.
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp150.md0/products.train
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp150.md0/products.test

+ fasttext supervised -input products.train -output model -epoch 25 -lr 1.0 -wordNgrams 2


Read 0M words
Number of words:  7885
Number of labels: 168
Progress: 100.0% words/sec/thread:   85334 lr:  0.000000 avg.loss:  0.221674 ETA:   0h 0m 0s



+ fasttext test model.bin products.train
N	10000
P@1	0.992
R@1	0.992

+ fasttext test model.bin products.test
N	10000
P@1	0.847
R@1	0.847


PosixPath('/workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp150.md0')

In [11]:
# min_products sweep, step 4 of 4: min_products=200.
run_exp(epochs=25, lr=1.0, ngrams=2, transform=True, min_products=200)


+ python /workspace/search_with_machine_learning_course/week3/createContentTrainingData.py prepare-input --run_subdir run.e25.lr1.0.ng2.tr1.mp200.md0 --transform True --min_products 200 --max_depth 0
Reading df from /workspace/datasets/fasttext/pruned_products_df.pk
min_products = 200: keeping 113/1952 categories and 53654/115358 rows.
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp200.md0/products.train
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp200.md0/products.test

+ fasttext supervised -input products.train -output model -epoch 25 -lr 1.0 -wordNgrams 2


Read 0M words
Number of words:  7390
Number of labels: 113
Progress: 100.0% words/sec/thread:  120549 lr:  0.000000 avg.loss:  0.147550 ETA:   0h 0m 0s100.0% words/sec/thread:  120554 lr: -0.000095 avg.loss:  0.147550 ETA:   0h 0m 0s



+ fasttext test model.bin products.train
N	10000
P@1	0.991
R@1	0.991

+ fasttext test model.bin products.test
N	10000
P@1	0.89
R@1	0.89


PosixPath('/workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp200.md0')

#### Epochs=25, LR=1.0, Bigrams, Analyzer, Min Products=50, Max Depth={1, 2, 3, 4}

In [12]:
# max_depth sweep at min_products=50, step 1 of 4: max_depth=1.
# NOTE(review): the output below reports "categories pruned from 508 to 1" and
# "Number of labels: 1", so the P@1 = 1 result is trivial rather than a real score.
run_exp(epochs=25, lr=1.0, ngrams=2, transform=True, min_products=50, max_depth=1)


+ python /workspace/search_with_machine_learning_course/week3/createContentTrainingData.py prepare-input --run_subdir run.e25.lr1.0.ng2.tr1.mp50.md1 --transform True --min_products 50 --max_depth 1
Reading df from /workspace/datasets/fasttext/pruned_products_df.pk
min_products = 50: keeping 520/1952 categories and 93126/115358 rows.
max_depth=1: categories pruned from 508 to 1
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp50.md1/products.train
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp50.md1/products.test

+ fasttext supervised -input products.train -output model -epoch 25 -lr 1.0 -wordNgrams 2


Read 0M words
Number of words:  8900
Number of labels: 1
Progress: 100.0% words/sec/thread:  690027 lr:  0.000000 avg.loss:  0.000000 ETA:   0h 0m 0s



+ fasttext test model.bin products.train
N	10000
P@1	1
R@1	1

+ fasttext test model.bin products.test
N	10000
P@1	1
R@1	1


PosixPath('/workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp50.md1')

In [13]:
# max_depth sweep at min_products=50, step 2 of 4: max_depth=2.
run_exp(epochs=25, lr=1.0, ngrams=2, transform=True, min_products=50, max_depth=2)


+ python /workspace/search_with_machine_learning_course/week3/createContentTrainingData.py prepare-input --run_subdir run.e25.lr1.0.ng2.tr1.mp50.md2 --transform True --min_products 50 --max_depth 2
Reading df from /workspace/datasets/fasttext/pruned_products_df.pk
min_products = 50: keeping 520/1952 categories and 93126/115358 rows.
max_depth=2: categories pruned from 508 to 18
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp50.md2/products.train
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp50.md2/products.test

+ fasttext supervised -input products.train -output model -epoch 25 -lr 1.0 -wordNgrams 2


Read 0M words
Number of words:  8900
Number of labels: 18
Progress: 100.0% words/sec/thread:  345634 lr:  0.000000 avg.loss:  0.059191 ETA:   0h 0m 0s



+ fasttext test model.bin products.train
N	10000
P@1	0.999
R@1	0.999

+ fasttext test model.bin products.test
N	10000
P@1	0.936
R@1	0.936


PosixPath('/workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp50.md2')

In [14]:
# max_depth sweep at min_products=50, step 3 of 4: max_depth=3.
run_exp(epochs=25, lr=1.0, ngrams=2, transform=True, min_products=50, max_depth=3)


+ python /workspace/search_with_machine_learning_course/week3/createContentTrainingData.py prepare-input --run_subdir run.e25.lr1.0.ng2.tr1.mp50.md3 --transform True --min_products 50 --max_depth 3
Reading df from /workspace/datasets/fasttext/pruned_products_df.pk
min_products = 50: keeping 520/1952 categories and 93126/115358 rows.
max_depth=3: categories pruned from 508 to 110
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp50.md3/products.train
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp50.md3/products.test

+ fasttext supervised -input products.train -output model -epoch 25 -lr 1.0 -wordNgrams 2


Read 0M words
Number of words:  8900
Number of labels: 110
Progress: 100.0% words/sec/thread:  114345 lr:  0.000000 avg.loss:  0.075623 ETA:   0h 0m 0s



+ fasttext test model.bin products.train
N	10000
P@1	0.999
R@1	0.999

+ fasttext test model.bin products.test
N	10000
P@1	0.915
R@1	0.915


PosixPath('/workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp50.md3')

In [15]:
# max_depth sweep at min_products=50, step 4 of 4: max_depth=4.
run_exp(epochs=25, lr=1.0, ngrams=2, transform=True, min_products=50, max_depth=4)


+ python /workspace/search_with_machine_learning_course/week3/createContentTrainingData.py prepare-input --run_subdir run.e25.lr1.0.ng2.tr1.mp50.md4 --transform True --min_products 50 --max_depth 4
Reading df from /workspace/datasets/fasttext/pruned_products_df.pk
min_products = 50: keeping 520/1952 categories and 93126/115358 rows.
max_depth=4: categories pruned from 508 to 302
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp50.md4/products.train
Writing 10000 rows to /workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp50.md4/products.test

+ fasttext supervised -input products.train -output model -epoch 25 -lr 1.0 -wordNgrams 2


Read 0M words
Number of words:  8900
Number of labels: 302
Progress: 100.0% words/sec/thread:   48294 lr:  0.000000 avg.loss:  0.308843 ETA:   0h 0m 0s



+ fasttext test model.bin products.train
N	10000
P@1	0.998
R@1	0.998

+ fasttext test model.bin products.test
N	10000
P@1	0.853
R@1	0.853


PosixPath('/workspace/datasets/fasttext/run.e25.lr1.0.ng2.tr1.mp50.md4')