In [33]:
import pexpect
import pandas as pd
import subprocess
import string
import nltk
from nltk.stem.snowball import SnowballStemmer

In [34]:
# Switch the kernel's working directory to the fastText dataset folder so that
# relative paths used in later cells resolve against it.
%cd /workspace/datasets/fasttext

/home/jupyter/.kaggle/datasets/fasttext


### Extract titles to phone_titles.txt

In [2]:
%%bash
# Trace every command as it runs so the exact invocation appears in the cell output.
set -x
# Extract product titles from the phone products file into phone_titles.txt.
# NOTE(review): --sample_rate 1.0 presumably keeps every title; confirm against extractTitles.py.
python /workspace/search_with_machine_learning_course/week3/extractTitles.py \
    --products_df /workspace/datasets/fasttext/phone_products_df.pk \
    --output /workspace/datasets/fasttext/phone_titles.txt \
    --sample_rate 1.0
# Count the lines in the generated file as a quick sanity check.
wc -l /workspace/datasets/fasttext/phone_titles.txt


+ python /workspace/search_with_machine_learning_course/week3/extractTitles.py --products_df /workspace/datasets/fasttext/phone_products_df.pk --output /workspace/datasets/fasttext/phone_titles.txt --sample_rate 1.0


Writing results to /workspace/datasets/fasttext/phone_titles.txt


+ wc -l /workspace/datasets/fasttext/phone_titles.txt


4862 /workspace/datasets/fasttext/phone_titles.txt


### Text transformation

In [35]:
# Deletes the trademark symbols and every ASCII punctuation character in one translate() pass.
translation_table = str.maketrans("", "", "®©™" + string.punctuation)

# Built once and shared by every call: constructing a SnowballStemmer per
# invocation is repeated work that does not change the result.
_english_stemmer = SnowballStemmer("english")


def transform_training_data(name):
    """Normalize a product title or query word into stemmed, space-separated tokens.

    The text is processed in this order: newlines become spaces, everything is
    lower-cased, characters listed in ``translation_table`` are removed, the
    result is tokenized with ``nltk.word_tokenize``, and each token is reduced
    with the English Snowball stemmer (e.g. ``"headphones"`` -> ``"headphon"``).

    Args:
        name: Raw text to normalize.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    cleaned = name.replace("\n", " ").lower()
    # remove punctuation
    cleaned = cleaned.translate(translation_table)
    return " ".join(_english_stemmer.stem(token) for token in nltk.word_tokenize(cleaned))

### Words used for evaluation

In [36]:
# Query terms used for evaluation, kept in display order.
words = """
    fabshell hurley bluetrek energi
    apple iphone samsung motorola headphones
    galaxy droid ericsson microsd razr
    case protector leather charger unlocked
""".split()

### Evaluation

In [37]:
def run_exp(epochs=25, minCount = 5):
    """Train a fastText skipgram model on phone_titles.txt and display nearest neighbors.

    The model is written to ``phone_title_model.e{epochs}.mc{minCount}`` inside
    the fastText dataset directory. Each entry of the module-level ``words``
    list is normalized with ``transform_training_data`` and sent to an
    interactive ``fasttext nn`` session; the neighbors are shown as one
    DataFrame row per query word.

    Args:
        epochs: Value passed to fastText's ``-epoch`` option.
        minCount: Value passed to fastText's ``-minCount`` option.

    Returns:
        None. The result table is rendered with ``display``.

    Raises:
        subprocess.CalledProcessError: If the training command exits non-zero.
        pexpect.TIMEOUT, pexpect.EOF: If the ``fasttext nn`` prompt never appears.
    """
    work_dir = "/workspace/datasets/fasttext/"
    model = f"phone_title_model.e{epochs}.mc{minCount}"

    # Train model
    train_args = [
        "fasttext", "skipgram",
        "-input", "phone_titles.txt",
        "-output", model,
        "-epoch", str(epochs),
        "-minCount", str(minCount),
    ]
    train_cmd = " ".join(train_args)
    print(f"+ {train_cmd}")
    # check=True stops here on a failed training run instead of letting the
    # evaluation step time out while waiting for a model that was never written.
    subprocess.run(train_args, cwd=work_dir, check=True)

    # Eval on words
    # Matched literally: as a regex, the trailing "?" would make the "d" optional
    # and leave the real "?" unread in the buffer.
    prompt = "Query word?"
    rows = []
    cmd = f"fasttext nn {model}.bin"
    print(f"+ {cmd}")
    # Run in the same directory as training so the relative model path resolves
    # regardless of the kernel's current working directory.
    child = pexpect.spawn(cmd, encoding='utf-8', cwd=work_dir)
    try:
        child.expect_exact(prompt)
        for input_word in words:
            word = transform_training_data(input_word)
            child.sendline(word)
            child.expect_exact(prompt)
            # child.before holds the echoed query line followed by one
            # "<neighbor> <score>" line per result; drop the echo.
            lines = child.before.strip().split("\r\n")[1:]
            splits = [line.split() for line in lines]
            # Skip any line that is not a "<neighbor> <score>" pair.
            neighbors = [f"{o[0]}: {float(o[1]):.3f}" for o in splits if len(o) >= 2]
            rows.append([f"{input_word} ({word})"] + neighbors)
    finally:
        # Always terminate the interactive process, even if parsing fails.
        child.close()
    df = pd.DataFrame(rows)
    display(df)

#### Epochs = 25, Min Count = 5

Baseline model with epochs = 25 and min count = 5. Because this is a fairly small dataset, increasing min count any further did not seem likely to help.

In [38]:
# Baseline configuration with both hyperparameters spelled out.
run_exp(epochs=25, minCount=5)

+ fasttext skipgram -input phone_titles.txt -output phone_title_model.e25.mc5 -epoch 25 -minCount 5


Read 0M words
Number of words:  673
Number of labels: 0
Progress: 100.0% words/sec/thread:  192072 lr:  0.000000 avg.loss:  2.250995 ETA:   0h 0m 0s


+ fasttext nn phone_title_model.e25.mc5.bin


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,fabshell (fabshel),candyshel: 0.982,speck: 0.966,bumper: 0.963,hardshel: 0.958,burton: 0.957,oakley: 0.955,kapok: 0.949,harleydavidson: 0.946,teal: 0.942,appl: 0.939
1,hurley (hurley),oakley: 0.980,harleydavidson: 0.966,macbeth: 0.950,kapok: 0.940,fabshel: 0.931,tekkeon: 0.927,candyshel: 0.925,hardshel: 0.918,gaga: 0.916,4s: 0.911
2,bluetrek (bluetrek),blueant: 0.979,bluetooth: 0.974,bluetoothen: 0.966,bluetoothcompat: 0.962,discoveri: 0.918,explor: 0.914,jabra: 0.906,plantron: 0.903,era: 0.895,stereo: 0.887
3,energi (energi),energ: 0.978,emerg: 0.884,devic: 0.864,mycharg: 0.857,to: 0.854,electron: 0.848,recharg: 0.822,callpod: 0.819,gps: 0.819,usb: 0.805
4,apple (appl),fabshel: 0.939,candyshel: 0.931,iphon: 0.931,teal: 0.930,kapok: 0.926,bumper: 0.912,oakley: 0.908,hardshel: 0.905,4s: 0.905,speck: 0.903
5,iphone (iphon),appl: 0.931,4: 0.902,4s: 0.890,fabshel: 0.880,candyshel: 0.878,kapok: 0.867,tekkeon: 0.866,harleydavidson: 0.861,hardshel: 0.858,oakley: 0.856
6,samsung (samsung),ii: 0.902,galaxi: 0.896,indulg: 0.894,epic: 0.882,nexus: 0.873,focus: 0.866,graviti: 0.866,iii: 0.854,t: 0.840,status: 0.839
7,motorola (motorola),v3: 0.907,razr: 0.907,purs: 0.848,droid: 0.797,superex: 0.796,a855: 0.793,bionic: 0.790,rapid: 0.789,backflip: 0.778,innov: 0.771
8,headphones (headphon),earpollut: 0.985,earbud: 0.959,microphon: 0.939,ozon: 0.933,lux: 0.932,ifrogz: 0.924,onear: 0.909,jammin: 0.907,marley: 0.891,33: 0.874
9,galaxy (galaxi),iii: 0.936,s: 0.930,epic: 0.913,multimedia: 0.898,samsung: 0.896,ii: 0.888,captiv: 0.887,note: 0.875,nexus: 0.867,pebbl: 0.852


#### Comparing epochs = 5, 25, 50, 100

Running with 5 epochs yields scores above 0.999 for every neighbor, so the scores no longer distinguish close neighbors from distant ones and this setting can be safely excluded.

Between 25, 50, and 100 epochs, however, it is not clear which one to pick or how to decide which is better. Also,
even when the relative order of the nearest neighbors remains the same, the scores change a lot with the number of epochs, so any
score threshold depends on the number of epochs.

In [39]:
# Sweep the epoch count while leaving minCount at its default.
for epoch_count in (5, 25, 50, 100):
    run_exp(epochs=epoch_count)

+ fasttext skipgram -input phone_titles.txt -output phone_title_model.e5.mc5 -epoch 5 -minCount 5


Read 0M words
Number of words:  673
Number of labels: 0
Progress: 100.0% words/sec/thread:  183308 lr:  0.000000 avg.loss:  3.072129 ETA:   0h 0m 0s


+ fasttext nn phone_title_model.e5.mc5.bin


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,fabshell (fabshel),hardshel: 1.000,candyshel: 1.000,purpl: 1.000,multicolor: 1.000,silicon: 1.000,harleydavidson: 1.000,rocketfish: 1.000,graphit: 1.000,control: 1.000,pinkwhit: 1.000
1,hurley (hurley),marley: 1.000,harleydavidson: 1.000,rocketfish: 1.000,madden: 1.000,incred: 1.000,antenna: 1.000,multicolor: 1.000,speakerphon: 1.000,accessori: 1.000,pinkwhit: 1.000
2,bluetrek (bluetrek),bluetooth: 1.000,blueant: 1.000,bluetoothen: 1.000,blue: 1.000,bluesilv: 1.000,bluetoothcompat: 1.000,bluewhit: 1.000,blu: 1.000,univers: 1.000,station: 1.000
3,energi (energi),energ: 1.000,graphit: 1.000,madden: 1.000,multi: 1.000,multicolor: 1.000,multimedia: 1.000,accessori: 1.000,brown: 1.000,instal: 1.000,processor: 1.000
4,apple (appl),3gs: 1.000,incas: 1.000,iphon: 1.000,ipod: 1.000,3g3gs: 1.000,4s: 1.000,purpl: 1.000,stand: 1.000,macbeth: 1.000,hardshel: 1.000
5,iphone (iphon),gophon: 1.000,3g3gs: 1.000,appl: 1.000,3gs: 1.000,incas: 1.000,macbeth: 1.000,smartphon: 1.000,headphon: 1.000,3g3g: 1.000,purpl: 1.000
6,samsung (samsung),mytouch: 1.000,softtouch: 1.000,silver: 1.000,unlock: 1.000,nokia: 1.000,galaxi: 1.000,multicolor: 1.000,desktop: 1.000,solar: 1.000,multimedia: 1.000
7,motorola (motorola),nokia: 1.000,desktop: 1.000,multimedia: 1.000,softtouch: 1.000,surfac: 1.000,design: 1.000,model: 1.000,monster: 1.000,bluetoothcompat: 1.000,univers: 1.000
8,headphones (headphon),gophon: 1.000,marley: 1.000,earbud: 1.000,earpollut: 1.000,speakerphon: 1.000,on: 1.000,smartphon: 1.000,harleydavidson: 1.000,ozon: 1.000,microphon: 1.000
9,galaxy (galaxi),interfac: 1.000,nocontract: 1.000,horizont: 1.000,multicolor: 1.000,laptop: 1.000,multimedia: 1.000,instinct: 1.000,stratospher: 1.000,intercept: 1.000,internet: 1.000


+ fasttext skipgram -input phone_titles.txt -output phone_title_model.e25.mc5 -epoch 25 -minCount 5


Read 0M words
Number of words:  673
Number of labels: 0
Progress: 100.0% words/sec/thread:  192191 lr:  0.000000 avg.loss:  2.222176 ETA:   0h 0m 0s


+ fasttext nn phone_title_model.e25.mc5.bin


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,fabshell (fabshel),candyshel: 0.982,kapok: 0.971,speck: 0.971,bumper: 0.970,oakley: 0.969,hardshel: 0.962,burton: 0.961,harleydavidson: 0.959,canopi: 0.956,teal: 0.950
1,hurley (hurley),oakley: 0.982,harleydavidson: 0.969,kapok: 0.950,macbeth: 0.948,fabshel: 0.942,tekkeon: 0.929,gaga: 0.927,candyshel: 0.927,canopi: 0.925,hardshel: 0.923
2,bluetrek (bluetrek),blueant: 0.974,bluetooth: 0.971,bluetoothen: 0.968,bluetoothcompat: 0.959,discoveri: 0.918,explor: 0.913,era: 0.902,jabra: 0.901,stereo: 0.898,plantron: 0.887
3,energi (energi),energ: 0.982,devic: 0.883,emerg: 0.879,electron: 0.871,to: 0.862,mycharg: 0.858,recharg: 0.827,callpod: 0.823,r225: 0.819,data: 0.814
4,apple (appl),kapok: 0.947,iphon: 0.944,fabshel: 0.927,teal: 0.925,candyshel: 0.923,oakley: 0.918,video: 0.914,canopi: 0.913,tekkeon: 0.908,bumper: 0.906
5,iphone (iphon),appl: 0.944,4: 0.915,kapok: 0.892,4s: 0.887,fabshel: 0.885,candyshel: 0.885,oakley: 0.876,tekkeon: 0.873,canopi: 0.866,bumper: 0.866
6,samsung (samsung),epic: 0.874,ii: 0.874,galaxi: 0.870,nexus: 0.870,indulg: 0.867,note: 0.865,graviti: 0.864,vibrant: 0.831,googl: 0.829,focus: 0.814
7,motorola (motorola),v3: 0.900,razr: 0.885,purs: 0.829,a855: 0.780,bionic: 0.775,cellular: 0.773,superex: 0.771,vehicl: 0.770,innov: 0.769,carri: 0.766
8,headphones (headphon),earpollut: 0.982,earbud: 0.959,microphon: 0.940,ozon: 0.924,ifrogz: 0.921,lux: 0.920,jammin: 0.910,marley: 0.899,onear: 0.897,hous: 0.887
9,galaxy (galaxi),iii: 0.952,s: 0.927,captiv: 0.902,epic: 0.900,ii: 0.900,note: 0.892,pebbl: 0.874,nexus: 0.870,samsung: 0.870,multimedia: 0.868


+ fasttext skipgram -input phone_titles.txt -output phone_title_model.e50.mc5 -epoch 50 -minCount 5


Read 0M words
Number of words:  673
Number of labels: 0
Progress: 100.0% words/sec/thread:  192262 lr:  0.000000 avg.loss:  1.872594 ETA:   0h 0m 0s


+ fasttext nn phone_title_model.e50.mc5.bin


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,fabshell (fabshel),speck: 0.938,burton: 0.938,candyshel: 0.905,plaid: 0.846,macbeth: 0.833,bumper: 0.824,hardshel: 0.807,puma: 0.807,frost: 0.805,tekkeon: 0.804
1,hurley (hurley),oakley: 0.914,macbeth: 0.902,gaga: 0.880,harleydavidson: 0.874,ladi: 0.843,venom: 0.843,tekkeon: 0.832,antibacteri: 0.824,fabshel: 0.802,puma: 0.768
2,bluetrek (bluetrek),bluetoothcompat: 0.918,bluetoothen: 0.898,bluetooth: 0.891,blueant: 0.879,headset: 0.803,discoveri: 0.776,era: 0.769,blu: 0.769,jabra: 0.768,midnight: 0.741
3,energi (energi),energ: 0.945,go: 0.805,electron: 0.790,credit: 0.772,emerg: 0.771,to: 0.767,mycharg: 0.755,edit: 0.724,broadband2go: 0.711,bank: 0.705
4,apple (appl),iphon: 0.874,4: 0.829,4s: 0.791,3gs: 0.725,kapok: 0.695,tekkeon: 0.664,canopi: 0.658,teal: 0.648,gaga: 0.613,burton: 0.611
5,iphone (iphon),appl: 0.874,4s: 0.820,4: 0.815,3gs: 0.751,kapok: 0.702,tekkeon: 0.691,canopi: 0.669,teal: 0.652,3g3gs: 0.648,incas: 0.626
6,samsung (samsung),galaxi: 0.706,ii: 0.633,iii: 0.600,s: 0.583,indulg: 0.580,nexus: 0.573,epic: 0.560,etern: 0.548,intens: 0.541,sphm850: 0.537
7,motorola (motorola),razr: 0.734,droid: 0.683,v3: 0.678,purs: 0.619,photon: 0.609,a855: 0.598,mb200: 0.583,x2: 0.581,atrix: 0.576,x: 0.552
8,headphones (headphon),earpollut: 0.923,microphon: 0.883,earbud: 0.858,jammin: 0.850,hous: 0.838,marley: 0.824,ifrogz: 0.811,lux: 0.810,ozon: 0.807,33: 0.780
9,galaxy (galaxi),iii: 0.851,s: 0.831,note: 0.816,captiv: 0.809,ii: 0.769,epic: 0.760,nexus: 0.747,marbl: 0.742,indulg: 0.734,pebbl: 0.712


+ fasttext skipgram -input phone_titles.txt -output phone_title_model.e100.mc5 -epoch 100 -minCount 5


Read 0M words
Number of words:  673
Number of labels: 0
Progress: 100.0% words/sec/thread:  191604 lr:  0.000000 avg.loss:  1.462466 ETA:   0h 0m 0s


+ fasttext nn phone_title_model.e100.mc5.bin


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,fabshell (fabshel),burton: 0.921,speck: 0.879,candyshel: 0.829,plaid: 0.793,hurley: 0.735,venom: 0.701,tekkeon: 0.689,lean: 0.687,hardshel: 0.683,puma: 0.677
1,hurley (hurley),oakley: 0.838,harleydavidson: 0.812,gaga: 0.810,venom: 0.801,macbeth: 0.791,antibacteri: 0.786,ladi: 0.770,marley: 0.741,fabshel: 0.735,puma: 0.705
2,bluetrek (bluetrek),bluetoothcompat: 0.798,bluetoothen: 0.790,blueant: 0.789,bluetooth: 0.745,discoveri: 0.712,blu: 0.700,headset: 0.664,era: 0.660,walkman: 0.652,stereo: 0.612
3,energi (energi),energ: 0.895,go: 0.756,electron: 0.745,credit: 0.740,to: 0.708,emerg: 0.690,mycharg: 0.663,bank: 0.630,broadband2go: 0.625,medi: 0.618
4,apple (appl),iphon: 0.832,4: 0.665,4s: 0.663,3gs: 0.615,kapok: 0.572,teal: 0.566,incas: 0.545,ifrogz: 0.534,ipod: 0.531,canopi: 0.526
5,iphone (iphon),appl: 0.832,4: 0.693,4s: 0.660,3gs: 0.617,kapok: 0.588,teal: 0.564,tekkeon: 0.549,canopi: 0.545,incas: 0.532,burton: 0.531
6,samsung (samsung),galaxi: 0.593,iii: 0.447,ii: 0.443,lg: 0.435,sanyo: 0.410,indulg: 0.398,pebbl: 0.393,marbl: 0.389,epic: 0.388,graviti: 0.387
7,motorola (motorola),razr: 0.593,droid: 0.539,purs: 0.508,v3: 0.508,photon: 0.471,krzr: 0.456,devour: 0.454,a855: 0.453,mb200: 0.450,atrix: 0.437
8,headphones (headphon),earpollut: 0.853,microphon: 0.812,earbud: 0.805,jammin: 0.785,marley: 0.780,hous: 0.774,onear: 0.685,ozon: 0.660,of: 0.650,lux: 0.649
9,galaxy (galaxi),s: 0.652,iii: 0.647,note: 0.647,epic: 0.624,captiv: 0.597,marbl: 0.594,samsung: 0.593,nexus: 0.544,indulg: 0.543,googl: 0.520


#### Min Count = {10, 20}, Epochs = 25

In [40]:
# Sweep minCount with the epoch count held at 25.
for min_count in (10, 20):
    run_exp(epochs=25, minCount=min_count)

+ fasttext skipgram -input phone_titles.txt -output phone_title_model.e25.mc10 -epoch 25 -minCount 10


Read 0M words
Number of words:  354
Number of labels: 0
Progress: 100.0% words/sec/thread:  239997 lr:  0.000000 avg.loss:  2.305708 ETA:   0h 0m 0s


+ fasttext nn phone_title_model.e25.mc10.bin


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,fabshell (fabshel),soft: 0.922,atrix: 0.914,shell: 0.910,shift: 0.909,ballist: 0.897,fascin: 0.889,softtouch: 0.887,gel: 0.884,rocketfish: 0.883,6300: 0.882
1,hurley (hurley),oakley: 0.970,appl: 0.935,4: 0.934,macbeth: 0.929,reveal: 0.928,4s: 0.922,teal: 0.917,metal: 0.912,speck: 0.907,lifeproof: 0.905
2,bluetrek (bluetrek),blueant: 0.981,bluetooth: 0.966,bluetoothen: 0.956,plantron: 0.932,headset: 0.921,explor: 0.896,jabra: 0.885,jawbon: 0.884,prime: 0.883,bud: 0.881
3,energi (energi),energ: 0.991,port: 0.930,dual: 0.924,power: 0.913,devic: 0.899,portabl: 0.875,igo: 0.875,system: 0.868,charg: 0.858,extern: 0.856
4,apple (appl),oakley: 0.976,teal: 0.971,lifeproof: 0.957,incas: 0.952,slider: 0.948,4: 0.945,macbeth: 0.941,speck: 0.931,iphon: 0.928,4s: 0.921
5,iphone (iphon),appl: 0.928,lux: 0.914,4: 0.903,teal: 0.897,ifrogz: 0.891,oakley: 0.888,4s: 0.886,lifeproof: 0.883,incas: 0.883,3gs: 0.875
6,samsung (samsung),galaxi: 0.912,s: 0.879,vibrant: 0.854,ii: 0.852,captiv: 0.843,note: 0.836,epic: 0.828,multimedia: 0.816,iii: 0.814,mount: 0.812
7,motorola (motorola),v3: 0.911,station: 0.824,razr: 0.820,droid: 0.806,x: 0.791,photon: 0.781,vehicl: 0.777,bionic: 0.773,desktop: 0.769,xperia: 0.759
8,headphones (headphon),earbud: 0.981,microphon: 0.965,ozon: 0.941,ifrogz: 0.938,lux: 0.931,of: 0.914,iphon: 0.791,with: 0.788,bud: 0.787,micro: 0.774
9,galaxy (galaxi),s: 0.947,iii: 0.943,ii: 0.926,captiv: 0.915,samsung: 0.912,note: 0.876,epic: 0.868,vibrant: 0.842,nexus: 0.840,graviti: 0.813


+ fasttext skipgram -input phone_titles.txt -output phone_title_model.e25.mc20 -epoch 25 -minCount 20


Read 0M words
Number of words:  211
Number of labels: 0
Progress: 100.0% words/sec/thread:  319895 lr:  0.000000 avg.loss:  2.529816 ETA:   0h 0m 0s


+ fasttext nn phone_title_model.e25.mc20.bin


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,fabshell (fabshel),gel: 0.945,shell: 0.942,communic: 0.917,silicon: 0.906,incred: 0.900,commut: 0.900,sleev: 0.900,pink: 0.893,case: 0.893,superior: 0.891
1,hurley (hurley),car: 0.146,handsfre: 0.139,mini: 0.137,micro: 0.137,jabra: 0.137,plantron: 0.131,igo: 0.131,adapt: 0.130,headset: 0.128,speakerphon: 0.125
2,bluetrek (bluetrek),bluetooth: 0.966,bluetoothen: 0.958,plantron: 0.953,headset: 0.947,silver: 0.920,jabra: 0.915,jawbon: 0.915,blacksilv: 0.914,speakerphon: 0.912,jack: 0.909
3,energi (energi),virgin: 0.989,50: 0.974,airtim: 0.970,topup: 0.966,prepaid: 0.961,20: 0.955,cricket: 0.955,net10: 0.947,card: 0.902,nocontract: 0.902
4,apple (appl),incas: 0.964,4: 0.963,slider: 0.962,4s: 0.962,3gs: 0.956,speck: 0.948,griffin: 0.942,dlo: 0.924,iphon: 0.915,3g: 0.910
5,iphone (iphon),lux: 0.929,speck: 0.921,ifrogz: 0.916,appl: 0.915,4: 0.910,incas: 0.903,3gs: 0.901,4s: 0.894,slider: 0.891,fit: 0.868
6,samsung (samsung),mount: 0.892,galaxi: 0.891,ii: 0.887,dock: 0.847,nexus: 0.845,s: 0.845,pantech: 0.837,lg: 0.829,iii: 0.807,desktop: 0.796
7,motorola (motorola),razr: 0.872,mount: 0.837,desktop: 0.831,nextel: 0.826,sanyo: 0.810,innov: 0.808,dock: 0.807,jensen: 0.804,travel: 0.804,just: 0.799
8,headphones (headphon),earbud: 0.989,microphon: 0.953,ifrogz: 0.919,lux: 0.875,iphon: 0.849,plantron: 0.801,with: 0.800,jawbon: 0.793,speakerphon: 0.792,micro: 0.788
9,galaxy (galaxi),iii: 0.969,s: 0.961,ii: 0.939,4g: 0.917,samsung: 0.891,nexus: 0.881,sprint: 0.846,pantech: 0.839,epic: 0.830,graviti: 0.807
