In [83]:
import pexpect
import pandas as pd
import subprocess
import string
import nltk
from nltk.stem.snowball import SnowballStemmer

In [2]:
# Switch the notebook's working directory to the fastText dataset folder;
# later cells refer to files there (e.g. the `{model}.bin` passed to `fasttext nn`).
%cd /workspace/datasets/fasttext

/home/jupyter/.kaggle/datasets/fasttext


## Extract titles

The following code is used to normalize each title (and, later, each query word):

In [84]:
# Deletes every ASCII punctuation character when passed to `str.translate`.
translation_table = str.maketrans("", "", string.punctuation)

# Built once and reused: constructing a stemmer on every call is wasted work
# when the function is applied to many strings.
_stemmer = SnowballStemmer("english")


def transform_training_data(name: str) -> str:
    """Normalize a piece of text into space-separated, stemmed tokens.

    Steps, in order: replace newlines with spaces, lower-case, strip ASCII
    punctuation, tokenize with `nltk.word_tokenize`, and stem every token
    with the English Snowball stemmer.

    Args:
        name: Raw text (e.g. a product title or a single query word).

    Returns:
        The stemmed tokens joined by single spaces.
    """
    cleaned = name.replace("\n", " ").lower()
    # Only ASCII punctuation is removed; symbols such as ® or ™ pass through.
    cleaned = cleaned.translate(translation_table)
    return " ".join(_stemmer.stem(token) for token in nltk.word_tokenize(cleaned))

In [67]:
%%bash
# Run the title extraction script and report how many lines it wrote.
# -e: abort as soon as a command fails, so a failed extraction is not hidden
#     behind a successful `wc` on a stale titles.txt.
# -x: echo each command before running it.
set -ex
python /workspace/search_with_machine_learning_course/week3/extractTitles.py --sample_rate 1.0
wc -l /workspace/datasets/fasttext/titles.txt


+ python /workspace/search_with_machine_learning_course/week3/extractTitles.py --sample_rate 1.0


Writing results to /workspace/datasets/fasttext/titles.txt


+ wc -l /workspace/datasets/fasttext/titles.txt


115358 /workspace/datasets/fasttext/titles.txt


## Words used for evaluation

In [90]:
# Query words whose nearest neighbours are inspected for every trained model.
# Each word is normalized with `transform_training_data` before it is sent to fastText.
# The rows appear to group product types, brands, product lines and attributes
# (grouping inferred from the values — confirm with the author).
# NOTE(review): "Vaccum" is spelled this way in the recorded results as well;
# confirm whether the misspelling is intentional before changing it.
words = [
    "Printer", "Vaccum", "Headphones", "Binoculars", "Phones", 
    "Apple", "AMD", "Bosch", "Canon", "Skullcandy",
    "FinePix", "Macbook", "Wii", "Aspire", "Ipad",
    "16GB", "Black", "Bonus", "Wireless", "USB", 
]

## Evaluation

In [91]:
def _parse_nn_output(raw):
    """Convert the text printed by `fasttext nn` for one query into table cells.

    Args:
        raw: Everything the child process printed between two prompts.

    Returns:
        A list of "neighbour: score" strings, score formatted to 3 decimals.
    """
    # The first line is dropped, as before: it is the query word the terminal
    # echoed back, not a neighbour.
    neighbour_lines = raw.strip().split("\r\n")[1:]
    cells = []
    for line in neighbour_lines:
        parts = line.split()
        if len(parts) < 2:
            # Blank or incomplete line: nothing to report.
            continue
        cells.append(f"{parts[0]}: {float(parts[1]):.3f}")
    return cells


def run_exp(minCount = 0):
    """Train a fastText skipgram model on titles.txt and show nearest neighbours.

    The model is written to `title_model.mc{minCount}` inside the fastText
    dataset directory. Each entry of the notebook-level `words` list is
    normalized with `transform_training_data`, sent to an interactive
    `fasttext nn` session, and the returned neighbours are displayed as a
    DataFrame with one row per word.

    Args:
        minCount: Value passed to fastText's `-minCount` option.

    Raises:
        subprocess.CalledProcessError: If training exits with a non-zero status.
    """
    workdir = "/workspace/datasets/fasttext/"
    prompt = 'Query word?'
    model = f"title_model.mc{minCount}"

    # Train model
    cmd = f"fasttext skipgram -input titles.txt -output {model} -minCount {minCount}"
    print(f"+ {cmd}")
    # No shell is needed for a plain argument list; check=True stops the
    # experiment instead of querying a missing or stale model.
    subprocess.run(cmd.split(), cwd=workdir, check=True)

    # Eval on words
    rows = []
    cmd = f"fasttext nn {model}.bin"
    print(f"+ {cmd}")
    # Run in the same directory as training so the relative model path resolves
    # regardless of the notebook's current working directory.
    child = pexpect.spawn(cmd, encoding='utf-8', cwd=workdir)
    try:
        # expect_exact: the prompt is a literal string; as a regex, "d?" would
        # make the "d" optional and never consume the question mark.
        child.expect_exact(prompt)
        for input_word in words:
            word = transform_training_data(input_word)
            child.sendline(word)
            child.expect_exact(prompt)
            rows.append([f"{input_word} ({word})"] + _parse_nn_output(child.before))
    finally:
        # Always end the fastText session, even if a query fails.
        child.close()
    df = pd.DataFrame(rows)
    display(df)

### minCount = 0

In [92]:
# Baseline: train with -minCount 0 and list neighbours for the evaluation words.
run_exp(minCount=0)

+ fasttext skipgram -input titles.txt -output title_model.mc0 -minCount 0


Read 1M words
Number of words:  28622
Number of labels: 0
Progress: 100.0% words/sec/thread:   49836 lr:  0.000000 avg.loss:  1.527039 ETA:   0h 0m 0s words/sec/thread:   49837 lr: -0.000004 avg.loss:  1.527039 ETA:   0h 0m 0s


+ fasttext nn title_model.mc0.bin


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Printer (printer),sprinter: 0.903,printerphotocaptur: 0.881,printercopierscann: 0.870,printercopierscannerfax: 0.866,printz: 0.866,printercopierscannerfaxphotocaptur: 0.849,mfc7860dw: 0.845,mfc7440n: 0.840,mfc9010cn: 0.840,mfc8480dn: 0.839
1,Vaccum (vaccum),vac™: 0.925,liftoff®: 0.901,vacuumblowerinfl: 0.899,vac: 0.894,liftoff: 0.890,cv2: 0.887,selfpropel: 0.874,febrez: 0.870,foodsav: 0.869,windtunnel™: 0.868
2,Headphones (headphon),headphoni: 0.969,headphonesear: 0.942,earbud: 0.902,overtheear: 0.888,earbudmicrophon: 0.884,multiheadphon: 0.874,behindtheear: 0.859,yurbud: 0.857,undertheear: 0.846,earphon: 0.842
3,Binoculars (binocular),monocular: 0.911,ab10592: 0.877,ab10572: 0.865,ab10526: 0.859,ab10762: 0.858,ab10594: 0.857,ab10768: 0.852,ab10286: 0.850,ab10962: 0.849,ab10176: 0.847
4,Phones (phone),vphone: 0.957,phonem: 0.947,phonemp3: 0.896,phoneunlock: 0.892,phono: 0.857,phonefaxmodemansw: 0.852,cellphon: 0.845,phonic: 0.829,kyocera: 0.824,4g: 0.821
5,Apple (appl),apple­: 0.908,appleâ®: 0.848,applecreek: 0.846,apple®: 0.843,applesc: 0.843,applework: 0.811,app: 0.809,applica: 0.801,applic: 0.798,appleiphon: 0.797
6,AMD (amd),athlon™: 0.929,phenom™: 0.917,athlonâ¢: 0.915,turion™: 0.913,phenomâ¢: 0.907,phenom®: 0.905,rm72: 0.903,tk53: 0.902,sempron™: 0.901,wamd: 0.901
7,Bosch (bosch),ascenta: 0.842,axxi: 0.837,steamdishwash: 0.813,talltub: 0.797,tub: 0.796,2378: 0.796,dishwash: 0.790,boseâ®: 0.789,50lb: 0.782,builtindishwash: 0.781
8,Canon (canon),icanon: 0.945,cannon: 0.845,fujinon: 0.829,50d: 0.826,eo: 0.810,t4i: 0.810,60d: 0.809,t1i: 0.803,t2i550d: 0.801,eos40d: 0.801
9,Skullcandy (skullcandi),skullz: 0.947,skulli: 0.924,skull: 0.923,skullcrush: 0.909,sku: 0.860,earbud: 0.833,inkd: 0.823,yurbud: 0.804,headphoni: 0.798,gumi: 0.790


### minCount = 10

In [93]:
# Retrain with -minCount 10 and list neighbours for the same words.
run_exp(minCount=10)

+ fasttext skipgram -input titles.txt -output title_model.mc10 -minCount 10


Read 1M words
Number of words:  6048
Number of labels: 0
Progress: 100.0% words/sec/thread:   55999 lr:  0.000000 avg.loss:  1.415825 ETA:   0h 0m 0s


+ fasttext nn title_model.mc10.bin


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Printer (printer),printercopierscann: 0.860,printercopierscannerfax: 0.860,officejet: 0.818,copier: 0.797,inkjet: 0.791,deskjet: 0.786,pixma: 0.782,hl2170w: 0.781,print: 0.770,laserjet: 0.765
1,Vaccum (vaccum),vacmast: 0.846,vac: 0.842,vacuum: 0.840,foodsav: 0.830,vax: 0.823,windtunnel: 0.816,bagless: 0.806,filtret: 0.801,canist: 0.799,ironsatin: 0.794
2,Headphones (headphon),earbud: 0.881,overtheear: 0.871,yurbud: 0.836,overthehead: 0.791,noiseisol: 0.780,earphon: 0.766,behindtheneck: 0.761,noisecancel: 0.754,behindthehead: 0.742,hesh: 0.733
3,Binoculars (binocular),scope: 0.787,barska: 0.786,bushnel: 0.763,pentax: 0.706,2060: 0.705,circular: 0.691,popular: 0.660,celestron: 0.655,altazimuth: 0.635,kx: 0.618
4,Phones (phone),phono: 0.804,gophon: 0.780,huawei: 0.775,kyocera: 0.760,att: 0.755,smartphon: 0.750,xperia: 0.750,captiv: 0.743,tmobil: 0.742,palm: 0.735
5,Apple (appl),appleâ®: 0.814,apple®: 0.805,app: 0.714,moshi: 0.646,aviiq: 0.630,3gs: 0.629,kb: 0.623,aiwa: 0.619,bookendz: 0.619,ipod®hd: 0.618
6,AMD (amd),athlon™: 0.904,phenom™: 0.898,sempron™: 0.881,tl60: 0.880,turion™: 0.877,quadcor: 0.858,3gb: 0.851,dualcor: 0.835,x6: 0.833,pentium®: 0.829
7,Bosch (bosch),ascenta: 0.846,integra: 0.754,67: 0.731,accubak: 0.730,maytag: 0.726,2378: 0.725,tassimo: 0.722,50lb: 0.721,blackonstainless: 0.720,woodlik: 0.720
8,Canon (canon),eo: 0.791,60d: 0.771,50d: 0.764,7d: 0.764,t2i: 0.757,pixma: 0.756,t1i: 0.753,xs: 0.753,hfr10: 0.752,5d: 0.746
9,Skullcandy (skullcandi),skull: 0.907,inkd: 0.792,hesh: 0.789,earbud: 0.775,lowrid: 0.755,smokin: 0.746,headphon: 0.731,gumi: 0.730,isol: 0.727,aviat: 0.718


### minCount = 20

In [94]:
# Retrain with -minCount 20 and list neighbours for the same words.
run_exp(minCount=20)

+ fasttext skipgram -input titles.txt -output title_model.mc20 -minCount 20


Read 1M words
Number of words:  3919
Number of labels: 0
Progress: 100.0% words/sec/thread:   58111 lr:  0.000000 avg.loss:  1.453215 ETA:   0h 0m 0s


+ fasttext nn title_model.mc20.bin


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Printer (printer),printercopierscann: 0.848,deskjet: 0.784,officejet: 0.777,eallinon: 0.774,copier: 0.767,pixma: 0.760,scanjet: 0.756,inkjet: 0.748,laserjet: 0.743,s51204: 0.737
1,Vaccum (vaccum),vacuum: 0.815,vac: 0.792,foodsav: 0.791,filtret: 0.782,canist: 0.778,bagless: 0.778,eureka: 0.752,windtunnel: 0.752,hoover: 0.722,irobot: 0.720
2,Headphones (headphon),earbud: 0.868,overtheear: 0.859,earphon: 0.797,noiseisol: 0.757,noisecancel: 0.750,bud: 0.740,behindtheneck: 0.726,hesh: 0.719,skullcandi: 0.711,2xl: 0.711
3,Binoculars (binocular),barska: 0.809,scope: 0.791,bushnel: 0.739,celestron: 0.689,mead: 0.646,pentax: 0.636,circular: 0.631,deterg: 0.580,146mp: 0.578,waterproof: 0.577
4,Phones (phone),gophon: 0.793,kyocera: 0.779,att: 0.763,razr: 0.761,8530: 0.753,8520: 0.752,4g: 0.748,motorola: 0.748,verizon: 0.745,smartphon: 0.744
5,Apple (appl),appleâ®: 0.774,apple®: 0.742,3gs: 0.643,hurley: 0.631,moshi: 0.620,mophi: 0.603,3g: 0.592,3g3gs: 0.585,iphon: 0.571,ipod: 0.571
6,AMD (amd),athlon™: 0.872,phenom™: 0.866,turion™: 0.853,sempron™: 0.845,tl60: 0.833,quadcor: 0.819,3gb: 0.809,x4: 0.808,x2: 0.793,dualcor: 0.771
7,Bosch (bosch),ascenta: 0.812,67: 0.753,integra: 0.751,2378: 0.704,tassimo: 0.699,blackonstainless: 0.686,dishwash: 0.686,architect: 0.680,whirlpool: 0.677,washerdry: 0.671
8,Canon (canon),eo: 0.797,40d: 0.786,50d: 0.785,60d: 0.775,7d: 0.773,pixma: 0.771,xsi: 0.765,ef: 0.762,t2i: 0.761,xs: 0.759
9,Skullcandy (skullcandi),inkd: 0.807,hesh: 0.800,earbud: 0.793,lowrid: 0.775,smokin: 0.772,touch®: 0.746,bud: 0.744,nano®: 0.712,headphon: 0.711,ipod: 0.711


### minCount = 50

In [95]:
# Retrain with -minCount 50 and list neighbours for the same words.
run_exp(minCount=50)

+ fasttext skipgram -input titles.txt -output title_model.mc50 -minCount 50


Read 1M words
Number of words:  2167
Number of labels: 0
Progress: 100.0% words/sec/thread:   68669 lr:  0.000000 avg.loss:  1.446406 ETA:   0h 0m 0s


+ fasttext nn title_model.mc50.bin


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Printer (printer),officejet: 0.799,inkjet: 0.797,pixma: 0.786,copier: 0.767,networkreadi: 0.740,laserjet: 0.735,print: 0.721,fax: 0.715,photosmart: 0.712,lexmark: 0.712
1,Vaccum (vaccum),vacuum: 0.866,vac: 0.857,bagless: 0.832,canist: 0.813,eureka: 0.784,wetdri: 0.768,hoover: 0.767,filtret: 0.757,bissel: 0.714,hepa: 0.700
2,Headphones (headphon),earbud: 0.859,overtheear: 0.834,noisecancel: 0.754,bud: 0.734,akg: 0.709,ear: 0.700,inear: 0.699,skullcandi: 0.694,clipon: 0.673,microphon: 0.664
3,Binoculars (binocular),scope: 0.799,bushnel: 0.781,barska: 0.780,celestron: 0.730,pentax: 0.683,spot: 0.610,telescop: 0.599,x: 0.583,zoom: 0.560,waterproof: 0.560
4,Phones (phone),razr: 0.750,kyocera: 0.733,pantech: 0.732,4g: 0.730,att: 0.716,ericsson: 0.712,nokia: 0.708,tmobil: 0.705,cellstar: 0.705,smartphon: 0.684
5,Apple (appl),apple®: 0.774,3gs: 0.600,ipod®: 0.545,iphon: 0.542,moshi: 0.530,iphone®: 0.529,3g: 0.527,macbook®: 0.510,squier®: 0.506,ipod: 0.502
6,AMD (amd),athlon™: 0.911,turion™: 0.865,phenom™: 0.848,3gb: 0.825,x4: 0.793,x2: 0.792,pentium®: 0.785,dualcor: 0.762,msi: 0.758,64: 0.730
7,Bosch (bosch),67: 0.710,maytag: 0.676,whirlpool: 0.671,architect: 0.658,electrolux: 0.650,evolut: 0.647,washerdry: 0.645,dishwash: 0.641,cleansteel: 0.640,profil: 0.629
8,Canon (canon),eo: 0.803,xsi: 0.777,t2i: 0.775,pixma: 0.774,7d: 0.769,t3i: 0.763,rebel: 0.761,t1i: 0.761,55250mm: 0.754,60d: 0.753
9,Skullcandy (skullcandi),earbud: 0.748,gumi: 0.715,bud: 0.711,nano®: 0.704,ipod: 0.704,headphon: 0.694,touch®: 0.686,ifrogz: 0.653,armband: 0.643,4thgener: 0.605
