In [96]:
import pexpect
import pandas as pd
import subprocess
import string
import nltk
from nltk.stem.snowball import SnowballStemmer

In [97]:
# Switch the kernel's working directory to the fastText data folder; later
# cells refer to model files by relative name.
%cd /workspace/datasets/fasttext

/home/jupyter/.kaggle/datasets/fasttext


## Extract titles

The following code is used for the transformation:

In [98]:
# Deletion-only translation table: removes the trademark/copyright symbols
# and every ASCII punctuation character.
translation_table = str.maketrans("", "", "®©™" + string.punctuation)


def transform_training_data(name):
    """Normalise a piece of text into space-separated English stems.

    Steps, in order: newlines become spaces, the text is lower-cased,
    characters listed in ``translation_table`` are deleted, the result is
    tokenised with ``nltk.word_tokenize`` and each token is stemmed with the
    English Snowball stemmer.

    NOTE(review): presumably this must stay in sync with the normalisation
    applied when the training titles were extracted -- confirm before
    changing any step.

    Args:
        name: Raw text (e.g. a product title or a query word).

    Returns:
        The stemmed tokens joined by single spaces.
    """
    cleaned = name.replace("\n", " ").lower().translate(translation_table)
    english_stemmer = SnowballStemmer("english")
    stems = map(english_stemmer.stem, nltk.word_tokenize(cleaned))
    return " ".join(stems)

In [99]:
%%bash
# Trace every command so the cell output records exactly what was run.
set -x
# Run the title extraction script (it reports writing titles.txt).
# NOTE(review): --sample_rate 1.0 presumably keeps every title -- confirm in extractTitles.py.
python /workspace/search_with_machine_learning_course/week3/extractTitles.py --sample_rate 1.0
# Report how many lines the extracted titles file contains.
wc -l /workspace/datasets/fasttext/titles.txt


+ python /workspace/search_with_machine_learning_course/week3/extractTitles.py --sample_rate 1.0


Writing results to /workspace/datasets/fasttext/titles.txt


+ wc -l /workspace/datasets/fasttext/titles.txt


115358 /workspace/datasets/fasttext/titles.txt


## Words used for evaluation

In [100]:
# Query words whose nearest neighbours are inspected for each trained model.
# The rows are roughly grouped as: product types, brands, product lines and
# attributes. Spellings are kept exactly as originally listed.
words = (
    ["Printer", "Vaccum", "Headphones", "Binoculars", "Phones"]
    + ["Apple", "AMD", "Bosch", "Canon", "Skullcandy"]
    + ["FinePix", "Macbook", "Wii", "Aspire", "Ipad"]
    + ["16GB", "Black", "Bonus", "Wireless", "USB"]
)

## Evaluation

In [101]:
def _parse_nn_output(output):
    """Convert the text captured before a ``fasttext nn`` prompt into table cells.

    Args:
        output: Everything the child printed between two prompts; the first
            line is the terminal echo of the query that was sent.

    Returns:
        A list of ``"<neighbour>: <score>"`` strings, score shown with three
        decimals, in the order fastText printed them.
    """
    # Drop the echoed query line; what remains are "<word> <score>" lines.
    result_lines = output.strip().split("\r\n")[1:]
    cells = []
    for line in result_lines:
        parts = line.split()
        if len(parts) < 2:
            # Blank or malformed line -- nothing to report.
            continue
        cells.append(f"{parts[0]}: {float(parts[1]):.3f}")
    return cells


def run_exp(minCount = 0):
    """Train a fastText skipgram model on ``titles.txt`` and show nearest neighbours.

    The model is written to ``title_model.mc<minCount>`` inside the fastText
    data directory. Each entry of the global ``words`` list is normalised with
    ``transform_training_data`` and sent to an interactive ``fasttext nn``
    session; the neighbours are collected into a DataFrame (one row per query
    word) which is displayed.

    Args:
        minCount: Value passed to fastText's ``-minCount`` option; also used
            in the model file name.

    Raises:
        subprocess.CalledProcessError: If the training command exits non-zero.
        pexpect.TIMEOUT / pexpect.EOF: If the ``fasttext nn`` session does not
            produce the expected prompt.
    """
    work_dir = "/workspace/datasets/fasttext/"
    prompt = "Query word?"
    model = f"title_model.mc{minCount}"

    # Train model. check=True stops here if training fails instead of
    # querying a missing or stale model afterwards.
    cmd = f"fasttext skipgram -input titles.txt -output {model} -minCount {minCount}"
    print(f"+ {cmd}")
    subprocess.run(cmd.split(), cwd=work_dir, check=True)

    # Eval on words. Run in the same directory the model was written to so
    # this does not depend on the notebook's current working directory.
    rows = []
    cmd = f"fasttext nn {model}.bin"
    print(f"+ {cmd}")
    child = pexpect.spawn(cmd, encoding='utf-8', cwd=work_dir)
    try:
        # expect_exact: the prompt contains '?', which as a regex would make
        # the preceding 'd' optional and leave the literal '?' unmatched.
        child.expect_exact(prompt)
        for input_word in words:
            word = transform_training_data(input_word)
            label = f"{input_word} ({word})"
            if not word:
                # An empty query would leave fastText waiting for input and
                # this loop waiting for a prompt until the timeout.
                rows.append([label])
                continue
            # NOTE(review): a query that normalises to several tokens would
            # likely be answered once per token and desynchronise the
            # prompt matching -- current words are all single tokens.
            child.sendline(word)
            child.expect_exact(prompt)
            rows.append([label] + _parse_nn_output(child.before))
    finally:
        # Always terminate the interactive fastText process.
        child.close()

    df = pd.DataFrame(rows)
    display(df)

### minCount = 0

In [102]:
# Baseline run: the model is trained with fastText's -minCount set to 0.
run_exp(minCount=0)

+ fasttext skipgram -input titles.txt -output title_model.mc0 -minCount 0


Read 1M words
Number of words:  28416
Number of labels: 0
Progress: 100.0% words/sec/thread:   49374 lr:  0.000000 avg.loss:  1.555754 ETA:   0h 0m 0s


+ fasttext nn title_model.mc0.bin


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Printer (printer),sprinter: 0.897,printz: 0.887,printerphotocaptur: 0.886,printercopierscann: 0.873,printercopierscannerfax: 0.867,printcalc: 0.860,printercopierscannerfaxphotocaptur: 0.850,mfc7860dw: 0.838,mfc9010cn: 0.837,mfc8480dn: 0.836
1,Vaccum (vaccum),vacuumblowerinfl: 0.910,cv2: 0.890,vac: 0.888,fjm: 0.883,cv: 0.873,liftoff: 0.872,vacuum: 0.872,rotosweep: 0.870,vax: 0.870,5l: 0.866
2,Headphones (headphon),headphoni: 0.967,headphonesear: 0.941,earbud: 0.909,overtheear: 0.890,multiheadphon: 0.879,yurbud: 0.878,behindtheear: 0.860,undertheear: 0.843,earbudmicrophon: 0.843,overthehead: 0.843
3,Binoculars (binocular),monocular: 0.909,ab10462: 0.864,ab10762: 0.863,ab10526: 0.863,ab10592: 0.862,ab10572: 0.861,ab10962: 0.860,ab10768: 0.854,ab10286: 0.853,ab10594: 0.845
4,Phones (phone),vphone: 0.958,phonem: 0.952,phonemp3: 0.891,phoneunlock: 0.883,phono: 0.860,phonefaxmodemansw: 0.847,i9: 0.845,cellphon: 0.839,phonic: 0.823,kyocera: 0.820
5,Apple (appl),apple­: 0.928,appleâ: 0.921,ipadâ: 0.893,ipad­: 0.891,ipad°: 0.888,ipadâ¢: 0.886,ipgx: 0.864,ip4: 0.863,ipodâ: 0.857,appleiphon: 0.854
6,AMD (amd),athlonâ¢: 0.924,athlon: 0.913,rm72: 0.911,phenomâ¢: 0.909,zm82: 0.906,tk53: 0.898,phenom: 0.897,quadcor: 0.896,wamd: 0.895,iiphenom: 0.893
7,Bosch (bosch),ascenta: 0.843,axxi: 0.842,steamdishwash: 0.819,boseâ: 0.807,talltub: 0.803,dishwash: 0.800,2378: 0.786,glmb209dq: 0.783,aquastop: 0.783,builtindishwash: 0.779
8,Canon (canon),icanon: 0.945,cannon: 0.872,eo: 0.819,hanon: 0.818,50d: 0.807,30d: 0.807,60d: 0.806,eos40d: 0.803,fujinon: 0.800,cann: 0.798
9,Skullcandy (skullcandi),skullz: 0.954,skull: 0.933,skulli: 0.924,skullcrush: 0.924,earbud: 0.873,sku: 0.862,yurbud: 0.857,budbud: 0.833,inkd: 0.832,bud: 0.813


### minCount = 10

In [103]:
# Same experiment with fastText's -minCount set to 10.
run_exp(minCount=10)

+ fasttext skipgram -input titles.txt -output title_model.mc10 -minCount 10


Read 1M words
Number of words:  6021
Number of labels: 0
Progress: 100.0% words/sec/thread:   55256 lr:  0.000000 avg.loss:  1.421799 ETA:   0h 0m 0s


+ fasttext nn title_model.mc10.bin


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Printer (printer),printercopierscann: 0.850,printercopierscannerfax: 0.848,inkjet: 0.800,copier: 0.798,officejet: 0.790,deskjet: 0.787,dx4860ub32p: 0.779,hl2170w: 0.776,fax: 0.775,pixma: 0.769
1,Vaccum (vaccum),vacuum: 0.856,roomba: 0.845,windtunnel: 0.841,liftoff: 0.838,vacmast: 0.834,canist: 0.831,vac: 0.828,vax: 0.825,bagless: 0.815,eureka: 0.811
2,Headphones (headphon),earbud: 0.872,overtheear: 0.869,overthehead: 0.804,yurbud: 0.797,earphon: 0.785,noiseisol: 0.778,behindtheneck: 0.765,skullcandi: 0.764,behindthehead: 0.752,bud: 0.735
3,Binoculars (binocular),scope: 0.776,barska: 0.768,bushnel: 0.739,2060: 0.704,celestron: 0.687,circular: 0.678,pentax: 0.660,altazimuth: 0.609,146mp: 0.606,popular: 0.604
4,Phones (phone),huawei: 0.801,kyocera: 0.778,gophon: 0.777,phono: 0.776,tmobil: 0.770,razr: 0.748,4g: 0.743,mobil: 0.742,pantech: 0.741,att: 0.740
5,Apple (appl),appleâ: 0.849,ipad: 0.784,ipod: 0.769,iphon: 0.763,ipodhd: 0.755,3rdgener: 0.732,4thgener: 0.721,6thgener: 0.720,3g3gs: 0.714,5thgener: 0.714
6,AMD (amd),athlon: 0.892,phenom: 0.883,tl60: 0.860,quadcor: 0.851,3gb: 0.845,am3: 0.839,x6: 0.829,turion: 0.826,x4: 0.823,x2: 0.802
7,Bosch (bosch),ascenta: 0.816,integra: 0.783,2378: 0.736,woodlik: 0.715,67: 0.713,tassimo: 0.711,dishwash: 0.709,50lb: 0.694,accubak: 0.692,talltub: 0.686
8,Canon (canon),eo: 0.814,60d: 0.786,t2i: 0.773,canoscan: 0.773,50d: 0.770,pixma: 0.767,7d: 0.767,sx230hs: 0.756,151mp: 0.752,xsi: 0.750
9,Skullcandy (skullcandi),skull: 0.892,earbud: 0.835,inkd: 0.831,hesh: 0.784,yurbud: 0.780,gumi: 0.776,bud: 0.774,smokin: 0.767,headphon: 0.764,2xl: 0.759


### minCount = 20

In [104]:
# Same experiment with fastText's -minCount set to 20.
run_exp(minCount=20)

+ fasttext skipgram -input titles.txt -output title_model.mc20 -minCount 20


Read 1M words
Number of words:  3901
Number of labels: 0
Progress: 100.0% words/sec/thread:   59115 lr:  0.000000 avg.loss:  1.426001 ETA:   0h 0m 0s


+ fasttext nn title_model.mc20.bin


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Printer (printer),printercopierscann: 0.840,officejet: 0.815,deskjet: 0.800,inkjet: 0.797,s51204: 0.778,eallinon: 0.778,copier: 0.777,pixma: 0.767,laserjet: 0.767,scanjet: 0.766
1,Vaccum (vaccum),vacuum: 0.805,windtunnel: 0.790,vac: 0.788,foodsav: 0.781,canist: 0.779,bagless: 0.778,filtret: 0.777,eureka: 0.762,roomba: 0.758,irobot: 0.725
2,Headphones (headphon),earbud: 0.894,overtheear: 0.855,noiseisol: 0.800,earphon: 0.797,bud: 0.776,skullcandi: 0.769,behindtheneck: 0.765,noisecancel: 0.756,hesh: 0.755,inkd: 0.725
3,Binoculars (binocular),barska: 0.790,scope: 0.787,bushnel: 0.753,circular: 0.704,celestron: 0.695,pentax: 0.650,mead: 0.642,refractor: 0.607,spot: 0.603,waterproof: 0.600
4,Phones (phone),kyocera: 0.788,4g: 0.751,gophon: 0.750,pantech: 0.744,8520: 0.743,att: 0.737,razr: 0.732,8530: 0.732,ericsson: 0.731,nocontract: 0.730
5,Apple (appl),appleâ: 0.842,iphon: 0.752,ipod: 0.728,ipad: 0.718,ipodiphon: 0.678,radioappl: 0.657,3g3gs: 0.654,4thgener: 0.637,4s: 0.634,3gs: 0.632
6,AMD (amd),athlon: 0.899,tl60: 0.853,phenom: 0.842,turion: 0.841,quadcor: 0.829,3gb: 0.829,sempron: 0.825,x2: 0.812,x4: 0.804,biscotti: 0.791
7,Bosch (bosch),ascenta: 0.820,integra: 0.737,67: 0.709,tassimo: 0.707,2378: 0.705,dishwash: 0.695,accubak: 0.693,maytag: 0.675,15cycl: 0.674,blackonstainless: 0.673
8,Canon (canon),eo: 0.789,t2i: 0.778,xsi: 0.773,60d: 0.766,t3: 0.764,7d: 0.763,pixma: 0.762,t1i: 0.758,t3i: 0.752,sx150: 0.748
9,Skullcandy (skullcandi),inkd: 0.819,hesh: 0.806,earbud: 0.804,smokin: 0.793,lowrid: 0.772,headphon: 0.769,bud: 0.766,2xl: 0.758,noiseisol: 0.744,gumi: 0.715


### minCount = 50

In [105]:
# Same experiment with fastText's -minCount set to 50.
run_exp(minCount=50)

+ fasttext skipgram -input titles.txt -output title_model.mc50 -minCount 50


Read 1M words
Number of words:  2152
Number of labels: 0
Progress: 100.0% words/sec/thread:   69814 lr:  0.000000 avg.loss:  1.483654 ETA:   0h 0m 0s


+ fasttext nn title_model.mc50.bin


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Printer (printer),officejet: 0.813,pixma: 0.794,inkjet: 0.784,laserjet: 0.776,copier: 0.768,networkreadi: 0.745,fax: 0.719,toner: 0.715,lexmark: 0.712,blackandwhit: 0.711
1,Vaccum (vaccum),vacuum: 0.864,vac: 0.852,bagless: 0.828,canist: 0.792,eureka: 0.762,filtret: 0.745,wetdri: 0.742,hoover: 0.741,hepa: 0.729,bissel: 0.715
2,Headphones (headphon),earbud: 0.852,overtheear: 0.817,noisecancel: 0.749,bud: 0.722,skullcandi: 0.700,microphon: 0.684,inear: 0.672,ear: 0.671,sennheis: 0.669,clipon: 0.654
3,Binoculars (binocular),scope: 0.783,bushnel: 0.770,barska: 0.768,celestron: 0.742,pentax: 0.677,zoom: 0.587,waterproof: 0.578,spot: 0.571,blackhawk: 0.557,wideangl: 0.537
4,Phones (phone),att: 0.748,kyocera: 0.740,motorola: 0.733,handset: 0.730,razr: 0.728,verizon: 0.727,nokia: 0.726,cellstar: 0.711,ericsson: 0.700,smartphon: 0.696
5,Apple (appl),iphon: 0.752,ipad: 0.728,ipod: 0.699,3gs: 0.636,4s: 0.627,4thgener: 0.625,3rdgener: 0.613,tribeca: 0.595,portfolio: 0.589,incas: 0.582
6,AMD (amd),athlon: 0.894,turion: 0.850,phenom: 0.830,3gb: 0.826,x2: 0.802,x4: 0.795,pentium: 0.787,dualcor: 0.740,i3: 0.709,msi: 0.708
7,Bosch (bosch),67: 0.679,maytag: 0.660,washerdry: 0.660,ge: 0.656,architect: 0.650,evolut: 0.646,cleansteel: 0.645,bisqueonbisqu: 0.636,dishwash: 0.635,whirlpool: 0.629
8,Canon (canon),eo: 0.751,t1i: 0.749,t2i: 0.745,xsi: 0.742,pixma: 0.739,7d: 0.732,t3i: 0.725,t3: 0.720,rebel: 0.710,elph: 0.709
9,Skullcandy (skullcandi),bud: 0.761,earbud: 0.758,ifrogz: 0.718,gumi: 0.715,headphon: 0.700,armband: 0.657,ipod: 0.621,altec: 0.620,stereo: 0.619,ear: 0.619
