# Fasttext

In [1]:
import fasttext

## Split

In [2]:
!rm -rf products.train && head -10000 /workspace/datasets/fasttext/shuffled_labeled_products.txt > products.train

In [3]:
!rm -f products.test && tail -15000 /workspace/datasets/fasttext/shuffled_labeled_products.txt > products.test

In [6]:
# Train model
model = fasttext.train_supervised(input="products.train")

Read 1M words
Number of words:  34339
Number of labels: 1951
Progress: 100.0% words/sec/thread:    3436 lr:  0.000000 avg.loss:  4.711692 ETA:   0h 0m 0s


In [None]:
model.save_model("product_classifier.bin")

In [70]:
def evaluate(_model, ds):
    for k in [1, 5, 10]:
        res = _model.test(ds, k)
        print(f"P@{k}: {res[1]}, R@{k}: {res[2]}")


In [12]:
# Evaluate on test data
model.test("products.test", 1)

(14985, 0.5807807807807808, 0.5807807807807808)

In [11]:
model.test("products.test", 1)

(14985, 0.08341675008341674, 0.8341675008341675)

## Tuning

In [6]:
model = fasttext.train_supervised(input="products.train", wordNgrams=2, epoch=25, lr=1.0)

Read 0M words
Number of words:  11163
Number of labels: 1365
Progress: 100.0% words/sec/thread:    4633 lr:  0.000000 avg.loss:  1.229115 ETA:   0h 0m 0s


In [7]:
evaluate(model, "products.test")

P@1: 0.612304080646275, R@1: 0.612304080646275
P@5: 0.1621763446799696, R@5: 0.8108817233998481
P@10: 0.08551405095629358, R@10: 0.8551405095629359


Improvement - normalize the data

In [8]:
!cat /workspace/datasets/fasttext/shuffled_labeled_products.txt |sed -e "s/\([.\!?,'/()]\)/ \1 /g" | tr "[:upper:]" "[:lower:]" | sed "s/[^[:alnum:]_]/ /g" | tr -s ' ' > /workspace/datasets/fasttext/shuffled_labeled_products_normalized.txt


In [10]:
!rm -rf products_normalized.train && head -10000 /workspace/datasets/fasttext/shuffled_labeled_products_normalized.txt > products_normalized.train
!rm -rf products_normalized.test && tail -15000 /workspace/datasets/fasttext/shuffled_labeled_products_normalized.txt > products_normalized.test

In [14]:
model = fasttext.train_supervised(input="products_normalized.train", wordNgrams=2, epoch=25, lr=1.0)

Read 0M words
Number of words:  8703
Number of labels: 1365
Progress: 100.0% words/sec/thread:    7884 lr:  0.000000 avg.loss:  1.141257 ETA:   0h 0m 0s


In [15]:
evaluate(model, "products_normalized.test")

P@1: 0.6096803148518953, R@1: 0.6096803148518953
P@5: 0.16167921010840294, R@5: 0.8083960505420148
P@10: 0.08516881861492785, R@10: 0.8516881861492784


### Some categories have very few samples. Drop 'em

In [1]:
import pandas as pd

In [11]:
ds = pd.read_fwf("/workspace/datasets/fasttext/labeled_products.txt", header=None, names=["line"])

In [12]:
ds

Unnamed: 0,line
0,"__label__abcat0107029 Recoton - 1/8"" Mini Ster..."
1,__label__abcat0202007 Panasonic - Technics Qua...
2,__label__abcat0908001 Holmes - Replacement Fil...
3,__label__abcat0107031 Monster Cable - 10' Pair...
4,"__label__pcmcat223000050008 Pioneer - 4"" 3-Way..."
...,...
115498,__label__pcmcat234200050001 Nikon 1 J1 10.1MP ...
115499,__label__pcmcat234200050001 Nikon 1 J1 10.1MP ...
115500,__label__pcmcat234200050001 Nikon 1 J1 10.1MP ...
115501,__label__pcmcat234200050001 Nikon 1 J1 Pink 10...


In [19]:
ds[["label", "text"]] = ds["line"].str.split(" ", 1, expand=True)
ds = ds.drop(columns="line")

In [20]:
ds

Unnamed: 0,label,text
0,__label__abcat0107029,"Recoton - 1/8"" Mini Stereo 3.5mm Y Adapter"
1,__label__abcat0202007,Panasonic - Technics Quartz Synthesizer Direct...
2,__label__abcat0908001,Holmes - Replacement Filter for Select Holmes ...
3,__label__abcat0107031,Monster Cable - 10' Pair of 10-Gauge Speaker Wire
4,__label__pcmcat223000050008,"Pioneer - 4"" 3-Way Surface-Mount Speakers with..."
...,...,...
115498,__label__pcmcat234200050001,"Nikon 1 J1 10.1MP White Camera Kit, Bag, 8GB M..."
115499,__label__pcmcat234200050001,"Nikon 1 J1 10.1MP Red Camera Kit, Bag, 8GB Mem..."
115500,__label__pcmcat234200050001,"Nikon 1 J1 10.1MP Red Camera Kit, Bag, 8GB Mem..."
115501,__label__pcmcat234200050001,"Nikon 1 J1 Pink 10.1MP Camera Kit, Bag, 8GB Me..."


In [52]:
MIN_SAMPLES_PER_LABEL = 500

tmp = (ds["label"].value_counts() > MIN_SAMPLES_PER_LABEL).to_frame()
thresholded_labels = tmp[tmp["label"] == True].index.values

thresholded_labels

array(['__label__abcat0101001', '__label__pcmcat180400050006',
       '__label__abcat0401004', '__label__pcmcat247400050000',
       '__label__cat09000', '__label__abcat0901005',
       '__label__abcat0515028', '__label__abcat0905001',
       '__label__pcmcat171900050029', '__label__abcat0904003',
       '__label__pcmcat151600050006', '__label__pcmcat174700050005',
       '__label__abcat0711001', '__label__abcat0712001',
       '__label__pcmcat212600050008', '__label__abcat0106001',
       '__label__pcmcat165900050033', '__label__abcat0701002',
       '__label__abcat0707002', '__label__abcat0706002',
       '__label__pcmcat144700050004', '__label__abcat0901006',
       '__label__abcat0502003', '__label__pcmcat212600050011',
       '__label__abcat0703002', '__label__abcat0301014',
       '__label__abcat0910004', '__label__abcat0704003',
       '__label__pcmcat227500050028', '__label__abcat0904001',
       '__label__abcat0504010', '__label__abcat0903001'], dtype=object)

In [73]:
ds = ds[ds["label"].isin(thresholded_labels)]

In [74]:
ds.to_csv("/workspace/datasets/fasttext/pruned_labeled_products.txt", header=None, sep=" ", index=False)

In [75]:
!head -5 "/workspace/datasets/fasttext/pruned_labeled_products.txt"

__label__pcmcat165900050033 "Metra - Radio Installation Dash Kit for Most 1989-2000 Ford, Lincoln & Mercury Vehicles - Black"
__label__pcmcat165900050033 "Metra - Radio Dash Multikit for Select GM Vehicles - Black"
__label__pcmcat165900050033 "Best Buy - Mazda Multi In-Dash Installation Kit"
__label__abcat0101001 "Dynex™ - 32"" Class / 720p / 60Hz / LCD HDTV"
__label__abcat0101001 "Insignia™ - 32"" Class / 720p / 60Hz / LCD HDTV"


In [76]:
!shuf /workspace/datasets/fasttext/pruned_labeled_products.txt > /workspace/datasets/fasttext/shuffled_pruned_labeled_products.txt

In [77]:
!rm -rf products_pruned.train && head -10000 /workspace/datasets/fasttext/shuffled_pruned_labeled_products.txt > products_pruned.train
!rm -rf products_pruned.test && tail -15000 /workspace/datasets/fasttext/shuffled_pruned_labeled_products.txt > products_pruned.test

In [78]:
import fasttext
model = fasttext.train_supervised(input="products_pruned.train", wordNgrams=2, epoch=25, lr=1.0)

Read 0M words
Number of words:  7329
Number of labels: 32
Progress: 100.0% words/sec/thread:  126959 lr:  0.000000 avg.loss:  0.037155 ETA:   0h 0m 0s


In [79]:
evaluate(model, "products_pruned.test")

P@1: 0.9682666666666667, R@1: 0.9682666666666667
P@5: 0.19962666666666667, R@5: 0.9981333333333333
P@10: 0.0999, R@10: 0.999


# Synonyms

In [83]:
!rm -rf /workspace/datasets/fasttext/titles.txt && cut -d' ' -f2- /workspace/datasets/fasttext/shuffled_labeled_products.txt > /workspace/datasets/fasttext/titles.txt

In [84]:
!head /workspace/datasets/fasttext/titles.txt

iPod classic®  - Silver, Griffin Technology Arm Band, Charger, Sony Earbuds
Bush - Sonata TV Stand for Flat-Panel TVs Up to 50"
LG - 30" Self-Cleaning Freestanding Double Oven Electric Convection Range - Stainless-Steel
Whirlpool - 17.6 Cu. Ft. Frost-Free Top-Mount Refrigerator - Stainless-Steel
Trademark Games - Florida Panthers 8-Cue Wall Rack
Toshiba - REGZA / 40" Class / 1080p / 120Hz / LCD HDTV
Belkin - Meta 028 Case for 4th-Generation Apple® iPod® touch - Black
Crate - FW120 Flexwave 120W RMS Guitar Combo Amplifier
InnoView - 22" Class - LED - 1080p - 60Hz - HDTV
Motorola - Leather Pouch for Motorola T193 Cellular Phones


In [85]:
# Skipgram model :
model = fasttext.train_unsupervised('/workspace/datasets/fasttext/titles.txt', model='skipgram')

Read 1M words
Number of words:  10873
Number of labels: 0
Progress: 100.0% words/sec/thread:   18696 lr:  0.000000 avg.loss:  1.346618 ETA:   0h 0m 0s


In [1]:
import pprint

def eval_nn(_model):
    for query in ["iphone", "iPhone", "ps2", "headphones", "plasma", "nintendo"]:
        print(f"Query: {query}")
        pprint.pprint(_model.get_nearest_neighbors(query, 10))

In [96]:
# Normalize titles
!cat /workspace/datasets/fasttext/titles.txt | sed -e "s/\([.\!?,'/()]\)/ \1 /g" | tr "[:upper:]" "[:lower:]" | sed "s/[^[:alnum:]]/ /g" | tr -s ' ' > /workspace/datasets/fasttext/normalized_titles.txt

In [102]:
model = fasttext.train_unsupervised('/workspace/datasets/fasttext/normalized_titles.txt', model='skipgram')

Read 1M words
Number of words:  8903
Number of labels: 0
Progress: 100.0% words/sec/thread:   20368 lr:  0.000000 avg.loss:  1.468102 ETA:   0h 0m 0s


In [105]:
eval_nn(model)

Query: iphone
[(0.862663209438324, '4s'),
 (0.7868690490722656, '3gs'),
 (0.7689868807792664, 'apple'),
 (0.7299811840057373, 'ozone'),
 (0.7227146029472351, 'ifrogz'),
 (0.7131742835044861, 'ipod'),
 (0.7096652984619141, 'fabshell'),
 (0.7002969980239868, 'ipadÂ'),
 (0.6982929110527039, 'phone'),
 (0.6978291869163513, 'gophone')]
Query: iPhone
[(0.9215518236160278, 'hone'),
 (0.8393056988716125, 'phone'),
 (0.8265119791030884, 'ozone'),
 (0.8260446786880493, 'gophone'),
 (0.7845501899719238, 'speakerphone'),
 (0.7734159827232361, 'iphone'),
 (0.7519534826278687, 'bone'),
 (0.7399531602859497, 'saxophone'),
 (0.7317371964454651, 'jawbone'),
 (0.7148262858390808, 'cone')]
Query: ps2
[(0.8454468846321106, 'ps3'),
 (0.8109826445579529, 'psp'),
 (0.8108870983123779, 'gba'),
 (0.8095378875732422, '2k5'),
 (0.8088458776473999, '2k3'),
 (0.8039939999580383, 'nhl'),
 (0.8030414581298828, '2k8'),
 (0.8023243546485901, 'wwe'),
 (0.7981027364730835, '2k9'),
 (0.7947553992271423, '2k6')]
Query: he

In [2]:
import fasttext
model = fasttext.train_unsupervised('/workspace/datasets/fasttext/normalized_titles.txt', model='skipgram', epoch=25)

Read 1M words
Number of words:  8903
Number of labels: 0
Progress: 100.0% words/sec/thread:   19813 lr:  0.000000 avg.loss:  1.058889 ETA:   0h 0m 0s 43.6% words/sec/thread:   19767 lr:  0.028184 avg.loss:  1.220267 ETA:   0h 0m53s


In [3]:
eval_nn(model)

Query: iphone
[(0.881649374961853, '4s'),
 (0.8275116682052612, 'apple'),
 (0.7619662880897522, 'ipod'),
 (0.6918696761131287, 'fabshell'),
 (0.6893593668937683, 'ipad'),
 (0.6855179071426392, '3gs'),
 (0.6779516935348511, 'nauticase'),
 (0.6434628367424011, '4th'),
 (0.6306478381156921, 'candyshell'),
 (0.6174927353858948, 'lifeproof')]
Query: iPhone
[(0.8479858636856079, 'hone'),
 (0.7468584775924683, 'phone'),
 (0.6568116545677185, 'ozone'),
 (0.6489620804786682, 'gophone'),
 (0.6449326276779175, 'speakerphone'),
 (0.6425195932388306, 'answering'),
 (0.6382327675819397, 'cortelco'),
 (0.6302291750907898, 'shonen'),
 (0.6291709542274475, 'caller'),
 (0.620538592338562, 'muzx')]
Query: ps2
[(0.7454320192337036, 'playstation'),
 (0.7328281402587891, 'ps3'),
 (0.7267951369285583, 'gamecube'),
 (0.7166802883148193, 'gba'),
 (0.7124346494674683, 'xbox'),
 (0.6846213936805725, 'psp'),
 (0.677823543548584, '360'),
 (0.6599013209342957, 'guide'),
 (0.6281841993331909, 'ds'),
 (0.626789212226

In [4]:
import fasttext
model = fasttext.train_unsupervised('/workspace/datasets/fasttext/normalized_titles.txt', model='skipgram', epoch=25, minCount=20)
eval_nn(model)

Read 1M words
Number of words:  3861
Number of labels: 0
Progress:  99.8% words/sec/thread:   22748 lr:  0.000095 avg.loss:  1.188973 ETA:   0h 0m 0s

Query: iphone
[(0.8669317364692688, '4s'),
 (0.7902259230613708, 'apple'),
 (0.7400349378585815, 'ipod'),
 (0.7053563594818115, '3gs'),
 (0.6725506782531738, 'ipad'),
 (0.6256548166275024, '4th'),
 (0.5602055788040161, 'generation'),
 (0.5528930425643921, '3g'),
 (0.5386014580726624, 'earbud'),
 (0.5361412763595581, 'mophie')]
Query: iPhone
[(0.74390709400177, 'telephone'),
 (0.7369771599769592, 'phone'),
 (0.6686997413635254, 'waiting'),
 (0.6655545830726624, 'speakerphone'),
 (0.660749077796936, 'answering'),
 (0.6539176106452942, 'handset'),
 (0.6366216540336609, 'gigaset'),
 (0.6337408423423767, 'gophone'),
 (0.6271347999572754, 'dect'),
 (0.621921181678772, 'headphone')]
Query: ps2
[(0.7587485909461975, 'playstation'),
 (0.7549052238464355, 'xbox'),
 (0.7451379895210266, 'gba'),
 (0.7213716506958008, 'gamecube'),
 (0.7184588313102722, 'ps3'),
 (0.7118987441062927, '360'),
 (0.7104285359382629, 'guide'),
 (0.685433566570282, 'psp'),
 (0.6524907350540161, 'ds'),
 (0.6503873467445374

Progress: 100.0% words/sec/thread:   22736 lr:  0.000000 avg.loss:  1.188628 ETA:   0h 0m 0s


In [5]:
model.save_model("/workspace/datasets/fasttext/title_model.bin")

In [3]:
import fasttext

model = fasttext.load_model("/workspace/datasets/fasttext/title_model.bin")



In [4]:
import pandas as pd
top_words = pd.read_csv("/workspace/datasets/fasttext/top_words.txt", header=None)
top_words.columns = ["word"]
top_words

Unnamed: 0,word
0,black
1,with
2,digital
3,white
4,case
...,...
995,waiting
996,razor
997,pine
998,noise


In [5]:
def get_nearest_neighbors(query, model, threshold):
    nns = model.get_nearest_neighbors(query, 2000)
    
    return [word for (distance, word) in nns if distance > threshold]

In [6]:
get_nearest_neighbors("bob", model, 0.75)

[]

In [7]:
top_words["synonyms"] = top_words["word"].map(lambda word: get_nearest_neighbors(word, model, 0.75))

In [8]:
pd.set_option('display.max_rows', 20)
top_words.head(20)

Unnamed: 0,word,synonyms
0,black,[]
1,with,[]
2,digital,[]
3,white,[]
4,case,[]
5,memory,"[4gb, 8gb]"
6,camera,[]
7,windows,[mac]
8,card,[]
9,apple,"[ipod, ipad, iphone]"


In [21]:
top_words["4_export"] = top_words \
    .apply(lambda row: ",".join([row.word] + row.synonyms), axis=1)

In [22]:
top_words.head(6)

Unnamed: 0,word,synonyms,4_export
0,black,[],black
1,with,[],with
2,digital,[],digital
3,white,[],white
4,case,[],case
5,memory,"[4gb, 8gb]","memory,4gb,8gb"


In [23]:
top_words[["4_export"]].to_csv("/workspace/datasets/fasttext/synonyms.csv", header=None, index=False, sep=" ")

In [24]:
!head -20 "/workspace/datasets/fasttext/synonyms.csv"

black
with
digital
white
case
memory,4gb,8gb
camera
windows,mac
card
apple,ipod,ipad,iphone
nintendo,ds,wii,gamecube,3ds
electric
sony
drive,hard,500gb
laptop
battery
wireless
silver
series
hard,drive


: 