In [1]:
import html
import json
import os
import random
from itertools import islice

import numpy as np
import pandas as pd
import spacy
from spacy import displacy
from spacy.matcher import Matcher

In [2]:
# Download the large English spaCy model that spacy.load("en_core_web_lg") needs later on.
# NOTE(review): `!python` runs whichever interpreter is first on PATH, which may not be
# this kernel's environment - confirm the model lands where the kernel can import it.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0-py3-none-any.whl (777.4 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


2021-12-07 14:59:44.076877: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2021-12-07 14:59:44.076903: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2021-12-07 14:59:44.150562: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2021-12-07 14:59:44.150581: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2021-12-07 14:59:44.224248: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2021-12-07 14:59:44.224264: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on you

In [3]:
# Switch the working directory to the shared output folder so that the relative file
# names used in this notebook (the CSV below, and "demo.bin" further down) resolve there.
# NOTE(review): absolute drive path - only runs where O: is mapped; consider a
# configurable data directory instead.
os.chdir('O:/Amazon/2021/AMZ210470 Major Appliances/1_Behavioral Data Analysis/output')
data_file = '02_Relevant Sessions_Detail.csv'
# The file is read with an explicit ISO-8859-1 encoding.
df = pd.read_csv(data_file, encoding="ISO-8859-1")

In [4]:
# Load the large English pipeline once; `nlp` parses every keyword below and its
# vocab is handed to the Matcher.
nlp = spacy.load("en_core_web_lg")

In [5]:
# Lower-cased search keywords as a plain Python list, with missing values dropped.
titles = df['keyword_search'].dropna().str.lower().tolist()

# Rule-based keyword matching for labeling

In [6]:
# Token-level spaCy Matcher patterns, one per appliance category.
# Each pattern is a list of token specs matched against consecutive tokens; a spec
# without an "OP" must match exactly one token.

# A range/stove noun, then "built" or "top", then any token other than "stuffing".
range_pattern1 = [{"LEMMA": {"IN": ["range", "stove", "burner", "cooktop", "stovetop"]}, "POS": "NOUN"},
                  {"LOWER": {"IN": ["built", "top"]}},
                  {"LOWER": {"NOT_IN": ["stuffing"]}}]

# Fix: the correctly spelt lemma "refrigerator" was missing, so only the misspelling
# could ever match. The misspelling is kept to still catch typo'd searches.
refridgerator_pattern = [{'LEMMA': {'IN': ['refrigerator', 'refridgerator', 'fridge', 'freezer']},
                          "DEP": {"NOT_IN": ["attr"]}}]

# Fix: the first spec used to repeat the "LEMMA" key. Python keeps only the last value
# of a duplicated dict key, so the IN ["kitchen"] constraint was silently discarded.
# The spec below is what actually ran; the dead key is removed.
# NOTE(review): if the intent was to require "kitchen", use {"LEMMA": "kitchen"} here.
appliances_pattern = [{"LEMMA": {"NOT_IN": ["small"]}},
                      {'LEMMA': 'appliance'},
                      {"POS": "NOUN", "OP": "?"}]

# Fix: the last spec used to repeat the "OP" key ("!" then "*"); only "*" took effect,
# so the overridden "!" is removed.
microwave_pattern = [{"IS_PUNCT": False, "OP": "*"},
                     {'LEMMA': 'microwave', "DEP": {"NOT_IN": ["appos", "pobj"]}},
                     {"POS": "PROPN", 'LOWER': 'oven', "OP": "*", "IS_PUNCT": False}]

# "oven" preceded by a token that is neither a verb nor "brick"/"pizza".
oven_pattern = [{"LEMMA": {"NOT_IN": ["brick", "pizza"]}, "POS": {"NOT_IN": ["VERB"]}},
                {"LEMMA": "oven"}]

# Washer/dishwasher, not preceded by "window"/"power" and not followed by a
# consumable or service word.
washer_pattern1 = [{"LEMMA": {"NOT_IN": ["window", "power"]}},
                   {'LEMMA': {"IN": ["washer", "dishwasher", "washing"]}},
                   {"LOWER": {"NOT_IN": ["pods", "detergent", "repair", "pacs", "method"]}}]

# "dryer" followed by a token that is not "sheet(s)".
# Fix: the exclusion is on LEMMA, so the singular "sheet" is listed as well; the plural
# surface form alone would not exclude "dryer sheets" once it is lemmatised.
dryer_pattern = [{'LEMMA': 'dryer'},
                 {"LEMMA": {"NOT_IN": ["sheet", "sheets"]}}]

In [7]:
# Build one validating Matcher and register each category pattern under its label.
matcher = Matcher(nlp.vocab, validate=True)
for label, pattern in [
    ("RANGE", range_pattern1),
    ("FRIDGE", refridgerator_pattern),
    ("APPLIANCE", appliances_pattern),
    ("MICROWAVE", microwave_pattern),
    ("OVEN", oven_pattern),
    ("WASHER", washer_pattern1),
    ("DRYER", dryer_pattern),
]:
    matcher.add(label, [pattern])

In [8]:
# Lazily parse the keywords, keeping only docs with at least one pattern match
# (an empty match list is falsy).
g = (d for d in nlp.pipe(titles) if matcher(d))
# islice stops quietly when fewer than 10 docs match; calling next(g) ten times
# would raise StopIteration in that case.
for doc in islice(g, 10):
    print(doc)

mini fridge
lg builtin oven'
over the oven microwave
fridge
lg wall oven
electric dryers in stock
mini fridge
microwave
smart oven pro
top load washer dryer sets


In [9]:
# Inspect the token attributes the patterns key on: text, POS, dependency and lemma.
test_text = nlp("washer and dryer set")
# test_text is already a parsed Doc, so iterate it directly. Feeding it back through
# nlp() only re-runs the pipeline and is not accepted by spaCy versions before 3.2.
[(token.text, token.pos_, token.dep_, token.lemma_) for token in test_text]

[('washer', 'NOUN', 'nsubj', 'washer'),
 ('and', 'CCONJ', 'cc', 'and'),
 ('dryer', 'NOUN', 'conj', 'dryer'),
 ('set', 'NOUN', 'ROOT', 'set')]

In [10]:
# Run the matcher on the already-parsed Doc (no second nlp() pass) and print every
# matched span; the match id is not needed here.
for _match_id, start, end in matcher(test_text):
    print("text: ",test_text, "\n", "Matches: ",test_text[start: end])

text:  washer and dryer set 
 Matches:  dryer set


In [9]:
# prettier way to display
from IPython.display import HTML as html_print

def style(s, bold=False):
    """Wrap ``s`` in a ``<text>`` element for HTML display.

    When ``bold`` is truthy the element is additionally wrapped in a ``<b>`` tag
    with a yellow background so it stands out as a highlight.
    """
    inner = f"<text>{s}</text>"
    if not bold:
        return inner
    return f"<b style='background-color: #fff59d'>{inner}</b>"

def html_generator(g, n=10):
    """Render up to ``n`` docs from ``g`` as HTML with matched tokens highlighted.

    Args:
        g: Iterable of parsed docs; each one is passed to the module-level
            ``matcher`` and iterated token by token.
        n: Maximum number of docs to take from ``g``.

    Returns:
        str: One ``<text>`` element per doc, each ending in ``<br>``. Tokens that
        fall inside any match span are wrapped by ``style(..., bold=True)``.
        An empty string if ``g`` yields nothing.
    """
    blob = ""
    # islice ends the loop when g runs out; bare next(g) raised StopIteration
    # whenever fewer than n docs were available.
    for doc in islice(g, max(n, 0)):
        highlighted = [False] * len(doc)
        for _match_id, start, end in matcher(doc):
            for pos in range(start, end):
                highlighted[pos] = True
        # Token text comes from raw search strings, so escape it before embedding it in
        # markup; otherwise characters such as "<" or "&" would corrupt the rendered HTML.
        line = ' '.join(
            style(html.escape(str(token)), bold=flag)
            for token, flag in zip(doc, highlighted)
        )
        blob += style(line + '<br>')
    return blob

In [10]:
# Keep only docs for which the matcher returns a non-empty (truthy) match list,
# then render the first ten as highlighted HTML.
g = filter(matcher, nlp.pipe(titles))
html_print(html_generator(g, n=10))

# Keyword similarity for labeling

In [11]:
from gensim.models import Word2Vec
import gensim
from gensim.utils import simple_preprocess
import os
# kernel kept crashing due to duplicate library - temporary fix
# NOTE(review): presumably the duplicate OpenMP runtime check; this flag only silences
# it rather than removing the duplicate - confirm and fix the environment instead.
os.environ['KMP_DUPLICATE_LIB_OK']='True'



In [12]:
# Tokenise every keyword (accents stripped) and keep only those with more than
# five tokens as training segments.
tokenized_titles = (simple_preprocess(title, deacc = True) for title in titles)
segments = [tokens for tokens in tokenized_titles if len(tokens) > 5]
print(segments[2])

['ge', 'jvm', 'rfss', 'over', 'the', 'range', 'microwave', 'oven', 'in', 'white']


In [13]:
# Train word vectors on the tokenised segments (min_count=5) and save the model as
# "demo.bin", a relative path resolved against the current working directory.
# NOTE(review): no seed is passed, so the similarity scores below presumably vary
# between runs - confirm whether reproducibility matters here.
model = Word2Vec(segments, min_count=5)
model.save("demo.bin")

In [14]:
# Reload the saved model from "demo.bin" and choose the word whose nearest
# neighbours are inspected next.
model = Word2Vec.load("demo.bin")
keyword = "samsung"

In [15]:
# The 20 words most similar to `keyword`, each paired with its similarity score.
# NOTE(review): presumably raises KeyError when `keyword` is not in the trained
# vocabulary (e.g. it fell below min_count) - confirm.
res = model.wv.similar_by_word(keyword, topn=20)

In [16]:
# Print one (word, similarity) pair per line.
for item in res:
    print(item)

('operated', 0.5106416940689087)
('glass', 0.4867037534713745)
('steam', 0.48629769682884216)
('dryer', 0.4719146192073822)
('full', 0.4703497886657715)
('with', 0.46883806586265564)
('washer', 0.4675185978412628)
('stainless', 0.4668442904949188)
('cu', 0.4636915624141693)
('machine', 0.46108725666999817)
('electric', 0.45737460255622864)
('capacity', 0.45009106397628784)
('ft', 0.44807517528533936)
('gas', 0.44602328538894653)
('fridge', 0.4452834129333496)
('steel', 0.4344782829284668)
('load', 0.4256725013256073)
('efficiency', 0.4254148006439209)
('commercial', 0.42449119687080383)
('and', 0.4223501682281494)


In [26]:
# Use the similar words found above, together with the relevant-keywords list from Excel, to build the search word list.

In [17]:
# Keywords whose presence in a tokenised search marks it as appliance-related.
# Fix: the correctly spelt "refrigerator" was missing, so properly spelt searches were
# never matched on this word. The misspelling is kept to still catch typo'd searches.
search_words = ["microwave", "induction","griddle","broiler","dishwasher",
                "refrigerator", "refridgerator", "cooktop" ,"dryer", "fridge", "freezer",
                "oven", "washing", "convection","machines","samsung","efficiency","load","commercial"]

In [18]:
# Label every tokenised segment that contains at least one search word as a positive
# example: (space-joined text, 1).
# Fix: the match is now evaluated per segment. The old `match` flag was never reset, so
# after the first hit every later segment was labelled too, and it was undefined
# (NameError) when the first segment contained no search word.
search_word_set = set(search_words)
train_data = [
    (" ".join(segment), 1)
    for segment in segments
    if not search_word_set.isdisjoint(segment)
]

In [19]:
# Show one random labelled example. random.choice always picks a valid element, whereas
# random.randint(0, len(train_data)) includes the upper bound and could raise IndexError.
print([random.choice(train_data)])

[('samsung stormwash top control built in dishwasher with autorelease dry', 1)]
