# In this notebook

We extend the latter half of notebook `16` for all 4 categories
- as in notebook `16`
    - we will derive a priority order of attributes
    - we will find similar attributes using nearest neighbor search
- in this notebook
    - we will investigate what the similar phrases look like for each of our categories
    - we will compute pos and neg sub-clusters 
        - we will estimate requisite probabilities
    - we will store the top few similar meaning phrases in the database to serve out using the API
    

In [1]:
import pandas as pd
import json

pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 1000

In [2]:
from sqlalchemy import create_engine
import psycopg2 
import io

In [3]:
import os
import glob

In [4]:
import pickle

In [5]:
import numpy as np

# Load attributes from database

In [6]:
conn_string = 'postgresql+psycopg2://gabbydbuser:gabbyDBpass@localhost:5432/gabbyDB'

In [7]:
db = create_engine(conn_string)
conn = db.connect()

# Building an attribute similarity module for easier saving and manual curation

- We build a class encompassing data and functions so that we can save and load models and vectors and other required data to efficiently compute nearest neighbor groups for manual curation

- Most of the reference code is in the Reference section (below)


In [108]:
import spacy
nlp = spacy.load('en_core_web_sm')
from sklearn.neighbors import NearestNeighbors
from tqdm.notebook import tqdm
tqdm.pandas()
import random
import numpy as np
import pickle


class SimilarAttributes(object):
    
    def __init__(self, category, db_conn):
        self.category = category

        print('Fetching attributes from DB')
        self.key_phrases = self._fetch_key_phrases_from_DB(db_conn)
        self.attributes = self.key_phrases['phrase'].tolist()

        print('Generating vectors matrix')
        self.phrase_vectors_arr = self._gen_spacy_vector_matrix()

        print('Building Nearest Neighbor Model')
        self.nn_model = self._build_nn_model()

        print('Computing mean intra-cluster distance')
        self.all_clusters_mean_distance = self._compute_mean_intra_cluster_distance()

        print('Computing priority order of attributes')
        self.priority_order = self.key_phrases.sort_values(['n_reviewers', 'n_products'], ascending=False)['key_phrase_id'].index.tolist()

        self.qphrases = {
            'laptop': ['Easy to set up', 'Quality display', 'Good color quality', 'Quality build', 
                        'Sound quality', 'Easy to use', 'Minimal glare', 'Lightweight', 'Good viewing angles', 
                        'Fast', 'long battery life', 'Noise level'],
            'monitor': ['Easy to set up', 'Quality display', 'Good color quality', 'Quality build', 
                        'Sound quality', 'Easy to use', 'Minimal glare', 'Lightweight', 'Good viewing angles', 'Fast'],
            'headphone': ['Good sound quality', 'comfortable', 'long battery life', 'Easy to set up', 
                            'Easy to use', 'Quality build', 'effective noise cancelling', 'Good bass', 'Call quality', 'Attractive'],
            'mouse': ['comfortable', 'long battery life', 'Easy to set up', 'Easy to use', 'Quality build', 
                        'Attractive', 'Noise level', 'Easy to clean'],
            'tv': ['Easy to use', 'Easy to set up', 'Sound quality', 'Fast input response', 'lightweight',
                     'Attractive', 'quality build', 'lacks durability', 'speed', 'craftsmanship', 'camera quality', 'remote quality']
        }


    def get_similar_attributes(self, qphrase, max_neighbors=100):
        qphrase_neighbors = self._nearest_neighbor_expansion(qphrase, max_neighbors=max_neighbors)
        qphrase_neighbors['key_phrase_id'] = qphrase_neighbors['neighbor_idx'].apply(lambda _row: self.key_phrases.iloc[_row]['key_phrase_id'])
        qattr_nn = qphrase_neighbors.merge(self.key_phrases, how='left', left_on='key_phrase_id', right_on='key_phrase_id').sort_values('n_reviews', ascending=False)
        qattr_nn['qphrase'] = qphrase
        return qattr_nn


    def _nearest_neighbor_expansion(self, qphrase, max_neighbors=100):
        # initialization
        qvec = nlp(qphrase).vector
        neighbor_distances, neighbor_indices = self.nn_model.kneighbors([qvec], n_neighbors=10)
        neighbor_indices = neighbor_indices[0]
        neighbor_distances =neighbor_distances[0]
        neighbor_attributes = [self.attributes[i] for i in neighbor_indices]
        cluster_mean_dist = neighbor_distances.mean()
        
        #print(neighbor_attributes)
        lc = 1      # increase in number of neighbors (10, 101, 10) to search for nearest neighor expansion
        while cluster_mean_dist < self.all_clusters_mean_distance and lc < max_neighbors/lc + 1:
            # we need to find more neighbors for this phrase until the mean intra-cluster distance is greater than the mean
            q_idx = neighbor_indices
            #print(len(q_idx))
            qvec = self.phrase_vectors_arr[q_idx].mean(axis=0)
            #print(qvec)
            distances, neighbors = self.nn_model.kneighbors([qvec], n_neighbors=10*lc)
            distances = distances[0]
            neighbors = neighbors[0]
            for knn in range(len(neighbors)):
                #print(knn, neighbors[knn])
                if neighbors[knn] not in neighbor_indices:
                    #print('added')
                    neighbor_indices = np.append(neighbor_indices, neighbors[knn])
                    neighbor_distances = np.append(neighbor_distances, distances[knn])
                    neighbor_attributes.append(self.attributes[neighbors[knn]])
            cluster_mean_dist = neighbor_distances.mean()
            lc += 1
        mean_distance_occurrence_phrases = pd.DataFrame({
            'neighbor_idx': neighbor_indices,
            'neighbor_attr': neighbor_attributes,
            'neighbor_distances': neighbor_distances,
        })
        
        return mean_distance_occurrence_phrases


    def _compute_mean_intra_cluster_distance(self):
        SAMPLE_SIZE = min(10000, len(self.attributes))
        all_clusters_mean_distance = 0 
        for i in tqdm(random.sample(range(len(self.attributes)), SAMPLE_SIZE)):
            qvec = self.phrase_vectors_arr[i]
            distances, neighbors = self.nn_model.kneighbors([qvec], n_neighbors=10)
            all_clusters_mean_distance += distances[0].mean()
        all_clusters_mean_distance /= SAMPLE_SIZE
        return all_clusters_mean_distance


    def _build_nn_model(self):
        nn_model = NearestNeighbors()
        nn_model.fit(self.phrase_vectors_arr)
        return nn_model

        
    def _gen_spacy_vector_matrix(self): 
        phrase_vectors = []
        self.key_phrases['phrase'].progress_apply(lambda p: phrase_vectors.append(nlp(p).vector))
        return np.vstack(phrase_vectors)

        
    def _fetch_key_phrases_from_DB(self, conn):
        key_phrases_query = \
            f'''
            SELECT KP.*, KS.n_reviews, KS.n_positive, KS.n_negative, KS.n_reviewers, KS.n_products
            FROM key_phrase_root KP, key_phrase_scores KS
            WHERE KP.key_phrase_id=KS.key_phrase_id
                and KP.category='{self.category}'
            '''
        key_phrases = pd.read_sql(key_phrases_query, conn)
        print('key_phrases.shape', key_phrases.shape)
        return key_phrases

    

    

In [109]:
category = 'headphone'
sim_attr = SimilarAttributes(category, conn)

Fetching attributes from DB
key_phrases.shape (8258, 8)
Generating vectors matrix


  0%|          | 0/8258 [00:00<?, ?it/s]

Building Nearest Neighbor Model
Computing mean intra-cluster distance


  0%|          | 0/8258 [00:00<?, ?it/s]

Computing priority order of attributes


In [110]:
with open(f"""similar_attributes_model_{category}""",'wb') as safile:
    pickle.dump(sim_attr, safile)

In [42]:
with open(f"""similar_attributes_model_{category}""",'rb') as safile:
    sim_attr = pickle.load(safile)

## Select curated terms
3 step process
1. pick those phrases that generate better looking terms; note the top ranking threshold for each phrase if needed
2. pick good looking ones from priority_order list; not the top ranking threshold for each phrase if needed
3. save the manually curated phrases and their respective top similar phrases into db



In [None]:
# some terms to search for after the similar attribute models have been saved/loaded 
'attractive design'

### curating pre-selected attributes

In [111]:
curated_attributes = pd.DataFrame()

In [112]:
sim_attr.category, sim_attr.qphrases[sim_attr.category]

('headphone',
 ['Good sound quality',
  'comfortable',
  'long battery life',
  'Easy to set up',
  'Easy to use',
  'Quality build',
  'effective noise cancelling',
  'Good bass',
  'Call quality',
  'Attractive'])

### curating priority ordered phrases

In [173]:
sim_attr.key_phrases.loc[sim_attr.priority_order[:25]] 

Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
32,328262,headphones,headphone,1072,777,295,1026,76
35,328265,stars,headphone,803,725,78,796,75
12,328242,sound,headphone,747,559,188,732,75
57,328294,great,headphone,628,517,111,620,74
438,328273,good,headphone,604,435,169,596,74
424,328340,sound quality,headphone,402,292,110,393,63
2,328232,comfortable,headphone,382,328,54,373,62
30,328260,great sound,headphone,313,270,43,309,57
70,328307,music,headphone,305,243,62,293,53
2386,328357,ears,headphone,293,193,100,289,56


### running the manual curation

In [178]:
qattr_nn = sim_attr.get_similar_attributes('good fit')
qattr_nn

Unnamed: 0,neighbor_idx,neighbor_attr,neighbor_distances,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products,qphrase
5,20,good sound,3.109022,328250,good sound,headphone,150,100,50,147,39,good fit
96,34,great product,2.65192,328264,great product,headphone,78,75,3,77,24,good fit
30,451,good sound quality,2.715785,328821,good sound quality,headphone,39,28,11,39,21,good fit
45,722,good quality,2.523376,329092,good quality,headphone,34,32,2,34,19,good fit
70,71,great sound quality,2.617246,328308,great sound quality,headphone,32,26,6,32,18,good fit
108,1134,great battery life,2.64386,329503,great battery life,headphone,18,14,4,18,9,good fit
2,1627,great fit,2.805354,329784,great fit,headphone,17,13,4,17,14,good fit
41,41,good pair,2.63661,328278,good pair,headphone,17,11,6,16,12,good fit
68,397,great bass,2.640036,328681,great bass,headphone,16,14,2,16,13,good fit
6,143,good bass,3.109315,328427,good bass,headphone,14,12,2,14,10,good fit


In [179]:
#qattr_nn.sort_values('neighbor_distances')
shortlist = qattr_nn[qattr_nn['neighbor_attr'].str.contains('fit')]
#shortlist = qattr_nn[qattr_nn['n_reviews'] >=2]
#shortlist = qattr_nn.head(4)
shortlist

Unnamed: 0,neighbor_idx,neighbor_attr,neighbor_distances,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products,qphrase
2,1627,great fit,2.805354,329784,great fit,headphone,17,13,4,17,14,good fit
13,833,comfortable fit,2.736926,329203,comfortable fit,headphone,14,13,1,14,10,good fit
0,4157,good fit,0.0,332494,good fit,headphone,7,4,3,7,5,good fit
1,827,nice fit,2.763144,329197,nice fit,headphone,4,4,0,4,4,good fit
7,3192,poor fit,3.13659,331315,poor fit,headphone,3,0,3,3,2,good fit
9,2838,tight fit,3.274879,331175,tight fit,headphone,3,2,1,3,3,good fit
12,4326,excellent fit,2.628612,332770,excellent fit,headphone,2,2,0,2,2,good fit
8,3383,bad fit,3.223564,331507,bad fit,headphone,1,1,0,1,1,good fit
10,8184,decent fit,2.557285,335619,decent fit,headphone,1,1,0,1,1,good fit
3,3951,new fit,3.039167,332288,new fit,headphone,1,1,0,1,1,good fit


In [180]:
curated_attributes = pd.concat([curated_attributes, shortlist])

In [181]:
curated_attributes.shape

(386, 12)

In [182]:
curated_attributes.groupby('qphrase')['qphrase'].count()

qphrase
awesome bass                   15
battery life                    5
comfortable fit               122
effective noise cancelling    124
good fit                       11
good sound quality            100
is durable                      5
light Weight                    4
Name: qphrase, dtype: int64

### storing the manually curated terms in the database

In [183]:
curated_attributes.head()

Unnamed: 0,neighbor_idx,neighbor_attr,neighbor_distances,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products,qphrase
0,451,good sound quality,0.0,328821,good sound quality,headphone,39,28,11,39,21,good sound quality
1,71,great sound quality,1.476568,328308,great sound quality,headphone,32,26,6,32,18,good sound quality
17,1229,excellent sound quality,1.897104,328744,excellent sound quality,headphone,19,19,0,19,12,good sound quality
21,195,amazing sound quality,1.903928,328479,amazing sound quality,headphone,13,11,2,13,12,good sound quality
16,623,poor sound quality,1.837685,328993,poor sound quality,headphone,9,1,8,8,8,good sound quality


In [184]:
curated_attributes.to_sql('shortlisted_attributes', con=conn, method='multi',
                            index=False, if_exists='append')

386

In [185]:
# API contenders
curated_attributes.sort_values('neighbor_distances').sort_values('n_reviews', ascending=False).groupby('qphrase')['phrase'].apply(list).reset_index()

Unnamed: 0,qphrase,phrase
0,awesome bass,"[great bass, good bass, strong bass, nice bass, heavy bass, big bass, decent bass, mild bass, great punchy bass, blissful bass, great bass response, great bass performance, great bass sound, reaso..."
1,battery life,"[battery life, volume control, price range, volume level, quality product]"
2,comfortable fit,"[good sound, good sound quality, great sound quality, excellent sound quality, comfortable fit, amazing sound quality, poor sound quality, great audio quality, good fit, good quality sound, high q..."
3,effective noise cancelling,"[good battery life, great noise cancelling, active noise cancellation, good noise cancellation, active noise cancelling, good noise isolation, great customer support, good noise cancelling, great ..."
4,good fit,"[great fit, comfortable fit, good fit, nice fit, poor fit, tight fit, excellent fit, bad fit, comfortable fit battery, new fit, decent fit]"
5,good sound quality,"[good sound quality, great sound quality, excellent sound quality, amazing sound quality, poor sound quality, great audio quality, good quality sound, high quality sound, decent sound quality, goo..."
6,is durable,"[works great, works good, sounds good, sounds excellent, gels tight]"
7,light Weight,"[light weight, light usage, light weight easy, light music]"


# Reference

### Getting key_phrases for product category

In [None]:
category='mouse'

In [None]:
key_phrases_query = \
    f'''
    SELECT KP.*, KS.n_reviews, KS.n_positive, KS.n_negative, KS.n_reviewers, KS.n_products
    FROM key_phrase_root KP, key_phrase_scores KS
    WHERE KP.key_phrase_id=KS.key_phrase_id
        and KP.category='{category}'
    '''
key_phrases = pd.read_sql(key_phrases_query, conn)

In [None]:
key_phrases.shape

(61048, 8)

In [None]:
key_phrases.head()

Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
0,267286,wrists,mouse,125,102,23,122,49
1,267183,way,mouse,835,552,283,776,389
2,267184,price,mouse,3597,3161,436,3345,813
3,267185,high,mouse,79,47,32,76,62
4,267186,works,mouse,3117,2850,267,2790,681


## Build a nearest neighbor model for phrases

In [439]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [440]:
from sklearn.neighbors import NearestNeighbors

In [441]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [442]:
# get spacy vectors for attributes

phrase_vectors = []

key_phrases['phrase'].progress_apply(lambda p: phrase_vectors.append(nlp(p).vector))

  0%|          | 0/61048 [00:00<?, ?it/s]

0        None
1        None
2        None
3        None
4        None
         ... 
61043    None
61044    None
61045    None
61046    None
61047    None
Name: phrase, Length: 61048, dtype: object

In [443]:
phrase_vectors_arr = np.vstack(phrase_vectors)

In [444]:
phrase_vectors_arr.shape

(61048, 96)

In [445]:
nn_model = NearestNeighbors()
nn_model.fit(phrase_vectors_arr)

In [446]:
q = nlp('Easy to set up').vector

In [447]:
nn_model.kneighbors([q], n_neighbors=10, return_distance=False)

array([[25945, 21237,  9097, 27115, 35705, 53917, 33791, 26955, 15168,
        20056]])

### checking out the NN model

In [448]:
def get_nearest_attributes(attribute, k=5):
    q = nlp(attribute).vector
    neighbors = nn_model.kneighbors([q], n_neighbors=k, return_distance=False)
    return neighbors[0]

In [449]:
qterms = ['Easy to set up', 'Quality display', 
            'Good color quality', 'Quality build', 
            'Sound quality', 
            'Minimal glare', 'Lightweight', 
            'Good viewing angles', 'Fast']

In [450]:
for q in qterms:
    print(q)
    neighbors =  get_nearest_attributes(q, 10)
    display(key_phrases.iloc[neighbors])
    

Easy to set up


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
25945,293198,countour perfit mouse,mouse,1,1,0,1,1
21237,288454,small rf receiver,mouse,1,1,0,1,1
9097,275626,go-to gaming mouse,mouse,1,1,0,1,1
27115,294332,go-to repacement,mouse,1,1,0,1,1
35705,302924,replace battery light,mouse,1,0,1,1,1
53917,321099,travel friendly,mouse,1,1,0,1,1
33791,301023,wee rf link,mouse,1,1,0,1,1
26955,294186,go-to wireless combo setup,mouse,1,1,0,1,1
15168,282409,comfy nice looking mouse,mouse,1,0,1,1,1
20056,287307,nice feature set including freedom,mouse,1,1,0,1,1


Quality display


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
39208,306431,great value purchase,mouse,1,1,0,1,1
27163,294359,great package deal,mouse,2,2,0,2,2
56896,324115,great screen resolution,mouse,1,1,0,1,1
39445,306617,large screen tv,mouse,1,1,0,1,1
35029,302264,great screen protector,mouse,2,1,1,2,1
56932,324145,great resolution screen,mouse,1,1,0,1,1
4713,271981,gateway2000 anykey,mouse,1,1,0,1,1
21449,288640,great performance mouse,mouse,1,1,0,1,1
38858,306134,business buy,mouse,1,1,0,1,1
28023,295251,abd play,mouse,1,1,0,1,1


Good color quality


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
43282,310568,poor video quality,mouse,1,0,1,1,1
43082,310278,great color selection,mouse,1,1,0,1,1
5492,272693,great construction quality,mouse,1,1,0,1,1
26222,293432,good reception distance,mouse,1,1,0,1,1
34047,301279,good product quality,mouse,4,4,0,3,4
50638,317830,great quality value,mouse,1,1,0,1,1
49628,316862,nice entry level workstation,mouse,1,1,0,1,1
8225,274766,fine ergonomic shape,mouse,1,1,0,1,1
47324,314478,great combination keyboard,mouse,1,1,0,1,1
53782,320981,wrong color purple,mouse,1,1,0,1,1


Quality build


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
42121,309323,great quality build,mouse,2,2,0,2,2
2323,269063,quality build,mouse,8,8,0,5,8
1188,268452,good quality build,mouse,2,1,1,2,2
36855,304059,computer build,mouse,1,1,0,1,1
51187,318434,quality product build,mouse,1,1,0,1,1
44484,311595,new computer build,mouse,1,1,0,1,1
37341,304545,budget build,mouse,3,3,0,3,3
38858,306134,business buy,mouse,1,1,0,1,1
61030,326974,value buy,mouse,1,1,0,1,1
25665,292845,low quality build,mouse,2,0,2,2,2


Sound quality


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
60082,327363,terrible sound quality,mouse,1,0,1,1,1
33253,300485,terrible quality,mouse,5,0,5,5,4
55364,322609,plastic thickness,mouse,1,1,0,1,1
38448,305737,great picture quality,mouse,2,2,0,2,2
60999,326945,good sound quality,mouse,1,1,0,1,1
50638,317830,great quality value,mouse,1,1,0,1,1
45857,313094,video quality,mouse,2,0,2,2,2
59152,326384,great gateway keyboard,mouse,1,1,0,1,1
44358,311540,bad quality material,mouse,1,0,1,1,1
59789,327078,sound device,mouse,1,1,0,1,1


Minimal glare


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
42129,309331,positive feedback,mouse,1,1,0,1,1
39839,307118,minimal success,mouse,1,0,1,1,1
32389,299678,positive contact,mouse,1,1,0,1,1
20709,287891,noticeable change,mouse,1,1,0,1,1
2801,270068,huge trackball,mouse,1,1,0,1,1
29659,296920,great color exact,mouse,3,3,0,1,3
48570,315826,popular brand,mouse,1,1,0,1,1
58686,325864,dismal software,mouse,1,0,1,1,1
57529,324538,expensive garbage,mouse,1,0,1,1,1
60934,328168,colorful glare,mouse,1,1,0,1,1


Lightweight


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
22334,289598,mush,mouse,3,2,1,3,3
26463,293645,mk260,mouse,2,1,1,2,2
5736,272913,diamondback,mouse,2,1,1,2,2
26028,293273,usb mose,mouse,1,0,1,1,1
18050,285087,lcc330,mouse,1,0,1,1,1
22419,289676,mx110,mouse,1,0,1,1,1
28795,296057,bluetooth light,mouse,1,1,0,1,1
5117,272352,dad,mouse,44,41,3,42,37
2882,270144,glide,mouse,3,3,0,3,3
26584,293865,usb male,mouse,1,1,0,1,1


Good viewing angles


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
57254,324440,good viewing angles,mouse,1,1,0,1,1
40997,308188,deep stopping points,mouse,1,1,0,1,1
35069,301487,good tracking capabilities,mouse,1,0,1,1,1
28319,294664,good typing experience,mouse,1,1,0,1,1
39294,306104,basic typing tasks,mouse,1,1,0,1,1
20912,288173,smooth scrolling wheels,mouse,1,1,0,1,1
53607,320865,great typing experience,mouse,1,1,0,1,1
57984,324577,nice typing experience,mouse,1,1,0,1,1
18337,285556,personal gaming computers,mouse,1,0,1,1,1
42184,309374,cheap gaming things,mouse,1,0,1,1,1


Fast


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
54430,321615,dual,mouse,1,1,0,1,1
397,267686,fast,mouse,363,327,36,322,195
29127,296335,genuine,mouse,2,2,0,2,2
47780,314992,great fast,mouse,1,1,0,1,1
26752,294013,nice fast,mouse,1,1,0,1,1
11918,279107,little slow,mouse,24,18,6,23,21
15495,282702,little fat,mouse,2,1,1,2,2
5649,272837,little delicate,mouse,1,1,0,1,1
9759,276985,real,mouse,24,14,10,24,23
60131,327406,perfect little,mouse,1,1,0,1,1


## Trying with an unsupervised clustering model

NOTE: we can't use generic unsupervised clustering methods because we don't know how many clusters can be there. Instead we do the following


1. we first find a 5 or 10 nearest neighbors for each attribute
    - we then group/merge attributes together into clusters based on 
        - mean distance between neighbors across all neighbor sets
        
        
2.  we can cluster together, but also try to differentiate between positive and negative attribute clusters within the same semantic subspace
    - e.g. q = "Good color quality"
        - positive equivalent in subspace = "excellent color quality"
        - negative equivalent in subspace = "poor color quality"
    - OR:
        - THIS [just looking at counts now] --> within each cluster, we subgroup based on probability of occurrence in positive or negative reviews

### find nearest neighbors for each phrase

In [451]:
# keeping a list of attributes
attributes = key_phrases['phrase'].tolist()
key_phase_idx = key_phrases.index

In [452]:
key_phrases

Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
0,267286,wrists,mouse,125,102,23,122,49
1,267183,way,mouse,835,552,283,776,389
2,267184,price,mouse,3597,3161,436,3345,813
3,267185,high,mouse,79,47,32,76,62
4,267186,works,mouse,3117,2850,267,2790,681
...,...,...,...,...,...,...,...,...
61043,326987,kvm extender,mouse,1,1,0,1,1
61044,326988,hyper x3,mouse,1,1,0,1,1
61045,326989,ergonomic anker,mouse,1,1,0,1,1
61046,326990,best iron man purchase,mouse,1,1,0,1,1


In [453]:
import random

In [454]:
# for each phrase, for it's nearest neighbors, let's comput the inter-neighbor distance
# this is taking too long. We'll use a sample of 10000 to compute mean distance

SAMPLE_SIZE = 10000

all_clusters_mean_distance = 0 

for i in tqdm(random.sample(range(len(attributes)), SAMPLE_SIZE)):
    qvec = phrase_vectors_arr[i]
    distances, neighbors = nn_model.kneighbors([qvec], n_neighbors=10)
    all_clusters_mean_distance += distances[0].mean()

all_clusters_mean_distance /= SAMPLE_SIZE

all_clusters_mean_distance

  0%|          | 0/10000 [00:00<?, ?it/s]

2.993533144438267

### priority order for cluster generation
- to generate clusters, we can't really go first come first serve, or random, because the clustering could then be inconsistent with reality
- we can try to start with the phrases that occur most frequently in reviews, and go down the ranked list imposed by popularity
    - this is also easier to explain

In [455]:
priority_order = key_phrases.sort_values(['n_reviewers', 'n_products'], ascending=False)['key_phrase_id'].index.tolist()

In [456]:
key_phrases.loc[priority_order[:10]]

Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
19,267201,stars,mouse,27739,25113,2626,22539,1763
9,267191,mouse,mouse,25481,17690,7791,22345,1746
50,267232,great,mouse,12642,11530,1112,11187,1487
550,267305,good,mouse,6973,5630,1343,6246,1235
10,267192,keyboard,mouse,6006,4120,1886,5759,632
128,267417,great mouse,mouse,4668,4408,260,4181,728
37,267219,easy,mouse,3721,3478,243,3355,814
2,267184,price,mouse,3597,3161,436,3345,813
981,267392,hand,mouse,3400,2666,734,3097,681
4,267186,works,mouse,3117,2850,267,2790,681


### Phrases close to the given terms

In [457]:
qphrases = {
    'laptop': ['Easy to set up', 'Quality display', 'Good color quality', 'Quality build', 'Sound quality', 'Easy to use', 'Minimal glare', 'Lightweight', 'Good viewing angles', 'Fast', 'long battery life', 'Noise level'],
    'monitor': ['Easy to set up', 'Quality display', 'Good color quality', 'Quality build', 'Sound quality', 'Easy to use', 'Minimal glare', 'Lightweight', 'Good viewing angles', 'Fast'],
    'headphone': ['Good sound quality', 'comfortable', 'long battery life', 'Easy to set up', 'Easy to use', 'Quality build', 'effective noise cancelling', 'Good bass', 'Call quality', 'Attractive'],
    'mouse': ['comfortable', 'long battery life', 'Easy to set up', 'Easy to use', 'Quality build', 'Attractive', 'Noise level', 'Easy to clean'],
    'tv': ['Easy to use', 'Easy to set up', 'Sound quality', 'Fast input response', 'lightweight', 'Attractive', 'quality build', 'lacks durability', 'speed', 'craftsmanship', 'camera quality', 'remote quality']
}

### distance based clusters
- we add those phrases to clusters that are within the average intra-cluster distance
- THIS method is MORE efficient, and the terms seems quite nice

In [458]:

def nearest_neighbor_expansion(all_clusters_mean, max_neighbors, phrase_vectors_arr, attributes, nn, qphrase):
    # initialization
    qvec = nlp(qphrase).vector
    neighbor_distances, neighbor_indices = nn.kneighbors([qvec], n_neighbors=10)
    neighbor_indices = neighbor_indices[0]
    neighbor_distances =neighbor_distances[0]
    neighbor_attributes = [attributes[i] for i in neighbor_indices]
    cluster_mean_dist = neighbor_distances.mean()
    #print(cluster_mean_dist, all_clusters_mean)
    #print(neighbor_attributes)

    lc = 1      # increase in number of neighbors (10, 101, 10) to search for nearest neighor expansion
    while cluster_mean_dist < all_clusters_mean and lc < max_neighbors/lc + 1:
        # we need to find more neighbors for this phrase until the mean intra-cluster distance is greater than the mean
        q_idx = neighbor_indices
        #print(len(q_idx))
        qvec = phrase_vectors_arr[q_idx].mean(axis=0)
        #print(qvec)
        distances, neighbors = nn.kneighbors([qvec], n_neighbors=10*lc)
        distances = distances[0]
        neighbors = neighbors[0]
        for knn in range(len(neighbors)):
            #print(knn, neighbors[knn])
            if neighbors[knn] not in neighbor_indices:
                #print('added')
                neighbor_indices = np.append(neighbor_indices, neighbors[knn])
                neighbor_distances = np.append(neighbor_distances, distances[knn])
                neighbor_attributes.append(attributes[neighbors[knn]])
        cluster_mean_dist = neighbor_distances.mean()
        lc += 1

    mean_distance_occurrence_phrases = pd.DataFrame({
        'neighbor_idx': neighbor_indices,
        'neighbor_attr': neighbor_attributes,
        'neighbor_distances': neighbor_distances,
    })
    #print(cluster_mean_dist, all_clusters_mean, lc)
    return mean_distance_occurrence_phrases


In [459]:
qphrases[category]

['comfortable',
 'long battery life',
 'Easy to set up',
 'Easy to use',
 'Quality build',
 'Attractive',
 'Noise level',
 'Easy to clean']

In [460]:
qphrase_neighbors = nearest_neighbor_expansion(all_clusters_mean_distance, 100, phrase_vectors_arr, attributes, nn_model, 'attractive')
qphrase_neighbors.sort_values('neighbor_distances')

Unnamed: 0,neighbor_idx,neighbor_attr,neighbor_distances
0,4908,attractive,0.0
1,12797,selectable,3.822903
2,30753,angry,4.002241
3,41633,comfortasble,4.02102
4,752,effective,4.076833
5,9609,impressive,4.078627
6,26259,evident,4.336577
7,36779,generous,4.381159
8,6428,inadequate,4.40151
9,60319,inconspicuous,4.488169


In [461]:
qphrase_neighbors['key_phrase_id'] = qphrase_neighbors['neighbor_idx'].apply(lambda _row: key_phrases.iloc[_row]['key_phrase_id'])

In [462]:
qphrase_neighbors.merge(key_phrases, how='left', left_on='key_phrase_id', right_on='key_phrase_id').sort_values('n_reviews', ascending=False)

Unnamed: 0,neighbor_idx,neighbor_attr,neighbor_distances,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
4,752,effective,4.076833,267969,effective,mouse,96,89,7,91,73
0,4908,attractive,0.0,272148,attractive,mouse,80,75,5,63,62
5,9609,impressive,4.078627,276849,impressive,mouse,30,27,3,27,28
2,30753,angry,4.002241,297607,angry,mouse,8,4,4,6,8
6,26259,evident,4.336577,293467,evident,mouse,2,1,1,2,2
7,36779,generous,4.381159,303983,generous,mouse,2,1,1,2,2
1,12797,selectable,3.822903,280029,selectable,mouse,1,1,0,1,1
3,41633,comfortasble,4.02102,308816,comfortasble,mouse,1,1,0,1,1
8,6428,inadequate,4.40151,273062,inadequate,mouse,1,0,1,1,1
9,60319,inconspicuous,4.488169,327582,inconspicuous,mouse,1,1,0,1,1


In [463]:
def get_similar_attributes(key_phrases, all_clusters_mean, max_neighbors, phrase_vectors_arr, attributes, nn_model, qphrase):
    qphrase_neighbors = nearest_neighbor_expansion(all_clusters_mean_distance, 100, phrase_vectors_arr, attributes, nn_model, qphrase)
    qphrase_neighbors['key_phrase_id'] = qphrase_neighbors['neighbor_idx'].apply(lambda _row: key_phrases.iloc[_row]['key_phrase_id'])
    qattr_nn = qphrase_neighbors.merge(key_phrases, how='left', left_on='key_phrase_id', right_on='key_phrase_id').sort_values('n_reviews', ascending=False)
    qattr_nn['qphrase'] = qphrase
    return qattr_nn 

In [464]:
get_similar_attributes(key_phrases, all_clusters_mean_distance, 100, phrase_vectors_arr, attributes, nn_model, 'attractive')

Unnamed: 0,neighbor_idx,neighbor_attr,neighbor_distances,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products,qphrase
4,752,effective,4.076833,267969,effective,mouse,96,89,7,91,73,attractive
0,4908,attractive,0.0,272148,attractive,mouse,80,75,5,63,62,attractive
5,9609,impressive,4.078627,276849,impressive,mouse,30,27,3,27,28,attractive
2,30753,angry,4.002241,297607,angry,mouse,8,4,4,6,8,attractive
6,26259,evident,4.336577,293467,evident,mouse,2,1,1,2,2,attractive
7,36779,generous,4.381159,303983,generous,mouse,2,1,1,2,2,attractive
1,12797,selectable,3.822903,280029,selectable,mouse,1,1,0,1,1,attractive
3,41633,comfortasble,4.02102,308816,comfortasble,mouse,1,1,0,1,1,attractive
8,6428,inadequate,4.40151,273062,inadequate,mouse,1,0,1,1,1,attractive
9,60319,inconspicuous,4.488169,327582,inconspicuous,mouse,1,1,0,1,1,attractive


# SCratch

In [268]:
for qphrase in monitor:
    qvec = nlp(qphrase).vector
    _, __n = nn.kneighbors([qvec], n_neighbors=10)
    print(__n)
    print(qphrase)
    print( [attributes[i] for i in __n[0]])

[[12020  7589 18877 21085 17309 23563 19171 19177 21558 28605]]
Easy to set up
['awesome built-in surround sound', 'go-to monitor', 'excellent widescreen hp monitor', 'menu buttons faulty', 'nice freestanding stand', 'lg support unable', 'excellent built-in webcam', 'cost effective', 'expensive af', 'ok works good']
[[ 3756 21111 10550  9048 21219 30179 20828 18312 12656 13617]]
Quality display
['great picture display', 'gtx980 strix', 'quality display', 'great quality display', 'computer display', 'resolution display', 'nice quality display', 'great screen resolution', 'wide screen display', 'color display']
[[24138 14376 28118 27779 11638  3104 21910 10059 22017  3669]]
Good color quality
['great color definition', 'poor color quality', 'poor color accuracy', 'great video quality', 'nice video quality', 'excellent color quality', 'nice color definition', 'great color production', 'poor color rendition', 'good video quality']
[[11460 13956 11194  3093 16329 18830 11510 13618 19582 165

In [278]:
qvec = nlp('durability').vector
_, __n = nn.kneighbors([qvec], n_neighbors=10)
print(__n)
print(qphrase)
print( [attributes[i] for i in __n[0]])

[[ 1799  5790   132  1164 26156 27804  1122 30006 23119  2687]]
Fast
['durability', 'violation', 'flexibility', 'portability', 'malfunction', 'continuity', 'device', 'cage', 'coloration', 'configuration']
