# In this notebook

We extend the latter half of notebook `16` for all 4 categories
- as in notebook `16`
    - we will derive a priority order of attributes
    - we will find similar attributes using nearest neighbor search
- in this notebook
    - we will investigate what the similar phrases look like for each of our categories
    - we will compute pos and neg sub-clusters 
        - we will estimate requisite probabilities
    - we will store the top few similar meaning phrases in the database to serve out using the API
    

In [40]:
import pandas as pd
import json

pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 1000

In [41]:
from sqlalchemy import create_engine
import psycopg2 
import io

In [42]:
import os
import glob

In [43]:
import pickle

In [44]:
import numpy as np

# Load attributes from database

In [45]:
conn_string = 'postgresql+psycopg2://gabbydbuser:gabbyDBpass@localhost:5432/gabbyDB'

In [46]:
db = create_engine(conn_string)
conn = db.connect()

### Getting key_phrases for product category

In [47]:
category='monitor'

In [48]:
key_phrases_query = \
    f'''
    SELECT KP.*, KS.n_reviews, KS.n_positive, KS.n_negative, KS.n_reviewers, KS.n_products
    FROM key_phrase_root KP, key_phrase_scores KS
    WHERE KP.key_phrase_id=KS.key_phrase_id
        and KP.category='{category}'
    '''
key_phrases = pd.read_sql(key_phrases_query, conn)

In [49]:
key_phrases.shape

(31970, 8)

In [50]:
key_phrases.head()

Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
0,0,awesome sound,monitor,1,1,0,1,1
1,1,price,monitor,2091,1894,197,2045,499
2,2,good range,monitor,7,6,1,7,7
3,3,season porch,monitor,1,1,0,1,1
4,4,planar lcd monitor,monitor,1,1,0,1,1


# Build a nearest neighbor model for phrases

In [51]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [52]:
from sklearn.neighbors import NearestNeighbors

In [53]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [54]:
# get spacy vectors for attributes

phrase_vectors = []

key_phrases['phrase'].progress_apply(lambda p: phrase_vectors.append(nlp(p).vector))

  0%|          | 0/31970 [00:00<?, ?it/s]

0        None
1        None
2        None
3        None
4        None
         ... 
31965    None
31966    None
31967    None
31968    None
31969    None
Name: phrase, Length: 31970, dtype: object

In [55]:
#phrase_vectors[0]

In [62]:
phrase_vectors_arr = np.vstack(phrase_vectors)

In [63]:
phrase_vectors_arr.shape

(31970, 96)

In [64]:
nn = NearestNeighbors()
nn.fit(phrase_vectors_arr)

In [65]:
q = nlp('Easy to set up').vector

In [66]:
nn.kneighbors([q], n_neighbors=10, return_distance=False)

array([[14107,  9993, 21055, 23027,  3529]])

### checking out the NN model

In [67]:
def get_nearest_attributes(attribute, k=5):
    q = nlp(attribute).vector
    neighbors = nn.kneighbors([q], n_neighbors=k, return_distance=False)
    return neighbors[0]

In [68]:
qterms = ['Easy to set up', 'Quality display', 
            'Good color quality', 'Quality build', 
            'Sound quality', 
            'Minimal glare', 'Lightweight', 
            'Good viewing angles', 'Fast']

In [69]:
for q in qterms:
    print(q)
    neighbors =  get_nearest_attributes(q, 10)
    display(key_phrases.iloc[neighbors])
    

Easy to set up


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
14107,14107,awesome built-in surround sound,monitor,1,1,0,1,1
9993,9993,go-to monitor,monitor,1,1,0,1,1
21055,21055,excellent widescreen hp monitor,monitor,1,1,0,1,1
23027,23027,menu buttons faulty,monitor,1,0,1,1,1
3529,3529,great monitor--with calibration,monitor,2,2,0,1,1
19229,19229,nice freestanding stand,monitor,1,1,0,1,1
25296,25296,lg support unable,monitor,1,0,1,1,1
21237,21237,excellent built-in webcam,monitor,1,1,0,1,1
21243,21243,cost effective,monitor,3,3,0,3,3
23518,23518,expensive af,monitor,1,1,0,1,1


Quality display


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
6264,6264,great picture display,monitor,1,1,0,1,1
23055,23055,gtx980 strix,monitor,1,1,0,1,1
12648,12648,quality display,monitor,6,5,1,6,5
11400,11400,great quality display,monitor,4,2,2,4,4
23138,23138,computer display,monitor,2,1,1,2,2
31917,31917,resolution display,monitor,1,0,1,1,1
22646,22646,nice quality display,monitor,1,1,0,1,1
20162,20162,great screen resolution,monitor,1,1,0,1,1
14764,14764,wide screen display,monitor,1,1,0,1,1
15622,15622,color display,monitor,3,2,1,3,3


Good color quality


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
26043,26043,great color definition,monitor,1,0,1,1,1
16223,16223,poor color quality,monitor,2,0,2,2,2
29859,29859,poor color accuracy,monitor,1,1,0,1,1
29512,29512,great video quality,monitor,1,1,0,1,1
13636,13636,nice video quality,monitor,2,2,0,2,2
5892,5892,excellent color quality,monitor,1,1,0,1,1
23953,23953,nice color definition,monitor,1,1,0,1,1
12252,12252,great color production,monitor,1,1,0,1,1
23741,23741,poor color rendition,monitor,2,0,2,2,2
6187,6187,good video quality,monitor,5,4,1,5,5


Quality build


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
13463,13463,quality build,monitor,4,4,0,4,4
16007,16007,good quality build,monitor,1,1,0,1,1
13242,13242,new system build,monitor,2,2,0,2,2
5882,5882,pc build,monitor,2,2,0,2,2
18560,18560,mst setting,monitor,1,1,0,1,1
20987,20987,gaming build,monitor,1,1,0,1,1
13558,13558,nice build,monitor,1,0,1,1,1
15623,15623,new desktop build,monitor,1,0,1,1,1
21576,21576,grandma pc build,monitor,1,1,0,1,1
18425,18425,dust cover,monitor,1,1,0,1,1


Sound quality


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
1321,1321,great sound quality,monitor,3,3,0,3,3
30496,30496,sharpness quality,monitor,1,1,0,1,1
702,702,excellent sound quality,monitor,1,1,0,1,1
1040,1040,terrible quality,monitor,4,1,3,4,4
1539,1539,audiophile quality,monitor,1,1,0,1,1
29512,29512,great video quality,monitor,1,1,0,1,1
4448,4448,great picture quality,monitor,52,48,4,52,37
728,728,good sound quality,monitor,5,5,0,5,4
30864,30864,great sound bar,monitor,1,1,0,1,1
16229,16229,bad sound quality,monitor,2,0,2,2,2


Minimal glare


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
9156,9156,minimal glare,monitor,1,1,0,1,1
23772,23772,noticeable impact,monitor,1,1,0,1,1
29043,29043,positive experience,monitor,1,1,0,1,1
10211,10211,minimal work,monitor,1,1,0,1,1
29780,29780,specific space,monitor,1,1,0,1,1
21106,21106,noticeable strain,monitor,1,1,0,1,1
13841,13841,great glossy monitor,monitor,1,1,0,1,1
25715,25715,actual assembly,monitor,1,0,1,1,1
5768,5768,professional work,monitor,2,2,0,2,2
30803,30803,magnificent view,monitor,1,1,0,1,1


Lightweight


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
10158,10158,s220hql,monitor,1,1,0,1,1
29822,29822,deployment,monitor,1,1,0,1,1
13019,13019,g206hql,monitor,1,0,1,1,1
22290,22290,monitory,monitor,2,2,0,2,2
6121,6121,dad,monitor,19,17,2,19,18
30502,30502,usb male,monitor,1,0,1,1,1
2245,2245,inclusion,monitor,1,0,1,1,1
12269,12269,motherboard,monitor,6,4,2,6,6
4426,4426,lo traje,monitor,1,1,0,1,1
27607,27607,tripod mound,monitor,1,1,0,1,1


Good viewing angles


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
17663,17663,good viewing angles,monitor,5,5,0,5,4
14299,14299,great viewing angles,monitor,8,8,0,8,7
10068,10068,nice viewing angles,monitor,1,1,0,1,1
12731,12731,terrible viewing angles,monitor,4,0,4,4,3
12329,12329,poor viewing angles,monitor,9,3,6,9,5
12599,12599,abysmal viewing angles,monitor,1,1,0,1,1
22439,22439,superior viewing angles,monitor,1,1,0,1,1
5759,5759,different viewing angles,monitor,2,2,0,2,2
27076,27076,extreme viewing angles,monitor,1,1,0,1,1
16914,16914,decent viewing angles,monitor,2,1,1,2,2


Fast


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
552,552,dual,monitor,28,22,6,28,20
347,347,fast,monitor,56,56,0,56,51
30021,30021,genuine,monitor,1,1,0,1,1
25075,25075,fast inexpensive,monitor,1,0,1,1,1
9949,9949,little slow,monitor,4,4,0,4,3
8157,8157,real,monitor,12,8,4,12,12
23794,23794,macbook pro late,monitor,1,1,0,1,1
11655,11655,separate hdmi,monitor,1,1,0,1,1
2588,2588,long,monitor,30,27,3,30,28
26201,26201,little pricey,monitor,1,1,0,1,1


# Trying with an unsupervised clustering model

NOTE: we can't use generic unsupervised clustering methods because we don't know how many clusters can be there. Instead we do the following


1. we first find a 5 or 10 nearest neighbors for each attribute
    - we then group/merge attributes together into clusters based on 
        - mean distance between neighbors across all neighbor sets
        
        
2.  we can cluster together, but also try to differentiate between positive and negative attribute clusters within the same semantic subspace
    - e.g. q = "Good color quality"
        - positive equivalent in subspace = "excellent color quality"
        - negative equivalent in subspace = "poor color quality"
    - OR:
        - within each cluster, we subgroup based on probability of occurrence in positive or negative reviews

## find nearest neighbors for each phrase

In [72]:
attributes = key_phrases['phrase'].tolist()
neighbor_clusters = []
for index, phrase in tqdm(key_phrases['phrase'].items()):
    qvec = phrase_vectors_arr[index]
    distances, neighbors = nn.kneighbors([qvec], n_neighbors=10)
    neighbor_clusters.append({
        'phrase_idx': index,
        'phrase': phrase,
        'neighbor_idx': neighbors[0],
        'neighbor_dist': distances[0],
        'neighbor_attr': [attributes[i] for i in neighbors[0]]
    })
    
    
    

0it [00:00, ?it/s]

In [73]:
neighbor_clusters[10]

{'phrase_idx': 10,
 'phrase': 'excellent picture',
 'neighbor_idx': array([   10,  1078,  5674, 29308, 27967, 19328,  8959, 26626,  1044,
        26085]),
 'neighbor_dist': array([0.       , 2.3182387, 2.3505652, 2.3875034, 2.3893127, 2.5184553,
        2.5446901, 2.5491073, 2.5552864, 2.642936 ], dtype=float32),
 'neighbor_attr': ['excellent picture',
  'decent picture',
  'excellent condition',
  'excellent construction',
  'immersive picture',
  'excellent image',
  'excellent functionality',
  'excellent system',
  'excellent product',
  'bautiful picture']}

## priority order for cluster generation
- to generate clusters, we can't really go first come first serve, or random, because the clustering could then be inconsistent with reality
- we can try to start with the phrases that occur most frequently in reviews, and go down the ranked list imposed by popularity
    - this is also easier to explain

In [74]:
priority_order = key_phrases.sort_values(['n_reviewers', 'n_products'], ascending=False)['key_phrase_id'].index.tolist()

In [75]:
key_phrases.loc[priority_order[:10]]

Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
44,44,stars,monitor,7510,6984,526,7005,758
19,19,monitor,monitor,6533,4900,1633,6224,714
59,59,great,monitor,3616,3359,257,3505,671
36,36,good,monitor,2387,1983,404,2303,584
441,441,great monitor,monitor,2240,2130,110,2189,424
1,1,price,monitor,2091,1894,197,2045,499
495,495,screen,monitor,2048,1428,620,1969,556
47,47,easy,monitor,1126,1068,58,1103,388
32,32,nice,monitor,1069,962,107,1040,415
1045,1045,monitors,monitor,993,752,241,970,301


## distance based clusters
- we add those phrases to clusters that are within the average intra-cluster distance
- THIS method is MORE efficient, and the terms seems quite nice

In [76]:
cluster_dists = np.zeros(len(priority_order))
for nc in neighbor_clusters:
    i = nc['phrase_idx']
    cluster_dists[i] = nc['neighbor_dist'].mean()

In [77]:
all_clusters_mean = cluster_dists.mean()
all_clusters_mean

3.1550051269361217

In [78]:
import plotly.express as px

In [79]:
px.histogram(cluster_dists)

In [80]:
np.median(cluster_dists)

2.9473187923431396

In [259]:
phrase_ids = []
phrases = []
nns = []
new_nns = []


for p in tqdm(priority_order):
    nc = neighbor_clusters[p]
    phrase_ids.append(p)
    phrases.append(nc['phrase'])
    nns.append(nc['neighbor_attr'])
    #print(nc['neighbor_idx'].shape)
    
    cluster_mean_dist = nc['neighbor_dist'].mean()
    lc = 1      # increase in number of neighbors (10, 101, 20) to search for nearest neighor expansion
    while cluster_mean_dist < all_clusters_mean and lc < 11:
        #print(cluster_mean_dist, all_clusters_mean)
        # we need to find more neighbors for this phrase until the mean intra-cluster distance is greater than the mean
        q_idx = nc['neighbor_idx']
        #print(q_idx)
        qvec = phrase_vectors_arr[q_idx].mean(axis=0)
        #print(qvec)
        distances, neighbors = nn.kneighbors([qvec], n_neighbors=10*lc)
        distances = distances[0]
        neighbors = neighbors[0]
        for knn in range(len(neighbors)):
            #print(knn, neighbors[knn])
            if neighbors[knn] not in nc['neighbor_idx']:
                #print('added')
                nc['neighbor_idx'] = np.append(nc['neighbor_idx'], neighbors[knn])
                nc['neighbor_dist'] = np.append(nc['neighbor_dist'], distances[knn])
        nc['neighbor_attr'] =  [attributes[i] for i in neighbors]
        cluster_mean_dist = nc['neighbor_dist'].mean()
        lc += 1

    new_nns.append(key_phrases.iloc[nc['neighbor_idx'].reshape(-1)]['phrase'].tolist())
    #print(nc['neighbor_idx'].shape)

mean_distance_occurrence_phrases = pd.DataFrame({
    'phrase_id': phrase_ids,
    'phrase': phrases,
    'neighbors': nns,
    'phrases_having_similar_neighbors': new_nns
})

  0%|          | 0/30231 [00:00<?, ?it/s]

In [269]:
mean_distance_occurrence_phrases.head(25)

Unnamed: 0,phrase_id,phrase,neighbors,phrases_having_similar_neighbors
0,3,stars,"[stars, replacements, presets, bolts, choices, vents, references, passengers, fingerprints, headsets]","[stars, replacements, presets, bolts, choices, vents, references, passengers, fingerprints, headsets]"
1,18,monitor,"[monitor, snap, hmdi, sleeve, displayport, psa, intercom, confuse, raspberry, manufacture]","[monitor, snap, hmdi, sleeve, displayport, psa, intercom, confuse, raspberry, manufacture]"
2,12,great,"[great, big, horrendous, large, durable, popular, enormous, fit great, dangerous, esp]","[great, big, horrendous, large, durable, popular, enormous, fit great, dangerous, esp]"
3,15,good,"[good, gorgeous, peerless, fond, special, indoor, fantastic, endless, superior, great]","[good, gorgeous, peerless, fond, special, indoor, fantastic, endless, superior, great]"
4,229,great monitor,"[spacious monitor, specific monitor, impressive monitor, terrible monitor, temporary monitor, big monitor, clever monitor, dependable monitor, sharp monitor, superior monitor, horrible monitor, in...","[great monitor, big monitor, handy monitor, different monitor, good monitor, expensive monitor, large monitor, nice monitor, real monitor, huge monitor, terrible monitor, actual monitor, impressiv..."
5,114,screen,"[screen, grass, boot, glass, bathroom, paperwork, btightness, driveway, game, calibration]","[screen, grass, boot, glass, bathroom, paperwork, btightness, driveway, game, calibration]"
6,36,price,"[price, persistence, policy, dissatisfaction, cage, voltage, tank, portability, violation, platinum]","[price, persistence, policy, dissatisfaction, cage, voltage, tank, portability, violation, platinum]"
7,1,easy,"[easy, snappy, becareful, legible, electrical, incomplete, horrible, beautiful, noticeable, visible]","[easy, snappy, becareful, legible, electrical, incomplete, horrible, beautiful, noticeable, visible]"
8,606,nice,"[nice, inky, new, mandatory, malicious, genuine, great, wonderful, big, clear nice]","[nice, inky, new, mandatory, malicious, genuine, great, wonderful, big, clear nice]"
9,99,monitors,"[prints, endorsements, ants, televisions, contents, meetings, monitors, productions, products, lines, limitations, properties, vents, directions, offices, questions, choices, peripherals, movies, ...","[monitors, movies, prints, simulations, productions, differences, terms, perks, expeditions, choices, benefits, endorsements, televisions, meetings, questions, places, lines, limitations, ants, pi..."


In [263]:
mean_distance_occurrence_phrases.to_pickle('mean_distance_occurrence_phrases.pkl')

In [257]:
mean_distance_occurrence_phrases['phrases_having_similar_neighbors'].values

array([list(['great monitor', 'big monitor', 'handy monitor', 'different monitor', 'good monitor', 'expensive monitor', 'large monitor', 'nice monitor', 'real monitor', 'huge monitor', 'terrible monitor', 'actual monitor', 'impressive monitor', 'sharp monitor', 'new monitor', 'temporary monitor', 'durable monitor', 'specific monitor', 'beautiful monitor', 'small monitor', 'remarkable monitor', 'spacious monitor', 'dependable monitor', 'clever monitor', 'heavy monitor', 'extraordinary monitor', 'superior monitor', 'competitive monitor', 'reasonable monitor', 'incredible monitor', 'fabulous monitor', 'functional monitor', 'wondrous monitor', 'horrible monitor', 'bad monitor', 'wonderful monitor', 'sick monitor', 'true monitor', 'professional monitor', 'wrong monitor', 'decent monitor', 'original monitor', 'old monitor', 'practical monitor', 'good inexpensive monitor', 'ergonomic monitor', 'modern monitor', 'inexpensive monitor', 'magnificent monitor', 'fantastic monitor', 'adjustable mon

# Phrases close to the given terms

In [260]:
laptop = 'Easy to set up', 'Quality display', 'Good color quality', 'Quality build', 'Sound quality', 'Easy to use', 'Minimal glare', 'Lightweight', 'Good viewing angles', 'Fast', 'long battery life', 'Noise level'
monitor = 'Easy to set up', 'Quality display', 'Good color quality', 'Quality build', 'Sound quality', 'Easy to use', 'Minimal glare', 'Lightweight', 'Good viewing angles', 'Fast'
headphone = 'Good sound quality', 'comfortable', 'long battery life', 'Easy to set up', 'Easy to use', 'Quality build', 'effective noise cancelling', 'Good bass', 'Call quality', 'Attractive'
mouse = 'comfortable', 'long battery life', 'Easy to set up', 'Easy to use', 'Quality build', 'Attractive', 'Noise level', 'Easy to clean'
tv ='Easy to use', 'Easy to set up', 'Sound quality', 'Fast input response', 'lightweight', 'Attractive', 'quality build', 'lacks durability', 'speed', 'craftsmanship', 'camera quality', 'remote quality'

In [119]:

def nearest_neighbor_expansion(all_clusters_mean, max_neighbors, phrase_vectors_arr, attributes, nn, qphrase):
    # initialization
    qvec = nlp(qphrase).vector
    neighbor_distances, neighbor_indices = nn.kneighbors([qvec], n_neighbors=10)
    neighbor_indices = neighbor_indices[0]
    neighbor_distances =neighbor_distances[0]
    neighbor_attributes = [attributes[i] for i in neighbor_indices]
    cluster_mean_dist = neighbor_distances.mean()
    #print(cluster_mean_dist, all_clusters_mean)
    #print(neighbor_attributes)

    lc = 1      # increase in number of neighbors (10, 101, 10) to search for nearest neighor expansion
    while cluster_mean_dist < all_clusters_mean and lc < max_neighbors/lc + 1:
        # we need to find more neighbors for this phrase until the mean intra-cluster distance is greater than the mean
        q_idx = neighbor_indices
        #print(len(q_idx))
        qvec = phrase_vectors_arr[q_idx].mean(axis=0)
        #print(qvec)
        distances, neighbors = nn.kneighbors([qvec], n_neighbors=10*lc)
        distances = distances[0]
        neighbors = neighbors[0]
        for knn in range(len(neighbors)):
            #print(knn, neighbors[knn])
            if neighbors[knn] not in neighbor_indices:
                #print('added')
                neighbor_indices = np.append(neighbor_indices, neighbors[knn])
                neighbor_distances = np.append(neighbor_distances, distances[knn])
                neighbor_attributes.append(attributes[neighbors[knn]])
        cluster_mean_dist = neighbor_distances.mean()
        lc += 1

    mean_distance_occurrence_phrases = pd.DataFrame({
        'neighbor_idx': neighbor_indices,
        'neighbor_attr': neighbor_attributes,
        'neighbor_distances': neighbor_distances,
    })
    #print(cluster_mean_dist, all_clusters_mean, lc)
    return mean_distance_occurrence_phrases


In [144]:
qphrase_neighbors = nearest_neighbor_expansion(all_clusters_mean, 100, phrase_vectors_arr, attributes, nn, 'screen')
qphrase_neighbors.sort_values('neighbor_distances')

Unnamed: 0,neighbor_idx,neighbor_attr,neighbor_distances
0,495,screen,0.0
1,30486,grass,4.048709
2,219,boot,4.113596
3,5916,glass,4.176819
4,29847,bathroom,4.244409
5,20848,paperwork,4.249258
6,10022,btightness,4.284009
7,11575,driveway,4.303878
8,2502,game,4.386589
9,2149,calibration,4.390345


In [145]:
qphrase_neighbors.merge(key_phrases, how='left', left_on='neighbor_idx', right_on='key_phrase_id').sort_values('n_reviews', ascending=False)

Unnamed: 0,neighbor_idx,neighbor_attr,neighbor_distances,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
0,495,screen,0.0,495,screen,monitor,2048,1428,620,1969,556
8,2502,game,4.386589,2502,game,monitor,70,59,11,67,54
9,2149,calibration,4.390345,2149,calibration,monitor,62,53,9,56,47
3,5916,glass,4.176819,5916,glass,monitor,21,15,6,21,20
2,219,boot,4.113596,219,boot,monitor,6,5,1,6,6
7,11575,driveway,4.303878,11575,driveway,monitor,2,2,0,2,2
1,30486,grass,4.048709,30486,grass,monitor,1,1,0,1,1
4,29847,bathroom,4.244409,29847,bathroom,monitor,1,1,0,1,1
5,20848,paperwork,4.249258,20848,paperwork,monitor,1,1,0,1,1
6,10022,btightness,4.284009,10022,btightness,monitor,1,1,0,1,1


# Select curated terms
3 step process
1. pick those phrases that generate better looking terms; note the top ranking threshold for each phrase if needed
2. pick good looking ones from priority_order list; not the top ranking threshold for each phrase if needed
3. save the manually curated phrases and their respective top similar phrases into db



# SCratch

In [268]:
for qphrase in monitor:
    qvec = nlp(qphrase).vector
    _, __n = nn.kneighbors([qvec], n_neighbors=10)
    print(__n)
    print(qphrase)
    print( [attributes[i] for i in __n[0]])

[[12020  7589 18877 21085 17309 23563 19171 19177 21558 28605]]
Easy to set up
['awesome built-in surround sound', 'go-to monitor', 'excellent widescreen hp monitor', 'menu buttons faulty', 'nice freestanding stand', 'lg support unable', 'excellent built-in webcam', 'cost effective', 'expensive af', 'ok works good']
[[ 3756 21111 10550  9048 21219 30179 20828 18312 12656 13617]]
Quality display
['great picture display', 'gtx980 strix', 'quality display', 'great quality display', 'computer display', 'resolution display', 'nice quality display', 'great screen resolution', 'wide screen display', 'color display']
[[24138 14376 28118 27779 11638  3104 21910 10059 22017  3669]]
Good color quality
['great color definition', 'poor color quality', 'poor color accuracy', 'great video quality', 'nice video quality', 'excellent color quality', 'nice color definition', 'great color production', 'poor color rendition', 'good video quality']
[[11460 13956 11194  3093 16329 18830 11510 13618 19582 165

In [278]:
qvec = nlp('durability').vector
_, __n = nn.kneighbors([qvec], n_neighbors=10)
print(__n)
print(qphrase)
print( [attributes[i] for i in __n[0]])

[[ 1799  5790   132  1164 26156 27804  1122 30006 23119  2687]]
Fast
['durability', 'violation', 'flexibility', 'portability', 'malfunction', 'continuity', 'device', 'cage', 'coloration', 'configuration']
