# In this notebook

We extend the latter half of notebook `16` for all 4 categories
- as in notebook `16`
    - we will derive a priority order of attributes
    - we will find similar attributes using nearest neighbor search
- in this notebook
    - we will investigate what the similar phrases look like for each of our categories
    - we will compute pos and neg sub-clusters 
        - we will estimate requisite probabilities
    - we will store the top few similar meaning phrases in the database to serve out using the API
    

In [1]:
import pandas as pd
import json

pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 1000

In [2]:
from sqlalchemy import create_engine
import psycopg2 
import io

In [3]:
import os
import glob

In [4]:
import pickle

In [5]:
import numpy as np

# Load attributes from database

In [6]:
conn_string = 'postgresql+psycopg2://gabbydbuser:gabbyDBpass@localhost:5432/gabbyDB'

In [7]:
db = create_engine(conn_string)
conn = db.connect()

### Getting key_phrases for product category

In [8]:
category='tv'

In [9]:
key_phrases_query = \
    f'''
    SELECT KP.*, KS.n_reviews, KS.n_positive, KS.n_negative, KS.n_reviewers, KS.n_products
    FROM key_phrase_root KP, key_phrase_scores KS
    WHERE KP.key_phrase_id=KS.key_phrase_id
        and KP.category='{category}'
    '''
key_phrases = pd.read_sql(key_phrases_query, conn)

In [10]:
key_phrases.shape

(112583, 8)

In [11]:
key_phrases.head()

Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
0,31972,mount,tv,3441,2681,760,2915,482
1,31977,nuts,tv,133,89,44,113,79
2,31980,price,tv,6881,6193,688,6213,1745
3,31981,degrees,tv,172,127,45,154,116
4,31988,bolt,tv,113,84,29,98,72


# Build a nearest neighbor model for phrases

In [12]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [13]:
from sklearn.neighbors import NearestNeighbors

In [14]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [15]:
# get spacy vectors for attributes

phrase_vectors = []

key_phrases['phrase'].progress_apply(lambda p: phrase_vectors.append(nlp(p).vector))

  0%|          | 0/112583 [00:00<?, ?it/s]

0         None
1         None
2         None
3         None
4         None
          ... 
112578    None
112579    None
112580    None
112581    None
112582    None
Name: phrase, Length: 112583, dtype: object

In [16]:
phrase_vectors_arr = np.vstack(phrase_vectors)

In [17]:
phrase_vectors_arr.shape

(112583, 96)

In [18]:
nn_model = NearestNeighbors()
nn_model.fit(phrase_vectors_arr)

In [19]:
q = nlp('Easy to set up').vector

In [20]:
nn_model.kneighbors([q], n_neighbors=10, return_distance=False)

array([[ 70745, 106598,   4032,  40542,  63316, 102373,  41103,  70089,
         80089,  59705]])

### checking out the NN model

In [24]:
def get_nearest_attributes(attribute, k=5):
    q = nlp(attribute).vector
    neighbors = nn_model.kneighbors([q], n_neighbors=k, return_distance=False)
    return neighbors[0]

In [25]:
qterms = ['Easy to set up', 'Quality display', 
            'Good color quality', 'Quality build', 
            'Sound quality', 
            'Minimal glare', 'Lightweight', 
            'Good viewing angles', 'Fast']

In [26]:
for q in qterms:
    print(q)
    neighbors =  get_nearest_attributes(q, 10)
    display(key_phrases.iloc[neighbors])
    

Easy to set up


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
70745,90165,gotta read,tv,1,1,0,1,1
106598,120533,best sounding rf headset,tv,1,1,0,1,1
4032,48161,lb load fine,tv,1,1,0,1,1
40542,81249,cheaper wd tv live,tv,1,0,1,1,1
63316,60080,hdmi cable included--be sure,tv,1,0,1,1,1
102373,103519,kdlinks built-in movie app,tv,1,1,0,1,1
41103,83497,eat-in kitchens,tv,1,1,0,1,1
70089,87473,par built-in sound,tv,1,0,1,1,1
80089,127671,free channels albany ny area,tv,1,1,0,1,1
59705,45411,gb wd greenpower hard drive,tv,2,2,0,1,1


Quality display


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
90226,55087,computer display,tv,8,8,0,7,8
28236,144343,resolution display,tv,1,0,1,1,1
41518,85108,great value pack,tv,2,2,0,2,2
41532,85176,great value bundle,tv,1,1,0,1,1
15031,91848,quality drop,tv,2,1,1,2,2
10534,73643,excellent value plasma,tv,1,1,0,1,1
97666,84835,great package deal,tv,3,3,0,3,3
20589,114200,great volume output,tv,1,1,0,1,1
99361,91407,color display,tv,2,1,1,2,2
94131,70668,information display,tv,1,0,1,1,1


Good color quality


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
8513,65935,great video quality,tv,5,5,0,5,5
67771,78218,nice video quality,tv,1,1,0,1,1
77989,119080,great product quality,tv,2,2,0,2,2
2382,41455,good video quality,tv,9,7,2,9,9
72285,96250,nice cable quality,tv,1,1,0,1,1
68898,82696,great color saturation,tv,2,1,1,1,2
110338,135466,great quality product awesome price,tv,1,1,0,1,1
111346,139562,great product value,tv,1,1,0,1,1
91613,60630,poor video quality,tv,6,1,5,6,5
35203,59838,great color balance,tv,2,2,0,2,2


Quality build


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
44505,97015,nice quality build,tv,1,1,0,1,1
58728,41662,great quality build,tv,4,4,0,3,4
924,35684,quality build,tv,16,15,1,14,15
41377,84586,good quality build,tv,6,6,0,6,6
55881,142676,poor quality build,tv,1,0,1,1,1
30309,39993,great product buy,tv,1,1,0,1,1
23638,126262,new htpc build,tv,1,0,1,1,1
5359,53305,level build,tv,1,1,0,1,1
35386,60572,high quality build,tv,3,3,0,3,3
72280,96219,cable build,tv,1,1,0,1,1


Sound quality


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
23381,125243,great product sound quality,tv,2,2,0,1,2
91222,59100,terrible sound quality,tv,5,3,2,3,5
5430,53573,great sound quality,tv,40,37,3,38,33
11940,79258,fine sound quality,tv,2,2,0,2,2
83340,140411,great material quality,tv,1,1,0,1,1
104679,112698,good output sound quality,tv,1,1,0,1,1
48305,112129,decent sound quality,tv,6,4,2,6,5
30673,41508,excellent sound quality,tv,26,23,3,24,25
59471,44461,terrible quality,tv,9,0,9,8,8
21851,119235,atrocious sound quality,tv,1,0,1,1,1


Minimal glare


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
62713,57613,positive feedback,tv,6,4,2,6,6
98018,86183,positive experience,tv,3,3,0,2,3
55414,140605,excellent dish,tv,1,1,0,1,1
62322,56068,great dish,tv,2,2,0,2,2
2490,41938,great custoemr service,tv,2,2,0,1,2
46644,105645,noticeable decline,tv,1,1,0,1,1
73696,101898,minimal lack,tv,1,1,0,1,1
83922,142683,specific space,tv,1,1,0,1,1
40904,82675,noticeable edge,tv,1,1,0,1,1
43970,95021,minimal slack,tv,1,1,0,1,1


Lightweight


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
82369,136610,tcls3800,tv,1,0,1,1,1
76068,111397,thanku,tv,1,1,0,1,1
48913,114612,vx11,tv,1,1,0,1,1
11805,78694,150mbps,tv,2,1,1,2,2
32523,49046,great ota,tv,1,1,0,1,1
106281,119224,n9z-00001,tv,1,1,0,1,1
34986,59002,netflex,tv,10,9,1,10,7
101362,99310,usb ext,tv,1,0,1,1,1
88259,47459,mirage,tv,1,1,0,1,1
76022,111207,320kbps,tv,1,0,1,1,1


Good viewing angles


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
67438,76828,good viewing angles,tv,3,2,1,3,3
28355,32219,great viewing angles,tv,7,7,0,6,6
21771,118951,terrible viewing angles,tv,1,0,1,1,1
66558,73096,poor viewing angles,tv,4,2,2,3,4
102077,102308,sharp viewing angles,tv,1,1,0,1,1
67007,75064,different viewing angles,tv,2,2,0,2,2
50733,121871,adjustable viewing angles,tv,2,2,0,2,2
94628,72716,amazing viewing angles,tv,1,1,0,1,1
78557,121323,multiple viewing angles,tv,3,3,0,3,3
59733,45523,great turning angles,tv,2,2,0,1,1


Fast


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
86726,41337,dual,tv,6,4,2,6,6
28979,34750,fast,tv,321,279,42,303,245
7943,63617,vac,tv,1,1,0,1,1
65626,69452,coax stripper,tv,1,1,0,1,1
106584,120471,coax direct,tv,1,0,1,1,1
30941,42540,heavy sid,tv,2,2,0,1,1
71660,93755,genuine,tv,2,2,0,2,2
41199,83920,great fast,tv,1,1,0,1,1
91207,59022,compact little,tv,1,0,1,1,1
108649,128663,staticky sound,tv,1,0,1,1,1


# Trying with an unsupervised clustering model

NOTE: we can't use generic unsupervised clustering methods because we don't know how many clusters can be there. Instead we do the following


1. we first find a 5 or 10 nearest neighbors for each attribute
    - we then group/merge attributes together into clusters based on 
        - mean distance between neighbors across all neighbor sets
        
        
2.  we can cluster together, but also try to differentiate between positive and negative attribute clusters within the same semantic subspace
    - e.g. q = "Good color quality"
        - positive equivalent in subspace = "excellent color quality"
        - negative equivalent in subspace = "poor color quality"
    - OR:
        - THIS [just looking at counts now] --> within each cluster, we subgroup based on probability of occurrence in positive or negative reviews

## find nearest neighbors for each phrase

In [27]:
# keeping a list of attributes
attributes = key_phrases['phrase'].tolist()
key_phase_idx = key_phrases.index

In [29]:
key_phrases

Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
0,31972,mount,tv,3441,2681,760,2915,482
1,31977,nuts,tv,133,89,44,113,79
2,31980,price,tv,6881,6193,688,6213,1745
3,31981,degrees,tv,172,127,45,154,116
4,31988,bolt,tv,113,84,29,98,72
...,...,...,...,...,...,...,...,...
112578,144527,test devices,tv,1,1,0,1,1
112579,144538,random typing,tv,1,1,0,1,1
112580,144539,cheap adhesive,tv,1,0,1,1,1
112581,144546,great gig,tv,1,1,0,1,1


In [34]:
import random

In [36]:
# for each phrase, for it's nearest neighbors, let's comput the inter-neighbor distance
# this is taking too long. We'll use a sample of 10000 to compute mean distance

SAMPLE_SIZE = 10000

all_clusters_mean_distance = 0 

for i in tqdm(random.sample(range(len(attributes)), SAMPLE_SIZE)):
    qvec = phrase_vectors_arr[i]
    distances, neighbors = nn_model.kneighbors([qvec], n_neighbors=10)
    all_clusters_mean_distance += distances[0].mean()

all_clusters_mean_distance /= SAMPLE_SIZE

all_clusters_mean_distance

  0%|          | 0/10000 [00:00<?, ?it/s]

2.9114937198519706

## priority order for cluster generation
- to generate clusters, we can't really go first come first serve, or random, because the clustering could then be inconsistent with reality
- we can try to start with the phrases that occur most frequently in reviews, and go down the ranked list imposed by popularity
    - this is also easier to explain

In [38]:
priority_order = key_phrases.sort_values(['n_reviewers', 'n_products'], ascending=False)['key_phrase_id'].index.tolist()

In [39]:
key_phrases.loc[priority_order[:10]]

Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
28293,31984,stars,tv,47669,43195,4474,36863,3845
56331,32024,great,tv,20756,19392,1364,18351,3140
84393,31997,easy,tv,11644,11200,444,10177,1915
84411,32068,good,tv,11475,9342,2133,10165,2580
2,31980,price,tv,6881,6193,688,6213,1745
56335,32049,works,tv,5247,4800,447,4796,1570
19,32059,product,tv,5174,3559,1615,4591,1741
56327,32006,wall,tv,4873,4060,813,4148,835
56679,33349,remote,tv,4305,3222,1083,4135,981
84415,32085,perfect,tv,4593,4503,90,4094,1525


# Phrases close to the given terms

In [40]:
qphrases = {
    'laptop': ['Easy to set up', 'Quality display', 'Good color quality', 'Quality build', 'Sound quality', 'Easy to use', 'Minimal glare', 'Lightweight', 'Good viewing angles', 'Fast', 'long battery life', 'Noise level'],
    'monitor': ['Easy to set up', 'Quality display', 'Good color quality', 'Quality build', 'Sound quality', 'Easy to use', 'Minimal glare', 'Lightweight', 'Good viewing angles', 'Fast'],
    'headphone': ['Good sound quality', 'comfortable', 'long battery life', 'Easy to set up', 'Easy to use', 'Quality build', 'effective noise cancelling', 'Good bass', 'Call quality', 'Attractive'],
    'mouse': ['comfortable', 'long battery life', 'Easy to set up', 'Easy to use', 'Quality build', 'Attractive', 'Noise level', 'Easy to clean'],
    'tv': ['Easy to use', 'Easy to set up', 'Sound quality', 'Fast input response', 'lightweight', 'Attractive', 'quality build', 'lacks durability', 'speed', 'craftsmanship', 'camera quality', 'remote quality']
}

### distance based clusters
- we add those phrases to clusters that are within the average intra-cluster distance
- THIS method is MORE efficient, and the terms seems quite nice

In [41]:

def nearest_neighbor_expansion(all_clusters_mean, max_neighbors, phrase_vectors_arr, attributes, nn, qphrase):
    # initialization
    qvec = nlp(qphrase).vector
    neighbor_distances, neighbor_indices = nn.kneighbors([qvec], n_neighbors=10)
    neighbor_indices = neighbor_indices[0]
    neighbor_distances =neighbor_distances[0]
    neighbor_attributes = [attributes[i] for i in neighbor_indices]
    cluster_mean_dist = neighbor_distances.mean()
    #print(cluster_mean_dist, all_clusters_mean)
    #print(neighbor_attributes)

    lc = 1      # increase in number of neighbors (10, 101, 10) to search for nearest neighor expansion
    while cluster_mean_dist < all_clusters_mean and lc < max_neighbors/lc + 1:
        # we need to find more neighbors for this phrase until the mean intra-cluster distance is greater than the mean
        q_idx = neighbor_indices
        #print(len(q_idx))
        qvec = phrase_vectors_arr[q_idx].mean(axis=0)
        #print(qvec)
        distances, neighbors = nn.kneighbors([qvec], n_neighbors=10*lc)
        distances = distances[0]
        neighbors = neighbors[0]
        for knn in range(len(neighbors)):
            #print(knn, neighbors[knn])
            if neighbors[knn] not in neighbor_indices:
                #print('added')
                neighbor_indices = np.append(neighbor_indices, neighbors[knn])
                neighbor_distances = np.append(neighbor_distances, distances[knn])
                neighbor_attributes.append(attributes[neighbors[knn]])
        cluster_mean_dist = neighbor_distances.mean()
        lc += 1

    mean_distance_occurrence_phrases = pd.DataFrame({
        'neighbor_idx': neighbor_indices,
        'neighbor_attr': neighbor_attributes,
        'neighbor_distances': neighbor_distances,
    })
    #print(cluster_mean_dist, all_clusters_mean, lc)
    return mean_distance_occurrence_phrases


In [42]:
qphrases[category]

['Easy to use',
 'Easy to set up',
 'Sound quality',
 'Fast input response',
 'lightweight',
 'Attractive',
 'quality build',
 'lacks durability',
 'speed',
 'craftsmanship',
 'camera quality',
 'remote quality']

In [44]:
qphrase_neighbors = nearest_neighbor_expansion(all_clusters_mean_distance, 100, phrase_vectors_arr, attributes, nn_model, 'attractive')
qphrase_neighbors.sort_values('neighbor_distances')

Unnamed: 0,neighbor_idx,neighbor_attr,neighbor_distances
0,28395,attractive,0.0
1,95979,unworkable,3.909154
2,38544,angry,4.002241
3,482,effective,4.076833
4,28346,impressive,4.078627
5,88145,playable,4.116393
6,3280,odoriferous,4.191433
7,71127,evident,4.336577
8,57311,generous,4.381159
9,2913,inadequate,4.40151


In [45]:
qphrase_neighbors['key_phrase_id'] = qphrase_neighbors['neighbor_idx'].apply(lambda _row: key_phrases.iloc[_row]['key_phrase_id'])

In [46]:
qphrase_neighbors.merge(key_phrases, how='left', left_on='key_phrase_id', right_on='key_phrase_id').sort_values('n_reviews', ascending=False)

Unnamed: 0,neighbor_idx,neighbor_attr,neighbor_distances,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
3,482,effective,4.076833,33958,effective,tv,202,182,20,179,136
4,28346,impressive,4.078627,32186,impressive,tv,144,130,14,134,116
0,28395,attractive,0.0,32357,attractive,tv,83,78,5,75,59
8,57311,generous,4.381159,35935,generous,tv,9,6,3,7,8
9,2913,inadequate,4.40151,43624,inadequate,tv,9,5,4,7,8
5,88145,playable,4.116393,46971,playable,tv,4,1,3,4,3
7,71127,evident,4.336577,91627,evident,tv,3,1,2,3,3
1,95979,unworkable,3.909154,78058,unworkable,tv,2,0,2,2,2
2,38544,angry,4.002241,73457,angry,tv,2,1,1,2,2
6,3280,odoriferous,4.191433,45171,odoriferous,tv,2,0,2,1,1


In [59]:
def get_similar_attributes(key_phrases, all_clusters_mean, max_neighbors, phrase_vectors_arr, attributes, nn_model, qphrase):
    qphrase_neighbors = nearest_neighbor_expansion(all_clusters_mean_distance, 100, phrase_vectors_arr, attributes, nn_model, qphrase)
    qphrase_neighbors['key_phrase_id'] = qphrase_neighbors['neighbor_idx'].apply(lambda _row: key_phrases.iloc[_row]['key_phrase_id'])
    qattr_nn = qphrase_neighbors.merge(key_phrases, how='left', left_on='key_phrase_id', right_on='key_phrase_id').sort_values('n_reviews', ascending=False)
    qattr_nn['qphrase'] = qphrase
    return qattr_nn 

In [60]:
get_similar_attributes(key_phrases, all_clusters_mean_distance, 100, phrase_vectors_arr, attributes, nn_model, 'attractive')

Unnamed: 0,neighbor_idx,neighbor_attr,neighbor_distances,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products,qphrase
3,482,effective,4.076833,33958,effective,tv,202,182,20,179,136,attractive
4,28346,impressive,4.078627,32186,impressive,tv,144,130,14,134,116,attractive
0,28395,attractive,0.0,32357,attractive,tv,83,78,5,75,59,attractive
8,57311,generous,4.381159,35935,generous,tv,9,6,3,7,8,attractive
9,2913,inadequate,4.40151,43624,inadequate,tv,9,5,4,7,8,attractive
5,88145,playable,4.116393,46971,playable,tv,4,1,3,4,3,attractive
7,71127,evident,4.336577,91627,evident,tv,3,1,2,3,3,attractive
1,95979,unworkable,3.909154,78058,unworkable,tv,2,0,2,2,2,attractive
2,38544,angry,4.002241,73457,angry,tv,2,1,1,2,2,attractive
6,3280,odoriferous,4.191433,45171,odoriferous,tv,2,0,2,1,1,attractive


# Select curated terms
3 step process
1. pick those phrases that generate better looking terms; note the top ranking threshold for each phrase if needed
2. pick good looking ones from priority_order list; not the top ranking threshold for each phrase if needed
3. save the manually curated phrases and their respective top similar phrases into db



### curating pre-selected attributes

In [69]:
curated_attributes = pd.DataFrame()

In [48]:
category, qphrases[category]

('tv',
 ['Easy to use',
  'Easy to set up',
  'Sound quality',
  'Fast input response',
  'lightweight',
  'Attractive',
  'quality build',
  'lacks durability',
  'speed',
  'craftsmanship',
  'camera quality',
  'remote quality'])

### curating priority ordered phrases

In [None]:
key_phrases.loc[priority_order[:25]]

Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
28293,31984,stars,tv,47669,43195,4474,36863,3845
56331,32024,great,tv,20756,19392,1364,18351,3140
84393,31997,easy,tv,11644,11200,444,10177,1915
84411,32068,good,tv,11475,9342,2133,10165,2580
2,31980,price,tv,6881,6193,688,6213,1745
56335,32049,works,tv,5247,4800,447,4796,1570
19,32059,product,tv,5174,3559,1615,4591,1741
56327,32006,wall,tv,4873,4060,813,4148,835
56679,33349,remote,tv,4305,3222,1083,4135,981
84415,32085,perfect,tv,4593,4503,90,4094,1525


### running the manual curation

In [226]:
qattr_nn = get_similar_attributes(key_phrases, all_clusters_mean_distance, 100, phrase_vectors_arr, attributes, nn_model, 'good cable')
qattr_nn

Unnamed: 0,neighbor_idx,neighbor_attr,neighbor_distances,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products,qphrase
66,56323,good product,2.119953,31994,good product,tv,2436,2277,159,2130,1037,good cable
87,56949,good picture,2.191208,34426,good picture,tv,491,408,83,475,252,good cable
14,56918,great cable,2.006003,34312,great cable,tv,334,332,2,332,96,good cable
0,485,good cable,1.192093e-07,33973,good cable,tv,227,219,8,223,95,good cable
21,28513,good reception,2.006405,32879,good reception,tv,211,170,41,203,98,good cable
107,49,good job,2.22274,32196,good job,tv,195,180,15,179,157,good cable
7,84921,nice cable,2.179478,34176,nice cable,tv,138,135,3,136,58,good cable
108,10,good device,2.228948,32018,good device,tv,75,63,12,69,55,good cable
27,6417,good color,2.113961,57620,good color,tv,50,45,5,48,46,good cable
18,85203,good construction,2.063715,35319,good construction,tv,42,42,0,37,35,good cable


In [197]:
#qattr_nn.sort_values('neighbor_distances')
qattr_nn[qattr_nn['n_reviews'] >=3] 

Unnamed: 0,neighbor_idx,neighbor_attr,neighbor_distances,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products,qphrase
2,31,great price,1.854035,32129,great price,tv,2140,2097,43,1930,828,good price
0,56386,good price,0.0,32237,good price,tv,1190,1123,67,1079,624,good price
13,1222,low price,1.888605,36780,low price,tv,219,196,23,190,138,good price
14,585,reasonable price,1.92073,34346,reasonable price,tv,211,206,5,196,150,good price
51,84513,excellent price,2.357419,32465,excellent price,tv,162,158,4,150,124,good price
33,85892,fair price,2.131171,38231,fair price,tv,117,114,3,107,100,good price
36,57085,cheap price,2.154353,34986,cheap price,tv,75,64,11,68,60,good price
28,30271,decent price,2.0988,39811,decent price,tv,68,60,8,59,55,good price
1,85318,nice price,1.813472,35833,nice price,tv,65,62,3,64,55,good price
5,84959,fantastic price,2.132827,34313,fantastic price,tv,42,42,0,39,35,good price


In [227]:
curated_attributes = pd.concat([curated_attributes, qattr_nn])

In [228]:
curated_attributes.shape

(736, 12)

In [229]:
curated_attributes.groupby('qphrase')['qphrase'].count()

qphrase
attractive            5
camera quality       32
easy setup           38
easy use             22
good antenna        100
good cable          112
good price           38
good quality        103
good sound          100
is durable           10
light Weight         10
quality build        21
remote operation    101
sound quality        34
works well           10
Name: qphrase, dtype: int64

## storing the manually curated terms in the database

# SCratch

In [268]:
for qphrase in monitor:
    qvec = nlp(qphrase).vector
    _, __n = nn.kneighbors([qvec], n_neighbors=10)
    print(__n)
    print(qphrase)
    print( [attributes[i] for i in __n[0]])

[[12020  7589 18877 21085 17309 23563 19171 19177 21558 28605]]
Easy to set up
['awesome built-in surround sound', 'go-to monitor', 'excellent widescreen hp monitor', 'menu buttons faulty', 'nice freestanding stand', 'lg support unable', 'excellent built-in webcam', 'cost effective', 'expensive af', 'ok works good']
[[ 3756 21111 10550  9048 21219 30179 20828 18312 12656 13617]]
Quality display
['great picture display', 'gtx980 strix', 'quality display', 'great quality display', 'computer display', 'resolution display', 'nice quality display', 'great screen resolution', 'wide screen display', 'color display']
[[24138 14376 28118 27779 11638  3104 21910 10059 22017  3669]]
Good color quality
['great color definition', 'poor color quality', 'poor color accuracy', 'great video quality', 'nice video quality', 'excellent color quality', 'nice color definition', 'great color production', 'poor color rendition', 'good video quality']
[[11460 13956 11194  3093 16329 18830 11510 13618 19582 165

In [278]:
qvec = nlp('durability').vector
_, __n = nn.kneighbors([qvec], n_neighbors=10)
print(__n)
print(qphrase)
print( [attributes[i] for i in __n[0]])

[[ 1799  5790   132  1164 26156 27804  1122 30006 23119  2687]]
Fast
['durability', 'violation', 'flexibility', 'portability', 'malfunction', 'continuity', 'device', 'cage', 'coloration', 'configuration']
