# In this notebook

We extend the latter half of notebook `16` for all 4 categories
- as in notebook `16`
    - we will derive a priority order of attributes
    - we will find similar attributes using nearest neighbor search
- in this notebook
    - we will investigate what the similar phrases look like for each of our categories
    - we will compute pos and neg sub-clusters 
        - we will estimate requisite probabilities
    - we will store the top few similar meaning phrases in the database to serve out using the API
    

In [1]:
import pandas as pd
import json

pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 1000

In [2]:
from sqlalchemy import create_engine
import psycopg2 
import io

In [3]:
import os
import glob

In [4]:
import pickle

In [5]:
import numpy as np

# Load attributes from database

In [6]:
conn_string = 'postgresql+psycopg2://gabbydbuser:gabbyDBpass@localhost:5432/gabbyDB'

In [7]:
db = create_engine(conn_string)
conn = db.connect()

### Getting key_phrases for product category

In [252]:
category='laptop'

In [253]:
key_phrases_query = \
    f'''
    SELECT KP.*, KS.n_reviews, KS.n_positive, KS.n_negative, KS.n_reviewers, KS.n_products
    FROM key_phrase_root KP, key_phrase_scores KS
    WHERE KP.key_phrase_id=KS.key_phrase_id
        and KP.category='{category}'
    '''
key_phrases = pd.read_sql(key_phrases_query, conn)

In [254]:
key_phrases.shape

(122630, 8)

In [255]:
key_phrases.head()

Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
0,144553,security slot,laptop,7,4,3,7,6
1,144554,stable object,laptop,1,1,0,1,1
2,144558,computer equal,laptop,1,1,0,1,1
3,144562,slot,laptop,105,77,28,96,92
4,144565,bag,laptop,5430,4233,1197,4860,1280


# Build a nearest neighbor model for phrases

In [256]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [257]:
from sklearn.neighbors import NearestNeighbors

In [258]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [259]:
# get spacy vectors for attributes

phrase_vectors = []

key_phrases['phrase'].progress_apply(lambda p: phrase_vectors.append(nlp(p).vector))

  0%|          | 0/122630 [00:00<?, ?it/s]

0         None
1         None
2         None
3         None
4         None
          ... 
122625    None
122626    None
122627    None
122628    None
122629    None
Name: phrase, Length: 122630, dtype: object

In [260]:
phrase_vectors_arr = np.vstack(phrase_vectors)

In [261]:
phrase_vectors_arr.shape

(122630, 96)

In [262]:
nn_model = NearestNeighbors()
nn_model.fit(phrase_vectors_arr)

In [263]:
q = nlp('Easy to set up').vector

In [264]:
nn_model.kneighbors([q], n_neighbors=10, return_distance=False)

array([[ 89841,  33468,  77052,  61944, 120615,  55651,  81724,  27307,
         57891,  89597]])

### checking out the NN model

In [265]:
def get_nearest_attributes(attribute, k=5):
    q = nlp(attribute).vector
    neighbors = nn_model.kneighbors([q], n_neighbors=k, return_distance=False)
    return neighbors[0]

In [266]:
qterms = ['Easy to set up', 'Quality display', 
            'Good color quality', 'Quality build', 
            'Sound quality', 
            'Minimal glare', 'Lightweight', 
            'Good viewing angles', 'Fast']

In [267]:
for q in qterms:
    print(q)
    neighbors =  get_nearest_attributes(q, 10)
    display(key_phrases.iloc[neighbors])
    

Easy to set up


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
89841,258146,macbook dust free,laptop,1,1,0,1,1
33468,156260,flap down--ideal,laptop,2,2,0,1,1
77052,207072,gb wd black,laptop,1,1,0,1,1
61944,146664,lb gaming laptop,laptop,1,1,0,1,1
120615,259154,hp laptop worth,laptop,1,0,1,1,1
55651,243902,onboard tpm chip,laptop,1,1,0,1,1
81724,225672,go-to laptop bag,laptop,1,1,0,1,1
27307,254036,excellent cherry mx blue,laptop,1,1,0,1,1
57891,252983,hp pavilion dv6 perfect,laptop,1,1,0,1,1
89597,257117,gb built-in storage,laptop,1,0,1,1,1


Quality display


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
121821,263986,large capacity pack,laptop,1,1,0,1,1
10931,188382,g930 headset,laptop,1,1,0,1,1
84569,236854,large screen display,laptop,1,1,0,1,1
122580,266995,great connection speed,laptop,1,1,0,1,1
47420,211257,quality display,laptop,2,2,0,2,2
65209,159547,computer display,laptop,1,1,0,1,1
12301,193846,large capacity drive,laptop,1,1,0,1,1
49945,221176,great performance increase,laptop,1,1,0,1,1
119107,253302,great performance value,laptop,1,1,0,1,1
91170,263584,great value pack,laptop,1,1,0,1,1


Good color quality


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
72415,188191,good color quality,laptop,1,1,0,1,1
35255,163266,great video quality,laptop,28,26,2,28,5
41533,188240,nice video quality,laptop,3,3,0,3,2
78765,213838,great product quality,laptop,1,1,0,1,1
70598,181227,terrible product quality,laptop,1,0,1,1,1
107447,206389,great color screen,laptop,1,1,0,1,1
97255,165438,good video quality,laptop,10,9,1,10,4
59394,259020,good laptop quality,laptop,1,1,0,1,1
68592,173036,poor color selection,laptop,1,1,0,1,1
70065,179073,poor video quality,laptop,3,0,3,3,3


Quality build


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
92911,147903,quality build,laptop,20,18,2,20,18
35492,164219,good quality build,laptop,13,13,0,13,13
84211,235413,computer build,laptop,1,1,0,1,1
17413,214375,converter build,laptop,1,0,1,1,1
57390,250835,quailty build,laptop,1,1,0,1,1
52954,233084,device build,laptop,1,0,1,1,1
56108,245750,poor quality build,laptop,2,0,2,2,2
105640,199146,new computer build,laptop,1,1,0,1,1
76255,203946,great value buy,laptop,1,1,0,1,1
25313,245962,amazing quality build,laptop,1,1,0,1,1


Sound quality


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
44432,199577,terrible sound quality,laptop,4,1,3,4,3
31033,146229,great sound quality,laptop,73,69,4,72,52
101885,184065,aceptable quality,laptop,1,0,1,1,1
5658,167062,plastic quality,laptop,1,1,0,1,1
45368,203212,geate quality,laptop,1,1,0,1,1
22358,234264,great quality laptop,laptop,6,6,0,6,6
19915,224375,great quality material,laptop,4,4,0,4,4
35486,164199,decent sound quality,laptop,16,15,1,16,15
121805,263937,big quality sound,laptop,1,1,0,1,1
93287,149398,excellent sound quality,laptop,18,16,2,16,15


Minimal glare


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
2403,154225,confidential work,laptop,1,1,0,1,1
77001,206889,crippling glare,laptop,1,1,0,1,1
50250,222382,minimal impact,laptop,1,0,1,1,1
36681,168804,positive feedback,laptop,4,4,0,4,4
42925,193678,positive experience,laptop,5,4,1,5,5
62620,149453,positive latch,laptop,1,0,1,1,1
105084,196969,upgradable storage,laptop,1,1,0,1,1
101811,183803,great batter replacement,laptop,1,1,0,1,1
81605,225222,great backup laptop,laptop,1,1,0,1,1
30371,266418,minimal work,laptop,1,1,0,1,1


Lightweight


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
52698,232055,m-5y10,laptop,2,2,0,1,2
68432,172377,mush,laptop,2,2,0,2,2
24163,241464,usb wifi,laptop,2,2,0,2,2
69055,174927,slimness,laptop,1,1,0,1,1
97796,167715,liar,laptop,1,0,1,1,1
63157,151455,deployment,laptop,3,3,0,3,3
33202,155098,acpro2000,laptop,6,6,0,3,1
70348,180274,150mbps,laptop,1,0,1,1,1
8178,177350,case wich,laptop,1,1,0,1,1
48743,216386,usb microsphone,laptop,1,0,1,1,1


Good viewing angles


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
44473,199780,good viewing angles,laptop,13,13,0,12,12
12619,195079,great viewing angles,laptop,3,3,0,3,3
78593,213122,terrible viewing angles,laptop,2,1,1,2,2
99030,172625,poor viewing angles,laptop,8,4,4,8,8
83241,231544,disappointing viewing angles,laptop,1,0,1,1,1
110267,217987,amazing viewing angles,laptop,1,0,1,1,1
53640,235808,horizontal viewing angles,laptop,1,1,0,1,1
102349,185915,vertical viewing angles,laptop,1,1,0,1,1
55136,241795,wide viewing angles,laptop,1,1,0,1,1
93576,150593,great looking laptops,laptop,2,2,0,1,2


Fast


Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
2298,153814,dual,laptop,18,12,6,18,18
26303,250025,fast clear,laptop,1,1,0,1,1
114947,237057,convenient fast,laptop,1,1,0,1,1
62015,146990,fast,laptop,1201,1127,74,1141,787
119664,255461,separate upper,laptop,1,1,0,1,1
112908,228678,early present,laptop,1,1,0,1,1
5307,165656,fast new,laptop,1,1,0,1,1
61069,265575,steady loud,laptop,1,1,0,1,1
98752,171534,genuine,laptop,18,12,6,17,17
114192,233940,great separate,laptop,1,1,0,1,1


# Trying with an unsupervised clustering model

NOTE: we can't use generic unsupervised clustering methods because we don't know how many clusters can be there. Instead we do the following


1. we first find a 5 or 10 nearest neighbors for each attribute
    - we then group/merge attributes together into clusters based on 
        - mean distance between neighbors across all neighbor sets
        
        
2.  we can cluster together, but also try to differentiate between positive and negative attribute clusters within the same semantic subspace
    - e.g. q = "Good color quality"
        - positive equivalent in subspace = "excellent color quality"
        - negative equivalent in subspace = "poor color quality"
    - OR:
        - THIS [just looking at counts now] --> within each cluster, we subgroup based on probability of occurrence in positive or negative reviews

## find nearest neighbors for each phrase

In [268]:
# keeping a list of attributes
attributes = key_phrases['phrase'].tolist()
key_phase_idx = key_phrases.index

In [269]:
key_phrases

Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
0,144553,security slot,laptop,7,4,3,7,6
1,144554,stable object,laptop,1,1,0,1,1
2,144558,computer equal,laptop,1,1,0,1,1
3,144562,slot,laptop,105,77,28,96,92
4,144565,bag,laptop,5430,4233,1197,4860,1280
...,...,...,...,...,...,...,...,...
122625,267162,steelseries flux headphones,laptop,1,0,1,1,1
122626,267165,normal operational temperature,laptop,1,0,1,1,1
122627,267166,good containment,laptop,1,1,0,1,1
122628,267168,big crease,laptop,1,0,1,1,1


In [270]:
import random

In [271]:
# for each phrase, for it's nearest neighbors, let's comput the inter-neighbor distance
# this is taking too long. We'll use a sample of 10000 to compute mean distance

SAMPLE_SIZE = 10000

all_clusters_mean_distance = 0 

for i in tqdm(random.sample(range(len(attributes)), SAMPLE_SIZE)):
    qvec = phrase_vectors_arr[i]
    distances, neighbors = nn_model.kneighbors([qvec], n_neighbors=10)
    all_clusters_mean_distance += distances[0].mean()

all_clusters_mean_distance /= SAMPLE_SIZE

all_clusters_mean_distance

  0%|          | 0/10000 [00:00<?, ?it/s]

2.9091489089727403

## priority order for cluster generation
- to generate clusters, we can't really go first come first serve, or random, because the clustering could then be inconsistent with reality
- we can try to start with the phrases that occur most frequently in reviews, and go down the ranked list imposed by popularity
    - this is also easier to explain

In [272]:
priority_order = key_phrases.sort_values(['n_reviewers', 'n_products'], ascending=False)['key_phrase_id'].index.tolist()

In [273]:
key_phrases.loc[priority_order[:10]]

Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
30584,144570,stars,laptop,51760,47282,4478,41423,7137
61479,144571,laptop,laptop,20804,16423,4381,18724,5210
61494,144640,great,laptop,19998,18561,1437,18242,5317
30600,144613,good,laptop,13301,11120,2181,11934,4466
30634,144733,price,laptop,7220,6352,868,6799,2919
9,144592,computer,laptop,6938,4964,1974,6435,2979
92039,144581,nice,laptop,6092,5367,725,5602,2628
92137,144954,perfect,laptop,5898,5754,144,5505,2595
4,144565,bag,laptop,5430,4233,1197,4860,1280
117,145033,battery,laptop,4902,3298,1604,4686,1617


# Phrases close to the given terms

In [274]:
qphrases = {
    'laptop': ['Easy to set up', 'Quality display', 'Good color quality', 'Quality build', 'Sound quality', 'Easy to use', 'Minimal glare', 'Lightweight', 'Good viewing angles', 'Fast', 'long battery life', 'Noise level'],
    'monitor': ['Easy to set up', 'Quality display', 'Good color quality', 'Quality build', 'Sound quality', 'Easy to use', 'Minimal glare', 'Lightweight', 'Good viewing angles', 'Fast'],
    'headphone': ['Good sound quality', 'comfortable', 'long battery life', 'Easy to set up', 'Easy to use', 'Quality build', 'effective noise cancelling', 'Good bass', 'Call quality', 'Attractive'],
    'mouse': ['comfortable', 'long battery life', 'Easy to set up', 'Easy to use', 'Quality build', 'Attractive', 'Noise level', 'Easy to clean'],
    'tv': ['Easy to use', 'Easy to set up', 'Sound quality', 'Fast input response', 'lightweight', 'Attractive', 'quality build', 'lacks durability', 'speed', 'craftsmanship', 'camera quality', 'remote quality']
}

### distance based clusters
- we add those phrases to clusters that are within the average intra-cluster distance
- THIS method is MORE efficient, and the terms seems quite nice

In [275]:

def nearest_neighbor_expansion(all_clusters_mean, max_neighbors, phrase_vectors_arr, attributes, nn, qphrase):
    # initialization
    qvec = nlp(qphrase).vector
    neighbor_distances, neighbor_indices = nn.kneighbors([qvec], n_neighbors=10)
    neighbor_indices = neighbor_indices[0]
    neighbor_distances =neighbor_distances[0]
    neighbor_attributes = [attributes[i] for i in neighbor_indices]
    cluster_mean_dist = neighbor_distances.mean()
    #print(cluster_mean_dist, all_clusters_mean)
    #print(neighbor_attributes)

    lc = 1      # increase in number of neighbors (10, 101, 10) to search for nearest neighor expansion
    while cluster_mean_dist < all_clusters_mean and lc < max_neighbors/lc + 1:
        # we need to find more neighbors for this phrase until the mean intra-cluster distance is greater than the mean
        q_idx = neighbor_indices
        #print(len(q_idx))
        qvec = phrase_vectors_arr[q_idx].mean(axis=0)
        #print(qvec)
        distances, neighbors = nn.kneighbors([qvec], n_neighbors=10*lc)
        distances = distances[0]
        neighbors = neighbors[0]
        for knn in range(len(neighbors)):
            #print(knn, neighbors[knn])
            if neighbors[knn] not in neighbor_indices:
                #print('added')
                neighbor_indices = np.append(neighbor_indices, neighbors[knn])
                neighbor_distances = np.append(neighbor_distances, distances[knn])
                neighbor_attributes.append(attributes[neighbors[knn]])
        cluster_mean_dist = neighbor_distances.mean()
        lc += 1

    mean_distance_occurrence_phrases = pd.DataFrame({
        'neighbor_idx': neighbor_indices,
        'neighbor_attr': neighbor_attributes,
        'neighbor_distances': neighbor_distances,
    })
    #print(cluster_mean_dist, all_clusters_mean, lc)
    return mean_distance_occurrence_phrases


In [276]:
qphrases[category]

['Easy to set up',
 'Quality display',
 'Good color quality',
 'Quality build',
 'Sound quality',
 'Easy to use',
 'Minimal glare',
 'Lightweight',
 'Good viewing angles',
 'Fast',
 'long battery life',
 'Noise level']

In [277]:
qphrase_neighbors = nearest_neighbor_expansion(all_clusters_mean_distance, 100, phrase_vectors_arr, attributes, nn_model, 'attractive')
qphrase_neighbors.sort_values('neighbor_distances')

Unnamed: 0,neighbor_idx,neighbor_attr,neighbor_distances
0,61848,attractive,0.0
1,6453,angry,4.002241
2,16129,arive,4.006609
3,77,effective,4.076833
4,93059,impressive,4.078627
5,12175,imperceptible,4.086038
6,101601,playable,4.116393
7,71696,superstitious,4.210647
8,72831,changeable,4.311512
9,34964,evident,4.336577


In [278]:
qphrase_neighbors['key_phrase_id'] = qphrase_neighbors['neighbor_idx'].apply(lambda _row: key_phrases.iloc[_row]['key_phrase_id'])

In [279]:
qphrase_neighbors.merge(key_phrases, how='left', left_on='key_phrase_id', right_on='key_phrase_id').sort_values('n_reviews', ascending=False)

Unnamed: 0,neighbor_idx,neighbor_attr,neighbor_distances,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
3,77,effective,4.076833,144869,effective,laptop,230,210,20,222,154
0,61848,attractive,0.0,146276,attractive,laptop,228,206,22,216,194
4,93059,impressive,4.078627,148453,impressive,laptop,90,76,14,88,86
6,101601,playable,4.116393,183000,playable,laptop,8,8,0,7,8
9,34964,evident,4.336577,162144,evident,laptop,7,5,2,7,7
1,6453,angry,4.002241,170376,angry,laptop,4,1,3,4,4
8,72831,changeable,4.311512,189947,changeable,laptop,2,2,0,2,2
2,16129,arive,4.006609,209153,arive,laptop,1,1,0,1,1
5,12175,imperceptible,4.086038,193282,imperceptible,laptop,1,1,0,1,1
7,71696,superstitious,4.210647,185404,superstitious,laptop,1,1,0,1,1


In [280]:
def get_similar_attributes(key_phrases, all_clusters_mean, max_neighbors, phrase_vectors_arr, attributes, nn_model, qphrase):
    qphrase_neighbors = nearest_neighbor_expansion(all_clusters_mean_distance, 100, phrase_vectors_arr, attributes, nn_model, qphrase)
    qphrase_neighbors['key_phrase_id'] = qphrase_neighbors['neighbor_idx'].apply(lambda _row: key_phrases.iloc[_row]['key_phrase_id'])
    qattr_nn = qphrase_neighbors.merge(key_phrases, how='left', left_on='key_phrase_id', right_on='key_phrase_id').sort_values('n_reviews', ascending=False)
    qattr_nn['qphrase'] = qphrase
    return qattr_nn 

In [281]:
get_similar_attributes(key_phrases, all_clusters_mean_distance, 100, phrase_vectors_arr, attributes, nn_model, 'attractive')

Unnamed: 0,neighbor_idx,neighbor_attr,neighbor_distances,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products,qphrase
3,77,effective,4.076833,144869,effective,laptop,230,210,20,222,154,attractive
0,61848,attractive,0.0,146276,attractive,laptop,228,206,22,216,194,attractive
4,93059,impressive,4.078627,148453,impressive,laptop,90,76,14,88,86,attractive
6,101601,playable,4.116393,183000,playable,laptop,8,8,0,7,8,attractive
9,34964,evident,4.336577,162144,evident,laptop,7,5,2,7,7,attractive
1,6453,angry,4.002241,170376,angry,laptop,4,1,3,4,4,attractive
8,72831,changeable,4.311512,189947,changeable,laptop,2,2,0,2,2,attractive
2,16129,arive,4.006609,209153,arive,laptop,1,1,0,1,1,attractive
5,12175,imperceptible,4.086038,193282,imperceptible,laptop,1,1,0,1,1,attractive
7,71696,superstitious,4.210647,185404,superstitious,laptop,1,1,0,1,1,attractive


# Select curated terms
3 step process
1. pick those phrases that generate better looking terms; note the top ranking threshold for each phrase if needed
2. pick good looking ones from priority_order list; not the top ranking threshold for each phrase if needed
3. save the manually curated phrases and their respective top similar phrases into db



### curating pre-selected attributes

In [282]:
curated_attributes = pd.DataFrame()

In [283]:
category, qphrases[category]

('laptop',
 ['Easy to set up',
  'Quality display',
  'Good color quality',
  'Quality build',
  'Sound quality',
  'Easy to use',
  'Minimal glare',
  'Lightweight',
  'Good viewing angles',
  'Fast',
  'long battery life',
  'Noise level'])

### curating priority ordered phrases

In [373]:
key_phrases.loc[priority_order[:25]] 

Unnamed: 0,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products
30584,144570,stars,laptop,51760,47282,4478,41423,7137
61479,144571,laptop,laptop,20804,16423,4381,18724,5210
61494,144640,great,laptop,19998,18561,1437,18242,5317
30600,144613,good,laptop,13301,11120,2181,11934,4466
30634,144733,price,laptop,7220,6352,868,6799,2919
9,144592,computer,laptop,6938,4964,1974,6435,2979
92039,144581,nice,laptop,6092,5367,725,5602,2628
92137,144954,perfect,laptop,5898,5754,144,5505,2595
4,144565,bag,laptop,5430,4233,1197,4860,1280
117,145033,battery,laptop,4902,3298,1604,4686,1617


### running the manual curation

In [426]:
qattr_nn = get_similar_attributes(key_phrases, all_clusters_mean_distance, 100, phrase_vectors_arr, attributes, nn_model, 'good product')
qattr_nn

Unnamed: 0,neighbor_idx,neighbor_attr,neighbor_distances,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products,qphrase
3,10,great product,1.907331,144595,great product,laptop,3980,3904,76,3558,1959,good product
0,5,good product,0.0,144569,good product,laptop,2283,2151,132,2014,1377,good product
4,84,nice product,1.923857,144888,nice product,laptop,442,417,25,406,367,good product
95,61676,good job,2.170425,145493,good job,laptop,180,162,18,173,162,good product
15,63234,good performance,1.907398,151775,good performance,laptop,111,101,10,108,107,good product
41,92341,good machine,2.079355,145716,good machine,laptop,74,58,16,72,71,good product
91,92351,good memory,2.155735,145770,good memory,laptop,72,70,2,72,36,good product
59,4691,good cable,2.062286,163229,good cable,laptop,69,66,3,69,43,good product
58,94403,good picture,2.037078,153823,good picture,laptop,56,49,7,56,24,good product
62,31126,good experience,2.096628,146622,good experience,laptop,49,41,8,45,49,good product


In [401]:
#qattr_nn.sort_values('neighbor_distances')
qattr_nn[qattr_nn['n_reviews'] > 5]

Unnamed: 0,neighbor_idx,neighbor_attr,neighbor_distances,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products,qphrase
0,92149,sound quality,0.0,145005,sound quality,laptop,597,418,179,584,237,sound quality
39,1948,sound card,2.591959,152305,sound card,laptop,40,24,16,38,30,sound quality
46,34569,sound system,2.631195,160656,sound system,laptop,30,22,8,30,30,sound quality
100,63622,laptop area,2.57862,153278,laptop area,laptop,25,17,8,24,23,sound quality
52,67878,sound bar,2.683086,170219,sound bar,laptop,25,21,4,24,19,sound quality
123,68895,good budget laptop,2.537901,174311,good budget laptop,laptop,13,10,3,13,13,sound quality
22,33722,sound volume,2.226938,157258,sound volume,laptop,9,7,2,6,7,sound quality
16,92767,sound reproduction,2.327458,147387,sound reproduction,laptop,9,6,3,9,7,sound quality
79,77937,excelent laptop,2.586962,210579,excelent laptop,laptop,8,7,1,8,8,sound quality
54,93372,laptop space,2.644377,149748,laptop space,laptop,8,8,0,7,7,sound quality


In [423]:
curated_attributes = pd.concat([curated_attributes, qattr_nn.head(3)])

In [424]:
curated_attributes.shape

(1073, 12)

In [425]:
curated_attributes.groupby('qphrase')['qphrase'].count()

qphrase
easy setup            38
easy use             102
fast working          10
good angle           128
good camera            3
good cooling           7
good screen           56
good sound           102
good speakers        101
great keyboard       102
great laptop         104
great price          111
light Weight          10
long battery life    101
minimal glare         10
screen quality        35
sound quality         53
Name: qphrase, dtype: int64

## storing the manually curated terms in the database

In [427]:
curated_attributes.head()

Unnamed: 0,neighbor_idx,neighbor_attr,neighbor_distances,key_phrase_id,phrase,category,n_reviews,n_positive,n_negative,n_reviewers,n_products,qphrase
0,95823,easy setup,0.0,159674,easy setup,laptop,88,86,2,87,59,easy setup
26,99876,easy transport,2.130146,176161,easy transport,laptop,2,1,1,2,2,easy setup
27,40288,easy transaction,2.133881,183187,easy transaction,laptop,11,11,0,10,11,easy setup
28,70770,easy storage,2.20201,181876,easy storage,laptop,4,3,1,4,4,easy setup
36,6961,easy repair,2.251794,172588,easy repair,laptop,5,5,0,5,5,easy setup


In [428]:
curated_attributes.to_sql('shortlisted_attributes', con=conn, method='multi',
                            index=False, if_exists='append')

1073

In [429]:
# API contenders
curated_attributes.sort_values('neighbor_distances').sort_values('n_reviews', ascending=False).groupby('qphrase')['phrase'].apply(list).reset_index()

Unnamed: 0,qphrase,phrase
0,easy setup,"[easy installation, easy access, easy setup, good construction, easy replacement, great video quality, easy transaction, easy application, great construction, excellent video, easy process, easy c..."
1,easy use,"[light use, heavy use, normal use, regular use, personal use, constant use, good use, moderate use, continuous use, occasional use, easy use, casual use, great use, basic use, minimal use, actual ..."
2,fast working,"[fast charging, fast typing, fast loading, fast scrolling, fast charging port, fast clicking, fast gaming, fast bshipping, fast starting, fast handling]"
3,good angle,"[good computer, good sleeve, good design, good service, good unit, good investment, protective sleeve, good company, good display, good video, good system, good angle, nice angle, perfect angle, g..."
4,good camera,"[great camera, good camera, nice camera]"
5,good cooling,"[heavy gaming, good lighting, great cooling, basic computing, great gaming, good cushioning, good cooling]"
6,good screen,"[great screen, blue screen, large screen, big screen, nice screen, glossy screen, good screen, black screen, new screen, beautiful screen, bright screen, small screen, original screen, excellent s..."
7,good sound,"[great sound, good sound, good sound quality, nice sound, great sound quality, decent sound, good charge, excellent sound, great laptop sleeve, big sound, good quality sound, nice laptop sleeve, b..."
8,good speakers,"[great speakers, good speakers, nice speakers, small speakers, external speakers, internal speakers, decent speakers, cheap speakers, inexpensive speakers, awesome speakers, terrible speakers, old..."
9,great keyboard,"[great keyboard, good keyboard, nice keyboard, external keyboard, original keyboard, new keyboard, old keyboard, excellent keyboard, great replacement keyboard, small keyboard, bad keyboard, mecha..."


# SCratch

In [268]:
for qphrase in monitor:
    qvec = nlp(qphrase).vector
    _, __n = nn.kneighbors([qvec], n_neighbors=10)
    print(__n)
    print(qphrase)
    print( [attributes[i] for i in __n[0]])

[[12020  7589 18877 21085 17309 23563 19171 19177 21558 28605]]
Easy to set up
['awesome built-in surround sound', 'go-to monitor', 'excellent widescreen hp monitor', 'menu buttons faulty', 'nice freestanding stand', 'lg support unable', 'excellent built-in webcam', 'cost effective', 'expensive af', 'ok works good']
[[ 3756 21111 10550  9048 21219 30179 20828 18312 12656 13617]]
Quality display
['great picture display', 'gtx980 strix', 'quality display', 'great quality display', 'computer display', 'resolution display', 'nice quality display', 'great screen resolution', 'wide screen display', 'color display']
[[24138 14376 28118 27779 11638  3104 21910 10059 22017  3669]]
Good color quality
['great color definition', 'poor color quality', 'poor color accuracy', 'great video quality', 'nice video quality', 'excellent color quality', 'nice color definition', 'great color production', 'poor color rendition', 'good video quality']
[[11460 13956 11194  3093 16329 18830 11510 13618 19582 165

In [278]:
qvec = nlp('durability').vector
_, __n = nn.kneighbors([qvec], n_neighbors=10)
print(__n)
print(qphrase)
print( [attributes[i] for i in __n[0]])

[[ 1799  5790   132  1164 26156 27804  1122 30006 23119  2687]]
Fast
['durability', 'violation', 'flexibility', 'portability', 'malfunction', 'continuity', 'device', 'cage', 'coloration', 'configuration']
