# In this notebook

- we try to find the close attributes for a given set of attributes 
- we try to cluster attributes 
    - NOTE: we can't use unsupervised clustering because we don't know how many clusters can be there 
    - we first find a 5 or 10 nearest neighbors for each attribute
        - we then group/merge attributes together into clusters based on (and/or)
            - THIS IS GOOD --> mean distance between neighbors across all neighbor sets
            - occurrence frequency of nearest neighbors
- let's get the phrases close to the chosen terms

In [100]:
import pandas as pd
import json

pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 1000

In [101]:
from sqlalchemy import create_engine
import psycopg2 
import io

In [102]:
import os
import glob

In [103]:
import pickle

In [104]:
import numpy as np

# Load reviews from database

In [127]:
from sqlalchemy import create_engine
import psycopg2 
import io


In [133]:
import pandas as pd
import json

In [134]:
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 1000

In [135]:
conn_string = 'postgresql+psycopg2://gabbydbuser:gabbyDBpass@localhost:5432/gabbyDB'

In [136]:
db = create_engine(conn_string)
conn = db.connect()

### Getting data for product category

In [137]:
monitor_reviews_query = \
    '''SELECT BR.*
        FROM baseline_reviews BR,  
        (SELECT asin
        FROM baseline_products 
        WHERE title ILIKE '%%inch%%' 
        AND title ILIKE '%%monitor%%') AS BP 
        WHERE BR.asin = BP.asin; '''
monitor_reviews = pd.read_sql(monitor_reviews_query, conn)

In [138]:
monitor_reviews.shape

(45480, 10)

# Load intermediate save files

In [105]:
save_files = sorted(glob.glob("*pkl"))

In [106]:
save_files

['cumulative_subset-000.pkl',
 'cumulative_subset-001.pkl',
 'cumulative_subset-002.pkl',
 'cumulative_subset-003.pkl',
 'cumulative_subset-004.pkl',
 'cumulative_subset-005.pkl',
 'cumulative_subset-006.pkl',
 'cumulative_subset-007.pkl',
 'cumulative_subset-008.pkl',
 'cumulative_subset-009.pkl',
 'cumulative_subset-010.pkl',
 'cumulative_subset-011.pkl',
 'cumulative_subset-012.pkl',
 'cumulative_subset-013.pkl',
 'cumulative_subset-014.pkl',
 'cumulative_subset-015.pkl',
 'cumulative_subset-016.pkl',
 'cumulative_subset-017.pkl',
 'cumulative_subset-018.pkl',
 'cumulative_subset-019.pkl',
 'cumulative_subset-020.pkl',
 'cumulative_subset-021.pkl',
 'cumulative_subset-022.pkl',
 'cumulative_subset-023.pkl',
 'cumulative_subset-024.pkl',
 'cumulative_subset-025.pkl',
 'cumulative_subset-026.pkl',
 'cumulative_subset-027.pkl',
 'cumulative_subset-028.pkl',
 'cumulative_subset-029.pkl',
 'cumulative_subset-030.pkl',
 'cumulative_subset-031.pkl',
 'cumulative_subset-032.pkl',
 'cumulati

In [107]:
# intermediate save files
key_phrases = pd.read_pickle(save_files[-1])
    

In [108]:
key_phrases.shape

(30231, 5)

## Set up the data frame

In [109]:
key_phrases['reviews'].apply(lambda r: len(r)).sum()

146199

In [139]:
key_phrases['category'] = 'Monitor'
key_phrases['n_reviewers'] = key_phrases['reviewers'].apply(lambda r: len(r))
key_phrases['n_reviews'] = key_phrases['reviews'].apply(lambda r: len(r))
key_phrases['reviewer_idf'] = np.log(monitor_reviews.shape[0]/key_phrases['n_reviewers'])



In [111]:
key_phrases['key_phrase_id'] = list(range(key_phrases.shape[0]))
key_phrases = key_phrases.reset_index().rename(columns={'index': 'phrase'})[[ 
    'key_phrase_id', 'phrase', 'reviews', 'reviewers', 'products', 'n_positive', 'n_negative', 'category', 'reviewer_idf'
]]

In [112]:
key_phrases.head()

Unnamed: 0,key_phrase_id,phrase,reviews,reviewers,products,n_positive,n_negative,category,reviewer_idf
0,0,game room,"{159566, 445310}","{A3EY8AM4N0KJ5B, A1RKQ26P6NOYLV}","{B001DKMO0A, B00877ZOYK}",2,0,Monitor,10.031881
1,1,easy,"{753664, 753665, 1118210, 1189890, 985092, 905221, 985094, 669705, 905226, 628747, 530445, 985110, 753687, 333848, 595992, 974874, 905243, 675865, 596001, 518178, 985125, 985127, 596009, 753706, 1...","{A32X6158C8EM49, A2UXCFUHERXKF3, ALYD5UDPU2HW9, A26LD9FQHTM8ZH, A2PSXXHVZPZZEB, A13KTUPQSZFV9V, ANRXH0BVEAR6B, A1EO1TJYLU9K69, A2WFQ41IFYIYO7, AR31PJQ11JVWQ, A1JYEDOC6QW17L, A3KOWZF6GKSHLV, A3634U...","{B007A1G07O, B0072LGY5K, B01BV1Q0L4, B008FC8DFY, B00EVM3JYM, B003Y3BJ7S, B013FEJQPM, B015F9J82Q, B00B5MUJDK, B00JR6GBHO, B001UHOX2I, B00KJM16KM, B00UNR0OOW, B0115ZHH68, B01B9IDLAW, B00F36Y9XU, B00...",997,54,Monitor,3.779014
2,2,good product,"{636928, 1293827, 721411, 736771, 345609, 314387, 685076, 964117, 232472, 964121, 1331228, 964125, 474653, 437797, 662053, 756263, 643112, 168489, 685098, 1129003, 473132, 594477, 685103, 177713, ...","{AWX1PY9ZZK73Z, A9EURDHZUN7ND, A354G0OW8ATW9Z, A1JYGIYL1YHRCL, A1QWD9AMTIEICG, A2AF5LFJ13HRXF, A3H7KTYLUZG2OP, A2MY9MKIQ8YIYK, A3JCGRBXUALKBX, A3KY7727H7G86H, A35HUOBGI4EGTK, A2PZY48K3G9YCU, A6OU4...","{B0085H662G, B005QEUV9K, B005DDBKEO, B01GFG3MCK, B01FXDVZ5W, B003Y73Q4S, B00EIRFYS4, B000M7YZHU, B00HY7PAUC, B00C8T5KOW, B003Y3BJ7S, B00P94RDKM, B01CNJVG8K, B005J7YFLY, B00B17C5KO, B00B5MUJDK, B00...",284,16,Monitor,5.058601
3,3,stars,"{360448, 360451, 360455, 753675, 360475, 360477, 1343518, 753694, 360478, 753697, 360479, 753700, 753702, 753709, 753711, 753717, 753718, 360507, 360508, 753728, 753732, 753739, 360532, 753749, 36...","{A25JUTALO1OXZ4, A3PU4I3Z7PSBIV, A2NNTY0IT1MTGV, A17U6UVJZBJ6IX, A3DT7S1RUNI1G3, A280EBTOBXQAUQ, ABBBZNUHJQ7QT, A2WNV41W2TSJKQ, A3UUMYOYOLFG6W, A1LJW5F04F9EEF, A2E8QQKVK0G3NS, A3CXT750IHV5JJ, A27Z...","{B0085H662G, B00C18YRSU, B007A1G07O, B0072LGY5K, B01GNMBC4G, B01BV1Q0L4, B00ND1KUSA, B00C17FS52, B00C18YT9M, B008FC8DFY, B00EVM3JYM, B003Y3BJ7S, B013FEJQPM, B00P94RDKM, B015F9J82Q, B00B5MUJDK, B00...",6736,509,Monitor,1.904476
4,4,thanks,"{1128961, 1044490, 685073, 1287702, 1320477, 596004, 713766, 406569, 951339, 610860, 1371179, 345646, 643128, 887354, 1177151, 234048, 1306690, 1271881, 826968, 1326168, 485471, 473702, 1223791, 3...","{A2D2VVFXLW5ODE, A3QIQ87YYOBLF, A35UO8G1Y5J0RU, ANOXNGO4P4X5I, A3NEND2USUFLZI, A3H8MG40PBZ779, A30DG3X14Z419O, A3OJO8QR5AFG5W, A38IF5RFM9LBC6, A3UUMYOYOLFG6W, A1B2GTF2UFK1GM, A200NDNPRH2YUI, A29GI...","{B00522X8B4, B00LJVMOC6, B0072LGY5K, B00C17FS52, B0036C7IMM, B006TAOH9K, B003Y3BJ7S, B005J7YFLY, B00B17C5KO, B00JR6GBHO, B00E26IRA8, B00O23HKFO, B00E7LVBCE, B015WCV70W, B01H8EB3EM, B0036ISPSW, B00...",132,6,Monitor,5.842226


In [113]:
key_phrases[key_phrases['phrase'] == 'image']

Unnamed: 0,key_phrase_id,phrase,reviews,reviewers,products,n_positive,n_negative,category,reviewer_idf
871,871,image,"{594946, 1291269, 316935, 428552, 866824, 594955, 178192, 594962, 1047058, 1047060, 1128986, 964122, 1224219, 234019, 265256, 1223725, 889390, 234032, 361008, 747568, 662073, 826940, 437821, 26528...","{A1XYQFW2WY6AWN, A28ISUIFUMLQD6, A2IURNDS90OQ6O, A39JVI19WKKMVX, A1FN3UROHHW18T, A3V08V821QXL10, A3AVXHU880YWQY, AI8EMB0HUC6Z5, A1OFNT4EU546HC, A1ZUSBK1VCDF0F, A3DVKD22BQ7Z2C, A2992B6ABUQCK, A2ZGM...","{B0085H662G, B00P0EQD1Q, B00BJSB790, B00LJVMOC6, B01GFG3MCK, B01FXDVZ5W, B01C3BZIIC, B002DHS398, B00C8T5KOW, B004TPTX0U, B00EVM3JYM, B00DBMIDTO, B003Y3BJ7S, B01CNJVG8K, B00GJK57V2, B00B17C5KO, B00...",156,72,Monitor,5.313382


# Build a nearest neighbor model for phrases

In [114]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [115]:
from sklearn.neighbors import NearestNeighbors

In [116]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [117]:
# get spacy vectors for attributes

phrase_vectors = []

key_phrases['phrase'].progress_apply(lambda p: phrase_vectors.append(nlp(p).vector))

  0%|          | 0/30231 [00:00<?, ?it/s]

0        None
1        None
2        None
3        None
4        None
         ... 
30226    None
30227    None
30228    None
30229    None
30230    None
Name: phrase, Length: 30231, dtype: object

In [118]:
#phrase_vectors[0]

In [119]:
phrase_vectors_arr = np.vstack(phrase_vectors)

In [120]:
phrase_vectors_arr.shape

(30231, 96)

In [121]:
nn = NearestNeighbors()
nn.fit(phrase_vectors_arr)

In [122]:
q = nlp('Easy to set up').vector

In [123]:
nn.kneighbors([q], n_neighbors=5, return_distance=False)

array([[12020,  7589, 18877, 21085, 17309]])

In [124]:
def get_nearest_attributes(attribute, k=5):
    q = nlp(attribute).vector
    neighbors = nn.kneighbors([q], n_neighbors=k, return_distance=False)
    return neighbors[0]

In [125]:
qterms = ['Easy to set up', 'Quality display', 
            'Good color quality', 'Quality build', 
            'Sound quality', 
            'Minimal glare', 'Lightweight', 
            'Good viewing angles', 'Fast']

In [140]:
for q in qterms:
    print(q)
    neighbors =  get_nearest_attributes(q, 10)
    display(key_phrases.iloc[neighbors])
    

Easy to set up


Unnamed: 0,key_phrase_id,phrase,reviews,reviewers,products,n_positive,n_negative,category,reviewer_idf,n_reviewers,n_reviews
12020,12020,awesome built-in surround sound,{525615},{ACR2YV19UF6DL},{B00B17C5KO},1,0,Monitor,10.725028,1,1
7589,7589,go-to monitor,{357139},{A13KPH9NIIX77Z},{B005JN9310},1,0,Monitor,10.725028,1,1
18877,18877,excellent widescreen hp monitor,{776925},{A2YX95ZYWRKZ7Y},{B00JR6GCZA},1,0,Monitor,10.725028,1,1
21085,21085,menu buttons faulty,{890232},{AP6859O19D0AE},{B00O0Z5682},0,1,Monitor,10.725028,1,1
17309,17309,nice freestanding stand,{733626},{A1UB9O698427QA},{B00HUWEXE0},1,0,Monitor,10.725028,1,1
23563,23563,lg support unable,{994577},{A2L28CAIUAT503},{B00V8FAWC2},0,1,Monitor,10.725028,1,1
19171,19171,excellent built-in webcam,{791985},{AGVWTYW0ULXHT},{B00K9NN1VS},1,0,Monitor,10.725028,1,1
19177,19177,cost effective,"{1118329, 791994, 908066}","{A31AP098RWGIS5, A3MHYJPA6VCOW, A3KLXUPCSDCSUN}","{B00K9NN1VS, B015FP25V6, B00PC9HFO8}",3,0,Monitor,9.626416,3,3
21558,21558,expensive af,{902720},{A152B3EPXZ0XRI},{B00P0EQD1Q},1,0,Monitor,10.725028,1,1
28605,28605,ok works good,{1300360},{A1LQ86BUYOULPT},{B003Y73Q4S},0,1,Monitor,10.725028,1,1


Quality display


Unnamed: 0,key_phrase_id,phrase,reviews,reviewers,products,n_positive,n_negative,category,reviewer_idf,n_reviewers,n_reviews
3756,3756,great picture display,{246659},{A19LPHM3EM4V4B},{B003Y3BJ7S},1,0,Monitor,10.725028,1,1
21111,21111,gtx980 strix,{890332},{A2F4S2AH8BWL8Z},{B00O1B5M9I},1,0,Monitor,10.725028,1,1
10550,10550,quality display,"{602055, 594955, 473679, 1128533, 474264, 669406}","{A1OK2HVJ5ZJPE3, AUHUET9V4KP47, A3TKUSZEY953C9, A4LN4OEBX8ECN, AYUJ649P6ERZC, AWZJBY3B84VEP}","{B00EZSUWFG, B0098Y77U0, B015WCV70W, B00CLZ047Q, B00CTODKIO}",5,1,Monitor,8.933268,6,6
9048,9048,great quality display,"{428050, 902595, 1038468, 902654}","{A11TDADLWY05NL, A334OGRMSEH0F8, A1NTFAUBGC8PK8, A3S6HBEM94BPAK}","{B00YD3DBOC, B00P0EOX1S, B007HSKSMI, B00P0EQD1Q}",2,2,Monitor,9.338734,4,4
21219,21219,computer display,"{898313, 908355}","{AFXRO7K3ZZ0PX, A13DIFDALMVDVL}","{B00OL236PY, B00PC9HFO8}",1,1,Monitor,10.031881,2,2
30179,30179,resolution display,{1386819},{AHIXND07NPZS1},{B01FHDL1DE},0,1,Monitor,10.725028,1,1
20828,20828,nice quality display,{889324},{A2HEPRABADETIB},{B00NZTKOQI},1,0,Monitor,10.725028,1,1
18312,18312,great screen resolution,{758045},{A4LXXXH6TJNCV},{B00ITORMNM},1,0,Monitor,10.725028,1,1
12656,12656,wide screen display,{558561},{A1199X0O47CEXP},{B00BNI36WM},1,0,Monitor,10.725028,1,1
13617,13617,color display,"{714432, 889458, 594894}","{A1G9GDHYULWSVJ, A1W3Y69CLFUP7I, A2095AB29SVAKG}","{B00GVE7QEC, B00NZTKOQI, B00CLZ047Q}",2,1,Monitor,9.626416,3,3


Good color quality


Unnamed: 0,key_phrase_id,phrase,reviews,reviewers,products,n_positive,n_negative,category,reviewer_idf,n_reviewers,n_reviews
24138,24138,great color definition,{1033520},{A3CG788F217SCT},{B00Y09G6JG},0,1,Monitor,10.725028,1,1
14376,14376,poor color quality,"{610788, 878334}","{A1O4I35CT37CLP, A23CEMSHC8BGX0}","{B00D601UC8, B00NM76PXY}",0,2,Monitor,10.031881,2,2
28118,28118,poor color accuracy,{1287255},{A2J9NEE5C5D2AU},{B000KI9W74},1,0,Monitor,10.725028,1,1
27779,27779,great video quality,{1271743},{A1M7B5GWGFPO07},{B01GNMBC4G},1,0,Monitor,10.725028,1,1
11638,11638,nice video quality,"{1044491, 519421}","{A3DLWUFMU98LNC, A137TG4ZIO1WM8}","{B00AV4NXFY, B00Z0UX93U}",2,0,Monitor,10.031881,2,2
3104,3104,excellent color quality,{234094},{AHKMCPVQXC26G},{B003FP7OYM},1,0,Monitor,10.725028,1,1
21910,21910,nice color definition,{908399},{ARZD3HH4J23EV},{B00PC9HFO8},1,0,Monitor,10.725028,1,1
10059,10059,great color production,{445350},{A3SFWGLEG3EDAU},{B00877ZOYK},1,0,Monitor,10.725028,1,1
22017,22017,poor color rendition,"{908148, 1128884}","{AEK4J0ZJDA7QG, A2JJCARFFZNA63}","{B015WCV70W, B00PC9HFO8}",0,2,Monitor,10.031881,2,2
3669,3669,good video quality,"{1044425, 323050, 246526, 473879, 379902}","{A1SPHUM1ISSPQ5, A2WB4823XOBRFA, A2PGWNO92A9RIG, AR8LWDT6TUKNZ, A2ORUUKCLEKYMK}","{B00Z0UX93U, B0098Y77U0, B0062FA55M, B005680BX2, B003Y3BJ7S}",4,1,Monitor,9.11559,5,5


Quality build


Unnamed: 0,key_phrase_id,phrase,reviews,reviewers,products,n_positive,n_negative,category,reviewer_idf,n_reviewers,n_reviews
11460,11460,quality build,"{497090, 974802, 1223873, 985014}","{AQGHVBGWW1YYG, A1U7IINCWT3CQF, A3J031UXHGZTWV, A23O9OBCZFH1W}","{B00TR0518U, B009SRSUIK, B01CX26VNC, B00UNR0OOW}",4,0,Monitor,9.338734,4,4
13956,13956,good quality build,{595955},{ACI0RCKFUE17Z},{B00CMKOVMO},1,0,Monitor,10.725028,1,1
11194,11194,new system build,"{488456, 519461}","{A19TDQ9HVYEDJP, A14AXB2EY4MK4X}","{B00AV4NXFY, B009H0XQQY}",2,0,Monitor,10.031881,2,2
3093,3093,pc build,"{234076, 776823}","{AIJFD1W742XV9, AR6Y9PM0M0UC}","{B003FP7OYM, B00JR6GCZA}",2,0,Monitor,10.031881,2,2
16329,16329,mst setting,{713885},{A29GXCZNXW1L63},{B00GTV05XG},1,0,Monitor,10.725028,1,1
18830,18830,gaming build,{776884},{A3RG3S43T1TTH4},{B00JR6GCZA},1,0,Monitor,10.725028,1,1
11510,11510,nice build,{503776},{ARWR2O48OIGKH},{B00A7OZ49G},0,1,Monitor,10.725028,1,1
13618,13618,new desktop build,{594895},{A1FLZXCN4B8UME},{B00CLZ047Q},0,1,Monitor,10.725028,1,1
19582,19582,grandma pc build,{839288},{A9E3PE4NM9QSI},{B00M1C48OE},1,0,Monitor,10.725028,1,1
16523,16523,dust cover,{714661},{A2WQJ696TY4UIQ},{B00GWFNMJS},1,0,Monitor,10.725028,1,1


Sound quality


Unnamed: 0,key_phrase_id,phrase,reviews,reviewers,products,n_positive,n_negative,category,reviewer_idf,n_reviewers,n_reviews
6093,6093,great sound quality,"{881036, 314214}","{A2HTSIGSS5292W, A2ALNHJUOMXC1T}","{B00NNQGHXC, B0051GN8JI}",2,0,Monitor,10.031881,2,2
28686,28686,sharpness quality,{1304018},{A1108290OI2D06},{B004R5SLOG},1,0,Monitor,10.725028,1,1
13207,13207,terrible quality,"{669091, 1386724, 578110}","{A2WB7LZ595CR50, A2DFPYGAO2V2CT, A3FQ8IP12LG8IJ}","{B00EZSUWFG, B01FF8PZA6, B00C8T5KOW}",1,2,Monitor,9.626416,3,3
27779,27779,great video quality,{1271743},{A1M7B5GWGFPO07},{B01GNMBC4G},1,0,Monitor,10.725028,1,1
1016,1016,great picture quality,"{1177601, 437893, 753798, 233992, 713737, 1307912, 669559, 1129490, 474515, 594708, 307862, 890523, 1200034, 858531, 611237, 839335, 472956, 747317, 812474, 997052, 669121, 178630, 847431, 747337,...","{A2J8289OOT5DE4, A39DM4JPIKVO62, A76FCQE9J052A, A3GCUP0G6E5JB, A34TKQQPCALJ5U, A16K7MFBX73W0L, AZQDVTN8OOKJF, A213WDNH89OGCS, A37LIOJSG9H8UV, A1DL1E32MQF2WH, A3HIR3RPRFCEA4, A2VJJ2POCLMED7, AXDQZ3...","{B005DDBKEO, B007SLDF7O, B00EZSUWFG, B00MSOND8C, B00HY7PAUC, B00GTV05XG, B003Y3BJ7S, B00M9B3XN4, B00PC9HFO8, B00KRA5S00, B007M4UUF2, B00O23HKFO, B015WCV70W, B00P0EOX1S, B00VHKIB2G, B001R0MLCM, B00...",48,4,Monitor,6.773784,52,52
5956,5956,good sound quality,"{1044474, 314364}","{A1F6XC3UU6MMZ5, A38CJWQCJGQIRB}","{B0051GN8JI, B00Z0UX93U}",2,0,Monitor,10.031881,2,2
29118,29118,great sound bar,{1323633},{AL95QZZ2WHOSJ},{B00AYA7NBA},1,0,Monitor,10.725028,1,1
14382,14382,bad sound quality,"{610801, 890487}","{A78BD4L1BAZKF, A10C5MNWRL6F5A}","{B00D601UC8, B00O23HKFO}",0,2,Monitor,10.031881,2,2
26221,26221,awesome video quality,{1201188},{API854T6V75RO},{B01BHC7UTI},1,0,Monitor,10.725028,1,1
11895,11895,great screen quality,"{847522, 525420, 747278, 747375}","{AGJTLP0PGJV4N, A3DWEZE8F04GIO, AA26I4RAO2CKX, A2ZPS6L6J8JJES}","{B00IEZGYI0, B00IEZGY2Q, B00B17C5KO, B00M9B3S0W}",4,0,Monitor,9.338734,4,4


Minimal glare


Unnamed: 0,key_phrase_id,phrase,reviews,reviewers,products,n_positive,n_negative,category,reviewer_idf,n_reviewers,n_reviews
6888,6888,minimal glare,{334868},{A2L9EWLUF49S2A},{B005CFLMNC},1,0,Monitor,10.725028,1,1
21712,21712,noticeable impact,{902849},{A3F1PVLDPJ28P1},{B00P0EQD1Q},1,0,Monitor,10.725028,1,1
27166,27166,positive experience,{1229563},{A1OL4EZNR5G13Y},{B01D9T3X3Y},1,0,Monitor,10.725028,1,1
7745,7745,minimal work,{360598},{A3INFMKPU6ZU3F},{B005LJWJSG},1,0,Monitor,10.725028,1,1
28053,28053,specific space,{1285315},{A1XL2VIIO90WFT},{B000BMBUAQ},1,0,Monitor,10.725028,1,1
18931,18931,noticeable strain,{776964},{A23MD057E4XEGN},{B00JR6GBHO},1,0,Monitor,10.725028,1,1
11772,11772,great glossy monitor,{524160},{A23X5ZCBVBX9WP},{B00AZMLIDQ},1,0,Monitor,10.725028,1,1
23786,23786,actual assembly,{1007623},{A11HADRQEIXRZM},{B00W58D4KI},0,1,Monitor,10.725028,1,1
3114,3114,professional work,"{232601, 1235860}","{AIFLY2HF8NS8U, A3VYUILRQ9WS5D}","{B01DPFJ4JO, B003D1CFHY}",2,0,Monitor,10.031881,2,2
29077,29077,magnificent view,{1320482},{AEYFTGZG9RYRB},{B009H0XQRI},1,0,Monitor,10.725028,1,1


Lightweight


Unnamed: 0,key_phrase_id,phrase,reviews,reviewers,products,n_positive,n_negative,category,reviewer_idf,n_reviewers,n_reviews
7689,7689,s220hql,{360322},{A3LOXFGW09MH55},{B005LJWJSG},1,0,Monitor,10.725028,1,1
28035,28035,deployment,{1286171},{A1DLIJ331321TM},{B000FG8KP6},1,0,Monitor,10.725028,1,1
10246,10246,g206hql,{473042},{A95A9XOSHTQKX},{B0098Y77U0},0,1,Monitor,10.725028,1,1
20302,20302,monitory,"{1129187, 878038}","{A3CFMOASPQKM18, A30OG8T7G0FPL8}","{B015WCV70W, B00NM76PXY}",2,0,Monitor,10.031881,2,2
1349,1349,dad,"{572549, 611213, 180109, 611220, 1128860, 1346845, 394016, 1269030, 305580, 628793, 399418, 474559, 669386, 1329359, 758356, 948447, 762602, 438005, 1338493}","{A1WPJHA0XNQ6OY, A2TJZMD0SOCI3B, A22EIED58B1MMB, A1N5FT5HXSCPKF, A3CV2VA0B2ZWD3, A19G3E44NZUDKE, A8QJ0NPQBXCEK, A1ZQTXMVSYLIZO, A39L2TPFYEHNXX, A3N6DYPWCBXMRK, A3TH1WKKMXA0JY, A22R3OHOHT42C3, A1Q1...","{B00DM65EH8, B00C18YQ5Y, B00DBVPOZG, B01GFW9S7C, B00EZSUWFG, B007SLDF7O, B006JY4TQW, B00HEZULRQ, B0098Y77U0, B015WCV70W, B00SFB13UC, B00L3642TU, B001RJU4TA, B00J0ER9K6, B004XNLD8I, B006RV8RKG, B00...",17,2,Monitor,7.780589,19,19
28730,28730,usb male,{1305892},{A3HBY2YN08FXZV},{B0050X2YZQ},0,1,Monitor,10.725028,1,1
10076,10076,motherboard,"{445381, 1380805, 1256197, 643174, 762576, 940991}","{A2T689YVOAYGGD, AUQ1Z9SUPMKUW, A35BW47D42NMZF, A2OQVKCMKLR8SE, A3REZHFYFMMWH2, A1Q4W56KWMSLY2}","{B00RORBPEW, B01FHDL1DE, B00E26IRA8, B00877ZOYK, B00J0ER9K6, B01B9IDL4I}",4,2,Monitor,8.933268,6,6
978,978,lo traje,{178448},{A36J3K929QDJF2},{B001QXDSS6},1,0,Monitor,10.725028,1,1
25743,25743,tripod mound,{1179038},{A2RXPNY4CIADJR},{B019K3T5CQ},1,0,Monitor,10.725028,1,1
3611,3611,refurb,"{595868, 243988, 812486}","{A2QADG2WPXEKOV, A145P764RX7Q6V, A3EVZULC2B71VI}","{B003UT2C4U, B00KYCSRSG, B00CMKOVMO}",0,3,Monitor,9.626416,3,3


Good viewing angles


Unnamed: 0,key_phrase_id,phrase,reviews,reviewers,products,n_positive,n_negative,category,reviewer_idf,n_reviewers,n_reviews
15604,15604,good viewing angles,"{1128943, 789744, 669234, 1336762, 1129342}","{A31LFXM4EXH6AB, A2SGKCZ3863LZE, A3101HTK9BTOF8, A2DV6U3SEAZ70R, A2W9GX82SLKROQ}","{B015WCV70W, B00EZSUWFG, B00K6E8ACU, B00GMG3E1S}",5,0,Monitor,9.11559,5,5
12129,12129,great viewing angles,"{529600, 1204416, 1303236, 659887, 1203823, 747383, 1340664, 1326170}","{A1ACP068WFRJZU, A10X9VFDB4MORV, A303SNZ4AR7TDQ, A1Y9FHOPO2UPDN, A1WJ3P43SZUNDM, A1C8X07A8N0P9W, A3QCK8PCP445PY, AZ515FFZ7I2P7}","{B00C18YQ8Q, B00EMB4KVI, B00IEZGYI0, B01BUZHLBY, B00B332A9C, B01BV1X9DG, B004KM4AQY}",8,0,Monitor,8.645586,8,8
7642,7642,nice viewing angles,{361089},{A3IX3UHYCSN6H1},{B005LJWJSG},1,0,Monitor,10.725028,1,1
10774,10774,terrible viewing angles,"{594465, 474548, 578172, 474439}","{A1PGQHQ7B16F3N, AVQREBUMLU2P2, A29SG7NS2B9HHM, A2K13BULPRZXP5}","{B00CLZ047Q, B0098Y77U0, B00C8T5KOW}",0,4,Monitor,9.338734,4,4
9759,9759,poor viewing angles,"{473376, 898917, 473991, 1350128, 437747, 473908, 643093, 473240, 643196}","{A35ETL6NWKF6JX, A39D65NKRV0RUW, A1LZ31JAHIZU8I, A2HPPS70SHTKQP, AU54DC6T57YRB, A158W41CL9KSNO, A305RNENM2DAVK, A2P6P7JKEH6LWS, A1YDQ2CCRB4OPG}","{B00OOJPAGW, B00E26IRA8, B007SLDF7O, B0098Y77U0, B00MUT6SLE}",3,6,Monitor,8.527803,9,9
10640,10640,abysmal viewing angles,{474147},{A32736C16HKJTK},{B0098Y77U0},1,0,Monitor,10.725028,1,1
20591,20591,superior viewing angles,{878360},{ASFW4ZMNZJKDA},{B00NM76PXY},1,0,Monitor,10.725028,1,1
3088,3088,different viewing angles,"{723092, 232598}","{A2RK04OMCN2HXO, A2DS3K1Z69MGO0}","{B00HALPPM0, B003D1CFHY}",2,0,Monitor,10.031881,2,2
25179,25179,extreme viewing angles,{1128428},{AY6Y7BOK2V43I},{B015WCV70W},1,0,Monitor,10.725028,1,1
14628,14628,decent viewing angles,"{628697, 753441}","{A31FVJHJ0FJO6A, A2LI1G90JQPB0W}","{B00DM65EH8, B00IKDFL4O}",1,1,Monitor,10.031881,2,2


Fast


Unnamed: 0,key_phrase_id,phrase,reviews,reviewers,products,n_positive,n_negative,category,reviewer_idf,n_reviewers,n_reviews
1517,1517,dual,"{1203972, 747269, 530445, 713749, 352150, 1054234, 675870, 577066, 1201849, 1044409, 758209, 758346, 994649, 995676, 530399, 1204576, 552418, 578146, 974818, 578288, 185585}","{ADG3SES3OGVH6, A187A91P19VK7J, A1ADVQ0HX87XG1, A27S9U0N50Q3XY, AAK7M9P1GEYWV, A1GLN276J5654G, A32P7YK0YK4IDA, AHZRL5LZ95PEA, A1Y8PUAB1S3Z0V, A2RJ2NDW1OSEX1, A39HI6BHHM0IV0, AHSTHL3HJ3XAS, A5W5QZH...","{B001UHOX2S, B00VBNQJKU, B00IEZGYI0, B00BJSB790, B00C78QS68, B00TR0570W, B00Z0UX93U, B01BV1X9DG, B00B3YQG4Q, B00GTV05XG, B01BMB3ANE, B01BV1XB2K, B00C8T5KOW, B00ITORMDC, B00V8FAWC2, B005HPSFWI, B01...",19,2,Monitor,7.680506,21,21
1069,1069,fast,"{669057, 572546, 720135, 333709, 428312, 177691, 357019, 1149214, 612895, 488356, 452005, 394536, 756264, 1002154, 1148072, 1280939, 1328301, 1272493, 643119, 602032, 908465, 905266, 1298860, 7538...","{A2OET9J7JFC8EU, AN8ZHUYIXLN8H, A84RO0M8BIIR5, A1VX8O2E41ZXHR, A10KK4JG3UGTJB, A3IVZ50K0KXC4L, A1QEMA87PYWBDB, A2SURI17Z9D3XP, ATQVLZOBO68E9, AV2RJE8ON6SMS, A1AL5T9C08J1CY, A3N9MFESVZOH1M, A3A9BIW...","{B01HI829TG, B00EZSUWFG, B00CU65K7U, B00MSOND8C, B00HY7PAUC, B00GOBD2WW, B00NM76PXY, B00C8T5KOW, B0173PEX34, B005LJWJSG, B00PC9HFO8, B004G8QSTO, B017B80U3U, B003D59FFA, B005J7YFLY, B00E26IRA8, B00...",53,0,Monitor,6.754736,53,53
28313,28313,genuine,{1294941},{A19X58X7MRRE51},{B001UHOX2S},1,0,Monitor,10.725028,1,1
23080,23080,fast inexpensive,{974729},{AM3R8PW5WQ7UB},{B00TR05716},0,1,Monitor,10.725028,1,1
7542,7542,little slow,"{357024, 720217, 720419, 908256}","{AF73Y1H82WC2Z, A16NQ96IJ3XCPG, A3KWQFYMCOZXQM, A13DDGR7UWYESJ}","{B005J7YFLY, B00H4S9J94, B00PC9HFO8}",4,0,Monitor,9.338734,4,4
5597,5597,real,"{740736, 428385, 1297857, 445190, 322988, 897869, 1128784, 1199860, 1308023, 747576, 306300, 1204446}","{A1R22Y40DSQ72L, A19CVE8KBXWBLN, A27181AF6YYMRF, A2URW6A399O6WY, A14G1QO3CDAIF5, A2JDGE652C333T, A1ZCC4FCBOUIFW, A1FMIQSZ9RYXW6, ASVLUXZRRX4JN, A22R8A0PW7LSUA, A12Z6YIRORLS2Q, AUMCI5D3FRAVF}","{B01B9IDLAW, B00OKSEVTY, B007HSKSMI, B00IEZGWI2, B00877ZOYK, B00I5FRGWC, B015WCV70W, B01BV1X9DG, B005DP9QHA, B004Y3FE6O, B005680BX2, B002ZT0784}",8,4,Monitor,8.240121,12,12
21734,21734,macbook pro late,{902880},{A27UW7L8SCDUPD},{B00P0EQD1Q},1,0,Monitor,10.725028,1,1
9577,9577,separate hdmi,{428633},{A2L9I233YA2LDM},{B007ILEHNU},1,0,Monitor,10.725028,1,1
2104,2104,long,"{1300096, 777985, 1039239, 280457, 487945, 406412, 408078, 685203, 907027, 596632, 806174, 213667, 1372453, 530350, 596654, 1078962, 720569, 1048913, 777045, 878169, 1098970, 633568, 652387, 95779...","{A1OLDCEB1ACEWM, A1RQWOYC6CFDVR, A2JJSOXYODDKK7, A216VDO5BCPX4X, A2BZAXY8F5GIEU, AZHU9PUMK3MJS, AIS6978KQ3UMJ, AHP46VAR2Z046, A3OX9E5Q99XRG2, AFZ5SZ3K3GAW9, A2Z2S20ZV7SJTX, A7472ORITX91P, A2ZKEPAT...","{B0127J7NWE, B00P8N7N0Y, B0071CVTAK, B00EE7EXVW, B00NM76PXY, B009GKIC8M, B015F9J82Q, B004G8QSTO, B01461MDXA, B00YFRNFY2, B00DQ7SAA6, B002R0JJY4, B00ZKFRKIU, B00B3YQG4Q, B00ST0RO6G, B003UT2C4U, B00...",25,3,Monitor,7.392823,28,28
24318,24318,little pricey,{1038527},{AKUKZI8OWEHE6},{B00YD3DBOC},1,0,Monitor,10.725028,1,1


# Trying with an unsupervised clustering model

NOTE: we can't use unsupervised clustering because we don't know how many clusters can be there 
- we first find a 5 or 10 nearest neighbors for each attribute
    - we then group/merge attributes together into clusters based on (and/or)
        - mean distance between neighbors across all neighbor sets
        - occurrence frequency of nearest neighbors
        
- we can cluster together, but also try to differentiate between positive and negative attribute clusters within the same semantic subspace
    - e.g. q = "Good color quality"
        - positive equivalent in subspace = "excellent color quality"
        - negative equivalent in subspace = "poor color quality"

In [145]:
monitor_reviews[monitor_reviews['review_id'] == 474147]['reviewText'].iloc[0]

"Abysmal viewing angles, contrast, and color - but its price reflects that.  It fills the niche of low-cost 1080p monitors quite well.  Keep in mind it doesn't have HDMI or DisplayPort support (But does support HDCP over DVI) and uses a non-standard power adapter (5V, included).  Stand is fairly sturdy."

In [147]:
key_phrases['phrase']

0                      game room
1                           easy
2                   good product
3                          stars
4                         thanks
                  ...           
30226             tablet surface
30227            thicker plastic
30228    outstanding works great
30229                broken wire
30230                    strands
Name: phrase, Length: 30231, dtype: object

## find nearest neighbors for each phrase

In [253]:
attributes = key_phrases['phrase'].tolist()
neighbor_clusters = []
for index, phrase in tqdm(key_phrases['phrase'].items()):
    qvec = phrase_vectors_arr[index]
    distances, neighbors = nn.kneighbors([qvec], n_neighbors=10)
    neighbor_clusters.append({
        'phrase_idx': index,
        'phrase': phrase,
        'neighbor_idx': neighbors[0],
        'neighbor_dist': distances[0],
        'neighbor_attr': [attributes[i] for i in neighbors[0]]
    })
    
    
    

0it [00:00, ?it/s]

In [181]:
neighbor_clusters[10]

{'phrase_idx': 10,
 'phrase': 'boat',
 'neighbor_idx': array([   10,  5789,   477, 22993,  9449,  2155, 27367, 10520,  4891,
        28177]),
 'neighbor_dist': array([0.       , 4.0086007, 4.3492165, 4.407573 , 4.5231104, 4.5345817,
        4.6084905, 4.678634 , 4.6942477, 4.745496 ], dtype=float32),
 'neighbor_attr': ['boat',
  'penalty',
  'family',
  'functionability',
  'driveway',
  'card',
  'donation',
  'body',
  'water',
  'degradation']}

## priority order for cluster generation
- to generate clusters, we can't really go first come first serve, or random, because the clustering could then be inconsistent with reality
- we can try to start with the phrases that occur most frequently in reviews, and go down the ranked list imposed by popularity
    - this is also easier to explain

In [188]:
priority_order = key_phrases.sort_values(['n_reviews', 'n_reviewers'], ascending=False)['key_phrase_id'].index.tolist()

In [192]:
priority_order[:10]

[3, 18, 12, 15, 229, 114, 36, 1, 606, 99]

## neighbor occurrence frequency
- we cluster those phrases together that have a 70% set overlap in their neighbor set

In [189]:
adjacency_matrix = np.zeros((len(priority_order), len(priority_order)))

for nc in neighbor_clusters:
    i = nc['phrase_idx']
    for j in nc['neighbor_idx']:
        adjacency_matrix[i][j] = 1


In [190]:
adjacency_matrix[:2]

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [220]:
phrase_ids = []
phrases = []
nns = []
new_nns = []

for p in tqdm(priority_order):
    q = neighbor_clusters[p]['phrase_idx']
    q_idx = neighbor_clusters[p]['neighbor_idx']
    subset = adjacency_matrix[:, q_idx]
    
    subset_sum = subset.sum(axis=1)
    
    close_idx = np.argwhere(subset_sum > 3)
    # print(neighbor_clusters[p]['phrase'])
    # print(neighbor_clusters[p]['neighbor_attr'])
    # print('new: ', key_phrases.iloc[close_idx.reshape(-1)]['phrase'].tolist())
    phrase_ids.append(p)
    phrases.append(neighbor_clusters[p]['phrase'])
    nns.append(neighbor_clusters[p]['neighbor_attr'])
    new_nns.append(key_phrases.iloc[close_idx.reshape(-1)]['phrase'].tolist())

similar_neighbor_occurrence_phrases = pd.DataFrame({
    'phrase_id': phrase_ids,
    'phrase': phrases,
    'neighbors': nns,
    'phrases_having_similar_neighbors': new_nns
})
    

  0%|          | 0/30231 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [218]:
similar_neighbor_occurrence_phrases.head()

Unnamed: 0,phrase_id,phrase,neighbors,phrases_having_similar_neighbors
0,3,stars,"[stars, replacements, presets, bolts, choices, vents, references, passengers, fingerprints, headsets]","[stars, presets, headsets, fingerprints, styles, toggles, forms, nicks, splices, vents, references, choices, casings, replacements]"
1,18,monitor,"[monitor, snap, hmdi, sleeve, displayport, psa, intercom, confuse, raspberry, manufacture]","[monitor, intercom, hmdi, connect, confuse, psa]"
2,12,great,"[great, big, horrendous, large, durable, popular, enormous, fit great, dangerous, esp]","[great, big, bad, huge, esp, horrendous]"
3,15,good,"[good, gorgeous, peerless, fond, special, indoor, fantastic, endless, superior, great]","[good, superior, gorgeous, indoor, inferior, fond]"
4,229,great monitor,"[great monitor, big monitor, handy monitor, different monitor, good monitor, expensive monitor, large monitor, nice monitor, real monitor, huge monitor]","[great monitor, large monitor, big monitor, handy monitor, different monitor, extra monitor, huge monitor, extraordinary monitor, wide monitor, actual monitor, great momitor]"
5,114,screen,"[screen, grass, boot, glass, bathroom, paperwork, btightness, driveway, game, calibration]","[screen, boot]"
6,36,price,"[price, persistence, policy, dissatisfaction, cage, voltage, tank, portability, violation, platinum]","[price, voltage, solution, ticket, peen, persistence, plywood]"
7,1,easy,"[easy, snappy, becareful, legible, electrical, incomplete, horrible, beautiful, noticeable, visible]",[easy]
8,606,nice,"[nice, inky, new, mandatory, malicious, genuine, great, wonderful, big, clear nice]","[big, nice, new, wonderful, deep, mandatory, inky, genuine]"
9,99,monitors,"[monitors, movies, prints, simulations, productions, differences, terms, perks, expeditions, choices]","[monitors, movies, terms, perks, differences, benefits, films, crafts, simulations]"


## distance based clusters
- we add those phrases to clusters that are within the average intra-cluster distance
- THIS method is MORE efficient, and the terms seems quite nice

In [222]:
cluster_dists = np.zeros(len(priority_order))
for nc in neighbor_clusters:
    i = nc['phrase_idx']
    cluster_dists[i] = nc['neighbor_dist'].mean()

In [223]:
all_clusters_mean = cluster_dists.mean()
all_clusters_mean

3.165849001115584

In [224]:
import plotly.express as px

In [225]:
px.histogram(cluster_dists)

In [226]:
np.median(cluster_dists)

2.9598581790924072

In [259]:
phrase_ids = []
phrases = []
nns = []
new_nns = []

for p in tqdm(priority_order):
    nc = neighbor_clusters[p]
    phrase_ids.append(p)
    phrases.append(nc['phrase'])
    nns.append(nc['neighbor_attr'])
    #print(nc['neighbor_idx'].shape)
    
    cluster_mean_dist = nc['neighbor_dist'].mean()
    lc = 1
    while cluster_mean_dist < all_clusters_mean and lc < 11:
        #print(cluster_mean_dist, all_clusters_mean)
        # we need to find more neighbors for this phrase until the mean intra-cluster distance is greater than the mean
        q_idx = nc['neighbor_idx']
        #print(q_idx)
        qvec = phrase_vectors_arr[q_idx].mean(axis=0)
        #print(qvec)
        distances, neighbors = nn.kneighbors([qvec], n_neighbors=10*lc)
        distances = distances[0]
        neighbors = neighbors[0]
        for knn in range(len(neighbors)):
            #print(knn, neighbors[knn])
            if neighbors[knn] not in nc['neighbor_idx']:
                #print('added')
                nc['neighbor_idx'] = np.append(nc['neighbor_idx'], neighbors[knn])
                nc['neighbor_dist'] = np.append(nc['neighbor_dist'], distances[knn])
        nc['neighbor_attr'] =  [attributes[i] for i in neighbors]
        cluster_mean_dist = nc['neighbor_dist'].mean()
        lc += 1

    new_nns.append(key_phrases.iloc[nc['neighbor_idx'].reshape(-1)]['phrase'].tolist())
    #print(nc['neighbor_idx'].shape)

mean_distance_occurrence_phrases = pd.DataFrame({
    'phrase_id': phrase_ids,
    'phrase': phrases,
    'neighbors': nns,
    'phrases_having_similar_neighbors': new_nns
})

  0%|          | 0/30231 [00:00<?, ?it/s]

In [269]:
mean_distance_occurrence_phrases.head(25)

Unnamed: 0,phrase_id,phrase,neighbors,phrases_having_similar_neighbors
0,3,stars,"[stars, replacements, presets, bolts, choices, vents, references, passengers, fingerprints, headsets]","[stars, replacements, presets, bolts, choices, vents, references, passengers, fingerprints, headsets]"
1,18,monitor,"[monitor, snap, hmdi, sleeve, displayport, psa, intercom, confuse, raspberry, manufacture]","[monitor, snap, hmdi, sleeve, displayport, psa, intercom, confuse, raspberry, manufacture]"
2,12,great,"[great, big, horrendous, large, durable, popular, enormous, fit great, dangerous, esp]","[great, big, horrendous, large, durable, popular, enormous, fit great, dangerous, esp]"
3,15,good,"[good, gorgeous, peerless, fond, special, indoor, fantastic, endless, superior, great]","[good, gorgeous, peerless, fond, special, indoor, fantastic, endless, superior, great]"
4,229,great monitor,"[spacious monitor, specific monitor, impressive monitor, terrible monitor, temporary monitor, big monitor, clever monitor, dependable monitor, sharp monitor, superior monitor, horrible monitor, in...","[great monitor, big monitor, handy monitor, different monitor, good monitor, expensive monitor, large monitor, nice monitor, real monitor, huge monitor, terrible monitor, actual monitor, impressiv..."
5,114,screen,"[screen, grass, boot, glass, bathroom, paperwork, btightness, driveway, game, calibration]","[screen, grass, boot, glass, bathroom, paperwork, btightness, driveway, game, calibration]"
6,36,price,"[price, persistence, policy, dissatisfaction, cage, voltage, tank, portability, violation, platinum]","[price, persistence, policy, dissatisfaction, cage, voltage, tank, portability, violation, platinum]"
7,1,easy,"[easy, snappy, becareful, legible, electrical, incomplete, horrible, beautiful, noticeable, visible]","[easy, snappy, becareful, legible, electrical, incomplete, horrible, beautiful, noticeable, visible]"
8,606,nice,"[nice, inky, new, mandatory, malicious, genuine, great, wonderful, big, clear nice]","[nice, inky, new, mandatory, malicious, genuine, great, wonderful, big, clear nice]"
9,99,monitors,"[prints, endorsements, ants, televisions, contents, meetings, monitors, productions, products, lines, limitations, properties, vents, directions, offices, questions, choices, peripherals, movies, ...","[monitors, movies, prints, simulations, productions, differences, terms, perks, expeditions, choices, benefits, endorsements, televisions, meetings, questions, places, lines, limitations, ants, pi..."


In [263]:
mean_distance_occurrence_phrases.to_pickle('mean_distance_occurrence_phrases.pkl')

In [257]:
mean_distance_occurrence_phrases['phrases_having_similar_neighbors'].values

array([list(['great monitor', 'big monitor', 'handy monitor', 'different monitor', 'good monitor', 'expensive monitor', 'large monitor', 'nice monitor', 'real monitor', 'huge monitor', 'terrible monitor', 'actual monitor', 'impressive monitor', 'sharp monitor', 'new monitor', 'temporary monitor', 'durable monitor', 'specific monitor', 'beautiful monitor', 'small monitor', 'remarkable monitor', 'spacious monitor', 'dependable monitor', 'clever monitor', 'heavy monitor', 'extraordinary monitor', 'superior monitor', 'competitive monitor', 'reasonable monitor', 'incredible monitor', 'fabulous monitor', 'functional monitor', 'wondrous monitor', 'horrible monitor', 'bad monitor', 'wonderful monitor', 'sick monitor', 'true monitor', 'professional monitor', 'wrong monitor', 'decent monitor', 'original monitor', 'old monitor', 'practical monitor', 'good inexpensive monitor', 'ergonomic monitor', 'modern monitor', 'inexpensive monitor', 'magnificent monitor', 'fantastic monitor', 'adjustable mon

# Phrases close to the given terms

In [260]:
laptop = 'Easy to set up', 'Quality display', 'Good color quality', 'Quality build', 'Sound quality', 'Easy to use', 'Minimal glare', 'Lightweight', 'Good viewing angles', 'Fast', 'long battery life', 'Noise level'
monitor = 'Easy to set up', 'Quality display', 'Good color quality', 'Quality build', 'Sound quality', 'Easy to use', 'Minimal glare', 'Lightweight', 'Good viewing angles', 'Fast'
headphone = 'Good sound quality', 'comfortable', 'long battery life', 'Easy to set up', 'Easy to use', 'Quality build', 'effective noise cancelling', 'Good bass', 'Call quality', 'Attractive'
mouse = 'comfortable', 'long battery life', 'Easy to set up', 'Easy to use', 'Quality build', 'Attractive', 'Noise level', 'Easy to clean'
tv ='Easy to use', 'Easy to set up', 'Sound quality', 'Fast input response', 'lightweight', 'Attractive', 'quality build', 'lacks durability', 'speed', 'craftsmanship', 'camera quality', 'remote quality'

In [268]:
for qphrase in monitor:
    qvec = nlp(qphrase).vector
    _, __n = nn.kneighbors([qvec], n_neighbors=10)
    print(__n)
    print(qphrase)
    print( [attributes[i] for i in __n[0]])

[[12020  7589 18877 21085 17309 23563 19171 19177 21558 28605]]
Easy to set up
['awesome built-in surround sound', 'go-to monitor', 'excellent widescreen hp monitor', 'menu buttons faulty', 'nice freestanding stand', 'lg support unable', 'excellent built-in webcam', 'cost effective', 'expensive af', 'ok works good']
[[ 3756 21111 10550  9048 21219 30179 20828 18312 12656 13617]]
Quality display
['great picture display', 'gtx980 strix', 'quality display', 'great quality display', 'computer display', 'resolution display', 'nice quality display', 'great screen resolution', 'wide screen display', 'color display']
[[24138 14376 28118 27779 11638  3104 21910 10059 22017  3669]]
Good color quality
['great color definition', 'poor color quality', 'poor color accuracy', 'great video quality', 'nice video quality', 'excellent color quality', 'nice color definition', 'great color production', 'poor color rendition', 'good video quality']
[[11460 13956 11194  3093 16329 18830 11510 13618 19582 165

In [278]:
qvec = nlp('durability').vector
_, __n = nn.kneighbors([qvec], n_neighbors=10)
print(__n)
print(qphrase)
print( [attributes[i] for i in __n[0]])

[[ 1799  5790   132  1164 26156 27804  1122 30006 23119  2687]]
Fast
['durability', 'violation', 'flexibility', 'portability', 'malfunction', 'continuity', 'device', 'cage', 'coloration', 'configuration']
