# In this notebook

- We build API functions that return the newer shortlisted attributes
- We refer to notebooks `13`, `14`, `16`, and `17` as needed.


In [1]:
import pandas as pd
import json

pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 1000

In [2]:
from sqlalchemy import create_engine
import psycopg2 
import io

In [3]:
import os
import glob

In [4]:
import pickle

In [5]:
import numpy as np

### set up database connection

In [6]:
conn_string = 'postgresql+psycopg2://gabbydbuser:gabbyDBpass@localhost:5432/gabbyDB'

In [7]:
db = create_engine(conn_string)
conn = db.connect()

In [60]:
CATEGORY = 'headphone'

# get_attributes_list() API

In [67]:
def get_attributes_list(category, n_qphrase_attrs=10):
    shortlisted_attributes_query = \
        f'''
        SELECT *
        FROM shortlisted_attributes
        WHERE category='{category}'
        '''
    shortlisted_attributes = pd.read_sql(shortlisted_attributes_query, conn)
    sim_attrs_list = \
        shortlisted_attributes. \
            sort_values('neighbor_distances').sort_values('n_reviews', ascending=False). \
                groupby('qphrase'). \
                    head(n_qphrase_attrs). \
                        reset_index(drop=True)[['key_phrase_id', 'phrase', 'qphrase']].sort_values('qphrase')
    sim_attrs_list_deduped =  sim_attrs_list[['key_phrase_id', 'phrase']].drop_duplicates()
    return sim_attrs_list_deduped.sample(min(50, sim_attrs_list.shape[0]))
    

In [68]:
get_attributes_list(CATEGORY, 5)

39


ValueError: Cannot take a larger sample than population when 'replace=False'

# Scratch

In [10]:
shortlisted_attributes_query = \
    f'''
    SELECT *
    FROM shortlisted_attributes
    WHERE category='{CATEGORY}'
    '''
shortlisted_attributes = pd.read_sql(shortlisted_attributes_query, conn)

In [13]:
shortlisted_attributes.shape

(386, 12)

In [33]:
sim_attrs_list = shortlisted_attributes.sort_values('neighbor_distances').sort_values('n_reviews', ascending=False).groupby('qphrase').head(10).reset_index(drop=True)[['key_phrase_id', 'phrase', 'qphrase']].sort_values('qphrase')
sim_attrs_list

Unnamed: 0,key_phrase_id,phrase,qphrase
33,331141,nice bass,awesome bass
15,328427,good bass,awesome bass
61,331635,great punchy bass,awesome bass
60,330317,mild bass,awesome bass
49,333649,decent bass,awesome bass
48,328397,big bass,awesome bass
29,328918,strong bass,awesome bass
12,328681,great bass,awesome bass
34,331766,heavy bass,awesome bass
62,333517,blissful bass,awesome bass


In [38]:
sim_attrs_list[['key_phrase_id', 'phrase']].drop_duplicates().sample(50)

Unnamed: 0,key_phrase_id,phrase
31,329045,quality product
15,328427,good bass
0,328250,good sound
12,328681,great bass
32,330168,great noise cancelling
36,332935,active noise cancellation
3,328821,good sound quality
39,329197,nice fit
16,329203,comfortable fit
7,328974,price range


In [21]:
sim_attr_lists['n_attrs'] = sim_attr_lists['phrase'].apply(lambda x: len(x))

In [16]:
#shortlisted_attributes.sort_values('n_reviews', ascending=False).sort_values('neighbor_distances').groupby('qphrase')['phrase'].apply(list).reset_index()

In [22]:
sim_attr_lists

Unnamed: 0,qphrase,phrase,n_attrs
0,awesome bass,"[great bass, good bass, strong bass, nice bass, heavy bass, big bass, decent bass, mild bass, great punchy bass, blissful bass, great bass response, great bass performance, great bass sound, reaso...",15
1,battery life,"[battery life, volume control, price range, volume level, quality product]",5
2,comfortable fit,"[good sound, good sound quality, great sound quality, excellent sound quality, comfortable fit, amazing sound quality, poor sound quality, great audio quality, good fit, good quality sound, high q...",122
3,effective noise cancelling,"[good battery life, great noise cancelling, active noise cancellation, good noise cancellation, active noise cancelling, good noise isolation, great customer support, good noise cancelling, great ...",124
4,good fit,"[great fit, comfortable fit, good fit, nice fit, poor fit, tight fit, excellent fit, bad fit, comfortable fit battery, new fit, decent fit]",11
5,good sound quality,"[good sound quality, great sound quality, excellent sound quality, amazing sound quality, poor sound quality, great audio quality, good quality sound, high quality sound, decent sound quality, goo...",100
6,is durable,"[works great, works good, sounds good, sounds excellent, gels tight]",5
7,light Weight,"[light weight, light usage, light weight easy, light music]",4
