# Data Gathering Project: Amazon Reviews
### MATH-GA 2047-001 Data Science in Quantitative Finance
Group Member Names:  

**Author A: Binqian Zeng**

**NetID: bz866**

**Email: bz866@nyu.edu**
<br>
<br>

**Author B: Bailey Griswold**  
**NetID: bg1672**  
**Email: bg1672@nyu.edu**  
<br>
<br>
**Author C: Shouxi Wei**  
**NetID: sw3558**  
**Email: sw3558@nyu.edu**
<br>
<br>

BQPlotting  
Part 1:  
What are the top 200 most reviewed items in each category?  
Part 2:  
Use BQPlot MarketMap widget and have a grid map of the top 200 most reviewed items at a given point of time.  
Hovering over each cell on the map shows a plot of the number of reviews as a function of time for that item.  
Using either a slider or using an IndexSelector on top of the chart of the total number of reviews, have the user dynamically select the calendar date at which the MarketMap shows the top 200 most reviewed items. In general you expect the list of top 200 to be different as the calendar date changes.

In [1]:
import pandas as pd
import numpy as np
import os

from bqplot import (
    LogScale, LinearScale, ColorScale, OrdinalColorScale, ColorAxis,
    Axis, Scatter, Lines, CATEGORY10, Label, Figure, Tooltip, DateScale, toolbar
)

from bqplot.market_map import MarketMap

from ipywidgets import HBox, VBox, IntSlider, Play, jslink, Layout

In [40]:
# Category to analyse: it is spliced into the pickle file names that are
# loaded next and is also used as the market map's title.
cat = 'Beauty'

In [41]:
# Load the pickled reviews frame and product-metadata frame for `cat`.
# NOTE(review): the paths are relative to the working directory, and
# read_pickle must only be pointed at trusted files -- these are presumably
# the project's own pickles; confirm before reusing with other data.
df = pd.read_pickle('./DataFrame_Pickle/df_reviews_'+cat+'.pkl')
df_products = pd.read_pickle('./DataFrame_Pickle/df_products_'+cat+'.pkl')

In [42]:
# Cumulative number of reviews per product as a function of time.

# Reviews per (product, date), ordered by product and then by date. Counting
# with size() and naming the column explicitly avoids depending on what
# value_counts() calls its result, which differs between pandas versions.
date_counts = (
    df.groupby(['productid', 'timestamp'])
    .size()
    .rename('date_tally')
    .reset_index()
)

# Reshape to one row per date and one column per product.
tally = date_counts.set_index(['timestamp', 'productid'])['date_tally'].unstack('productid')

# Running total of reviews. Dates on which a product received no review are
# NaN after the reshape; they carry the previous total forward. Dates before
# a product's first review stay NaN. DataFrame.ffill() replaces the
# deprecated fillna(method='ffill').
tally = tally.cumsum().ffill()

In [43]:
def get_top200(year, n=200):
    """Return the most-reviewed products up to and including ``year``.

    Reads the module-level ``df`` (reviews) and ``df_products`` (product
    metadata) frames.

    year: calendar year; reviews whose ``timestamp`` falls in a later year
        are ignored.
    n: how many products to keep (defaults to 200, the original behaviour).

    Returns a ``(productids, top)`` pair: ``top`` is a DataFrame holding a
    ``total_reviews`` column plus the product metadata (without ``imUrl``),
    ordered from most to fewest reviews, and ``productids`` is the array of
    its ``productid`` values.
    """
    # Keep only the reviews written in or before the requested year.
    reviews_to_date = df[df['timestamp'].dt.year <= year]

    # Review count per product, largest first.
    counts = (
        reviews_to_date.groupby('productid')
        .size()
        .rename('total_reviews')
        .sort_values(ascending=False)
        .to_frame()
    )

    # A left join keeps every counted product even when it has no metadata row.
    df_names = pd.merge(counts, df_products, left_index=True, right_on='productid', how='left')
    # The image URL is not needed; tolerate metadata that has no such column.
    df_names = df_names.drop(columns=['imUrl'], errors='ignore')

    top_200 = df_names.head(n)
    return top_200.productid.values, top_200

In [44]:
# Year the slider starts on (passed as the IntSlider's initial value).
initial_year = 1995

In [45]:
# Tooltip figure: a single line chart whose y-values are filled in by the
# hover callback.
sc_x, sc_y = DateScale(), LinearScale()

ax_x = Axis(label='Date', scale=sc_x, grid_lines='dashed')
ax_y = Axis(
    label='Number of Reviews',
    scale=sc_y,
    orientation='vertical',
    grid_lines='dashed',
)

# x holds every date in the cumulative tally; y starts out empty.
line = Lines(
    x=tally.index.values,
    y=[],
    scales={'x': sc_x, 'y': sc_y},
    colors=['orange'],
)
fig_tooltip = Figure(marks=[line], axes=[ax_x, ax_y])

In [51]:
# Build the market map for the starting year. Using `initial_year` (the value
# the year slider starts on) rather than a second hard-coded 1995 keeps the
# first rendering in step with the slider.
products, data = get_top200(initial_year)
col = ColorScale(scheme='Greens')
ax_c = ColorAxis(scale=col, label='tally', visible=False)
market_map = MarketMap(
    # One cell per product id.
    names=products,
    # Per-product metadata frame made available to the map.
    ref_data=data,
    # Figure used as the tooltip.
    tooltip_widget=fig_tooltip,
    # Scale and (hidden) axis for colour data.
    scales={'color': col},
    axes=[ax_c],
    layout=Layout(min_width='1000px', min_height='800px'),
)

# Appearance: cell colour, label font, and the category name as a red title.
market_map.colors = ['MediumSeaGreen']
market_map.font_style = {'font-size': '10px', 'fill':'white'}
market_map.title = cat
market_map.title_style = {'fill': 'Red'}

# State shared with the hover callback: the 'data' value of the most recent
# hover event that was handled.
hovered_symbol = ''
def hover_handler(self, content):
    """Refresh the tooltip chart when a different map cell is hovered.

    self: first positional argument passed to the callback (presumably the
        widget that fired the event -- confirm against bqplot); unused here.
    content: dict describing the hover event. Its 'data' entry is looked up
        as a column of ``tally`` (presumably the hovered cell's name, i.e. a
        productid -- confirm against the bqplot MarketMap docs); its
        'ref_data' entry may carry a 'title' used as the chart title.
    """
    global hovered_symbol
    symbol = content.get('data', '')

    # Nothing to do while the pointer stays on the same cell.
    if symbol == hovered_symbol:
        return
    hovered_symbol = symbol

    # Leave the chart untouched when there is no tally column for this name.
    history = tally.get(symbol)
    if history is None:
        return

    line.y = history.values
    fig_tooltip.title = content.get('ref_data', {}).get('title', '')
            


In [52]:
# Year slider: moving it recomputes the most-reviewed products for that year.
year_slider = IntSlider(min=1995, max=2014, step=1, description='Year', value=initial_year)

def year_changed(change):
    """Refresh the market map after the slider's value changes.

    change: the change notification passed by ``observe``; the selected year
        is read from ``year_slider.value`` directly.
    """
    names, ref_data = get_top200(year_slider.value)
    # names and ref_data describe the same products, so send them to the
    # front end as one state update instead of two separate ones.
    with market_map.hold_sync():
        market_map.names = names
        market_map.ref_data = ref_data

year_slider.observe(year_changed, 'value')

In [53]:
# Show the interactive plot: register the hover callback on the map, then
# stack the year slider above the market map.
market_map.on_hover(hover_handler)
VBox([HBox([year_slider]), market_map])

A Jupyter Widget

In [11]:
# Category to analyse: it is spliced into the pickle file names that are
# loaded next and is also used as the market map's title.
cat = 'Elec'

In [12]:
# Load the pickled reviews frame and product-metadata frame for `cat`.
# NOTE(review): the paths are relative to the working directory, and
# read_pickle must only be pointed at trusted files -- these are presumably
# the project's own pickles; confirm before reusing with other data.
df = pd.read_pickle('./DataFrame_Pickle/df_reviews_'+cat+'.pkl')
df_products = pd.read_pickle('./DataFrame_Pickle/df_products_'+cat+'.pkl')

In [13]:
# Cumulative number of reviews per product as a function of time.

# Reviews per (product, date), ordered by product and then by date. Counting
# with size() and naming the column explicitly avoids depending on what
# value_counts() calls its result, which differs between pandas versions.
date_counts = (
    df.groupby(['productid', 'timestamp'])
    .size()
    .rename('date_tally')
    .reset_index()
)

# Reshape to one row per date and one column per product.
tally = date_counts.set_index(['timestamp', 'productid'])['date_tally'].unstack('productid')

# Running total of reviews. Dates on which a product received no review are
# NaN after the reshape; they carry the previous total forward. Dates before
# a product's first review stay NaN. DataFrame.ffill() replaces the
# deprecated fillna(method='ffill').
tally = tally.cumsum().ffill()

In [14]:
def get_top200(year, n=200):
    """Return the most-reviewed products up to and including ``year``.

    Reads the module-level ``df`` (reviews) and ``df_products`` (product
    metadata) frames.

    year: calendar year; reviews whose ``timestamp`` falls in a later year
        are ignored.
    n: how many products to keep (defaults to 200, the original behaviour).

    Returns a ``(productids, top)`` pair: ``top`` is a DataFrame holding a
    ``total_reviews`` column plus the product metadata (without ``imUrl``),
    ordered from most to fewest reviews, and ``productids`` is the array of
    its ``productid`` values.
    """
    # Keep only the reviews written in or before the requested year.
    reviews_to_date = df[df['timestamp'].dt.year <= year]

    # Review count per product, largest first.
    counts = (
        reviews_to_date.groupby('productid')
        .size()
        .rename('total_reviews')
        .sort_values(ascending=False)
        .to_frame()
    )

    # A left join keeps every counted product even when it has no metadata row.
    df_names = pd.merge(counts, df_products, left_index=True, right_on='productid', how='left')
    # The image URL is not needed; tolerate metadata that has no such column.
    df_names = df_names.drop(columns=['imUrl'], errors='ignore')

    top_200 = df_names.head(n)
    return top_200.productid.values, top_200

In [15]:
# Year the slider starts on (passed as the IntSlider's initial value).
initial_year = 1995

In [16]:
# Tooltip figure: a single line chart whose y-values are filled in by the
# hover callback.
sc_x, sc_y = DateScale(), LinearScale()

ax_x = Axis(label='Date', scale=sc_x, grid_lines='dashed')
ax_y = Axis(
    label='Number of Reviews',
    scale=sc_y,
    orientation='vertical',
    grid_lines='dashed',
)

# x holds every date in the cumulative tally; y starts out empty.
line = Lines(
    x=tally.index.values,
    y=[],
    scales={'x': sc_x, 'y': sc_y},
    colors=['orange'],
)
fig_tooltip = Figure(marks=[line], axes=[ax_x, ax_y])

In [17]:
# Build the market map for the starting year. Using `initial_year` (the value
# the year slider starts on) rather than a second hard-coded 1995 keeps the
# first rendering in step with the slider.
products, data = get_top200(initial_year)
col = ColorScale(scheme='Greens')
ax_c = ColorAxis(scale=col, label='tally', visible=False)
market_map = MarketMap(
    # One cell per product id.
    names=products,
    # Per-product metadata frame made available to the map.
    ref_data=data,
    # Figure used as the tooltip.
    tooltip_widget=fig_tooltip,
    # Scale and (hidden) axis for colour data.
    scales={'color': col},
    axes=[ax_c],
    layout=Layout(min_width='1000px', min_height='800px'),
)

# Appearance: cell colour, label font, and the category name as a red title.
market_map.colors = ['MediumSeaGreen']
market_map.font_style = {'font-size': '10px', 'fill':'white'}
market_map.title = cat
market_map.title_style = {'fill': 'Red'}

# State shared with the hover callback: the 'data' value of the most recent
# hover event that was handled.
hovered_symbol = ''
def hover_handler(self, content):
    """Refresh the tooltip chart when a different map cell is hovered.

    self: first positional argument passed to the callback (presumably the
        widget that fired the event -- confirm against bqplot); unused here.
    content: dict describing the hover event. Its 'data' entry is looked up
        as a column of ``tally`` (presumably the hovered cell's name, i.e. a
        productid -- confirm against the bqplot MarketMap docs); its
        'ref_data' entry may carry a 'title' used as the chart title.
    """
    global hovered_symbol
    symbol = content.get('data', '')

    # Nothing to do while the pointer stays on the same cell.
    if symbol == hovered_symbol:
        return
    hovered_symbol = symbol

    # Leave the chart untouched when there is no tally column for this name.
    history = tally.get(symbol)
    if history is None:
        return

    line.y = history.values
    fig_tooltip.title = content.get('ref_data', {}).get('title', '')
            


In [18]:
# Year slider: moving it recomputes the most-reviewed products for that year.
year_slider = IntSlider(min=1995, max=2014, step=1, description='Year', value=initial_year)

def year_changed(change):
    """Refresh the market map after the slider's value changes.

    change: the change notification passed by ``observe``; the selected year
        is read from ``year_slider.value`` directly.
    """
    names, ref_data = get_top200(year_slider.value)
    # names and ref_data describe the same products, so send them to the
    # front end as one state update instead of two separate ones.
    with market_map.hold_sync():
        market_map.names = names
        market_map.ref_data = ref_data

year_slider.observe(year_changed, 'value')

In [19]:
# Show the interactive plot: register the hover callback on the map, then
# stack the year slider above the market map.
market_map.on_hover(hover_handler)
VBox([HBox([year_slider]), market_map])

A Jupyter Widget

In [20]:
# Category to analyse: it is spliced into the pickle file names that are
# loaded next and is also used as the market map's title.
cat = 'Books'

In [21]:
# Load the pickled reviews frame and product-metadata frame for `cat`.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError
# for the 'Books' reviews pickle, so that file has to be generated before the
# cells for this category can run.
# NOTE(review): the paths are relative to the working directory, and
# read_pickle must only be pointed at trusted files -- these are presumably
# the project's own pickles; confirm before reusing with other data.
df = pd.read_pickle('./DataFrame_Pickle/df_reviews_'+cat+'.pkl')
df_products = pd.read_pickle('./DataFrame_Pickle/df_products_'+cat+'.pkl')

FileNotFoundError: [Errno 2] No such file or directory: './DataFrame_Pickle/df_reviews_Books.pkl'

In [None]:
# Cumulative number of reviews per product as a function of time.

# Reviews per (product, date), ordered by product and then by date. Counting
# with size() and naming the column explicitly avoids depending on what
# value_counts() calls its result, which differs between pandas versions.
date_counts = (
    df.groupby(['productid', 'timestamp'])
    .size()
    .rename('date_tally')
    .reset_index()
)

# Reshape to one row per date and one column per product.
tally = date_counts.set_index(['timestamp', 'productid'])['date_tally'].unstack('productid')

# Running total of reviews. Dates on which a product received no review are
# NaN after the reshape; they carry the previous total forward. Dates before
# a product's first review stay NaN. DataFrame.ffill() replaces the
# deprecated fillna(method='ffill').
tally = tally.cumsum().ffill()

In [None]:
def get_top200(year, n=200):
    """Return the most-reviewed products up to and including ``year``.

    Reads the module-level ``df`` (reviews) and ``df_products`` (product
    metadata) frames.

    year: calendar year; reviews whose ``timestamp`` falls in a later year
        are ignored.
    n: how many products to keep (defaults to 200, the original behaviour).

    Returns a ``(productids, top)`` pair: ``top`` is a DataFrame holding a
    ``total_reviews`` column plus the product metadata (without ``imUrl``),
    ordered from most to fewest reviews, and ``productids`` is the array of
    its ``productid`` values.
    """
    # Keep only the reviews written in or before the requested year.
    reviews_to_date = df[df['timestamp'].dt.year <= year]

    # Review count per product, largest first.
    counts = (
        reviews_to_date.groupby('productid')
        .size()
        .rename('total_reviews')
        .sort_values(ascending=False)
        .to_frame()
    )

    # A left join keeps every counted product even when it has no metadata row.
    df_names = pd.merge(counts, df_products, left_index=True, right_on='productid', how='left')
    # The image URL is not needed; tolerate metadata that has no such column.
    df_names = df_names.drop(columns=['imUrl'], errors='ignore')

    top_200 = df_names.head(n)
    return top_200.productid.values, top_200

In [None]:
# Year the slider starts on (passed as the IntSlider's initial value).
initial_year = 1995

In [None]:
# Tooltip figure: a single line chart whose y-values are filled in by the
# hover callback.
sc_x, sc_y = DateScale(), LinearScale()

ax_x = Axis(label='Date', scale=sc_x, grid_lines='dashed')
ax_y = Axis(
    label='Number of Reviews',
    scale=sc_y,
    orientation='vertical',
    grid_lines='dashed',
)

# x holds every date in the cumulative tally; y starts out empty.
line = Lines(
    x=tally.index.values,
    y=[],
    scales={'x': sc_x, 'y': sc_y},
    colors=['orange'],
)
fig_tooltip = Figure(marks=[line], axes=[ax_x, ax_y])

In [None]:
# Build the market map for the starting year. Using `initial_year` (the value
# the year slider starts on) rather than a second hard-coded 1995 keeps the
# first rendering in step with the slider.
products, data = get_top200(initial_year)
col = ColorScale(scheme='Greens')
ax_c = ColorAxis(scale=col, label='tally', visible=False)
market_map = MarketMap(
    # One cell per product id.
    names=products,
    # Per-product metadata frame made available to the map.
    ref_data=data,
    # Figure used as the tooltip.
    tooltip_widget=fig_tooltip,
    # Scale and (hidden) axis for colour data.
    scales={'color': col},
    axes=[ax_c],
    layout=Layout(min_width='1000px', min_height='800px'),
)

# Appearance: cell colour, label font, and the category name as a red title.
market_map.colors = ['MediumSeaGreen']
market_map.font_style = {'font-size': '10px', 'fill':'white'}
market_map.title = cat
market_map.title_style = {'fill': 'Red'}

# State shared with the hover callback: the 'data' value of the most recent
# hover event that was handled.
hovered_symbol = ''
def hover_handler(self, content):
    """Refresh the tooltip chart when a different map cell is hovered.

    self: first positional argument passed to the callback (presumably the
        widget that fired the event -- confirm against bqplot); unused here.
    content: dict describing the hover event. Its 'data' entry is looked up
        as a column of ``tally`` (presumably the hovered cell's name, i.e. a
        productid -- confirm against the bqplot MarketMap docs); its
        'ref_data' entry may carry a 'title' used as the chart title.
    """
    global hovered_symbol
    symbol = content.get('data', '')

    # Nothing to do while the pointer stays on the same cell.
    if symbol == hovered_symbol:
        return
    hovered_symbol = symbol

    # Leave the chart untouched when there is no tally column for this name.
    history = tally.get(symbol)
    if history is None:
        return

    line.y = history.values
    fig_tooltip.title = content.get('ref_data', {}).get('title', '')
            


In [None]:
# Year slider: moving it recomputes the most-reviewed products for that year.
year_slider = IntSlider(min=1995, max=2014, step=1, description='Year', value=initial_year)

def year_changed(change):
    """Refresh the market map after the slider's value changes.

    change: the change notification passed by ``observe``; the selected year
        is read from ``year_slider.value`` directly.
    """
    names, ref_data = get_top200(year_slider.value)
    # names and ref_data describe the same products, so send them to the
    # front end as one state update instead of two separate ones.
    with market_map.hold_sync():
        market_map.names = names
        market_map.ref_data = ref_data

year_slider.observe(year_changed, 'value')

In [None]:
# Show the interactive plot: register the hover callback on the map, then
# stack the year slider above the market map.
market_map.on_hover(hover_handler)
VBox([HBox([year_slider]), market_map])

In [22]:
# Category to analyse: it is spliced into the pickle file names that are
# loaded next and is also used as the market map's title.
cat = 'Cell'

In [23]:
# Load the pickled reviews frame and product-metadata frame for `cat`.
# NOTE(review): the paths are relative to the working directory, and
# read_pickle must only be pointed at trusted files -- these are presumably
# the project's own pickles; confirm before reusing with other data.
df = pd.read_pickle('./DataFrame_Pickle/df_reviews_'+cat+'.pkl')
df_products = pd.read_pickle('./DataFrame_Pickle/df_products_'+cat+'.pkl')

In [24]:
# Cumulative number of reviews per product as a function of time.

# Reviews per (product, date), ordered by product and then by date. Counting
# with size() and naming the column explicitly avoids depending on what
# value_counts() calls its result, which differs between pandas versions.
date_counts = (
    df.groupby(['productid', 'timestamp'])
    .size()
    .rename('date_tally')
    .reset_index()
)

# Reshape to one row per date and one column per product.
tally = date_counts.set_index(['timestamp', 'productid'])['date_tally'].unstack('productid')

# Running total of reviews. Dates on which a product received no review are
# NaN after the reshape; they carry the previous total forward. Dates before
# a product's first review stay NaN. DataFrame.ffill() replaces the
# deprecated fillna(method='ffill').
tally = tally.cumsum().ffill()

In [25]:
def get_top200(year, n=200):
    """Return the most-reviewed products up to and including ``year``.

    Reads the module-level ``df`` (reviews) and ``df_products`` (product
    metadata) frames.

    year: calendar year; reviews whose ``timestamp`` falls in a later year
        are ignored.
    n: how many products to keep (defaults to 200, the original behaviour).

    Returns a ``(productids, top)`` pair: ``top`` is a DataFrame holding a
    ``total_reviews`` column plus the product metadata (without ``imUrl``),
    ordered from most to fewest reviews, and ``productids`` is the array of
    its ``productid`` values.
    """
    # Keep only the reviews written in or before the requested year.
    reviews_to_date = df[df['timestamp'].dt.year <= year]

    # Review count per product, largest first.
    counts = (
        reviews_to_date.groupby('productid')
        .size()
        .rename('total_reviews')
        .sort_values(ascending=False)
        .to_frame()
    )

    # A left join keeps every counted product even when it has no metadata row.
    df_names = pd.merge(counts, df_products, left_index=True, right_on='productid', how='left')
    # The image URL is not needed; tolerate metadata that has no such column.
    df_names = df_names.drop(columns=['imUrl'], errors='ignore')

    top_200 = df_names.head(n)
    return top_200.productid.values, top_200

In [26]:
# Year the slider starts on (passed as the IntSlider's initial value).
initial_year = 1995

In [27]:
# Tooltip figure: a single line chart whose y-values are filled in by the
# hover callback.
sc_x, sc_y = DateScale(), LinearScale()

ax_x = Axis(label='Date', scale=sc_x, grid_lines='dashed')
ax_y = Axis(
    label='Number of Reviews',
    scale=sc_y,
    orientation='vertical',
    grid_lines='dashed',
)

# x holds every date in the cumulative tally; y starts out empty.
line = Lines(
    x=tally.index.values,
    y=[],
    scales={'x': sc_x, 'y': sc_y},
    colors=['orange'],
)
fig_tooltip = Figure(marks=[line], axes=[ax_x, ax_y])

In [28]:
# Build the market map for the starting year. Using `initial_year` (the value
# the year slider starts on) rather than a second hard-coded 1995 keeps the
# first rendering in step with the slider.
products, data = get_top200(initial_year)
col = ColorScale(scheme='Greens')
ax_c = ColorAxis(scale=col, label='tally', visible=False)
market_map = MarketMap(
    # One cell per product id.
    names=products,
    # Per-product metadata frame made available to the map.
    ref_data=data,
    # Figure used as the tooltip.
    tooltip_widget=fig_tooltip,
    # Scale and (hidden) axis for colour data.
    scales={'color': col},
    axes=[ax_c],
    layout=Layout(min_width='1000px', min_height='800px'),
)

# Appearance: cell colour, label font, and the category name as a red title.
market_map.colors = ['MediumSeaGreen']
market_map.font_style = {'font-size': '10px', 'fill':'white'}
market_map.title = cat
market_map.title_style = {'fill': 'Red'}

# State shared with the hover callback: the 'data' value of the most recent
# hover event that was handled.
hovered_symbol = ''
def hover_handler(self, content):
    """Refresh the tooltip chart when a different map cell is hovered.

    self: first positional argument passed to the callback (presumably the
        widget that fired the event -- confirm against bqplot); unused here.
    content: dict describing the hover event. Its 'data' entry is looked up
        as a column of ``tally`` (presumably the hovered cell's name, i.e. a
        productid -- confirm against the bqplot MarketMap docs); its
        'ref_data' entry may carry a 'title' used as the chart title.
    """
    global hovered_symbol
    symbol = content.get('data', '')

    # Nothing to do while the pointer stays on the same cell.
    if symbol == hovered_symbol:
        return
    hovered_symbol = symbol

    # Leave the chart untouched when there is no tally column for this name.
    history = tally.get(symbol)
    if history is None:
        return

    line.y = history.values
    fig_tooltip.title = content.get('ref_data', {}).get('title', '')
            


In [29]:
# Year slider: moving it recomputes the most-reviewed products for that year.
year_slider = IntSlider(min=1995, max=2014, step=1, description='Year', value=initial_year)

def year_changed(change):
    """Refresh the market map after the slider's value changes.

    change: the change notification passed by ``observe``; the selected year
        is read from ``year_slider.value`` directly.
    """
    names, ref_data = get_top200(year_slider.value)
    # names and ref_data describe the same products, so send them to the
    # front end as one state update instead of two separate ones.
    with market_map.hold_sync():
        market_map.names = names
        market_map.ref_data = ref_data

year_slider.observe(year_changed, 'value')

In [30]:
# Show the interactive plot: register the hover callback on the map, then
# stack the year slider above the market map.
market_map.on_hover(hover_handler)
VBox([HBox([year_slider]), market_map])

A Jupyter Widget