In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
import cufflinks as cf

import plotly.plotly as py
import plotly.tools as tls
import plotly.graph_objs as go
import plotly.offline as po

# Load the raw booth data, then keep only rows whose featured category is one
# of the categories analysed in this notebook.
orig_df = pd.read_csv("winning_booths_data.csv")

top_categories = [
    'No Category',
    'Fashion',
    'Home & Garden',
    'Health & Beauty',
    'Parts & Accessories',
    'Jewelry & Watches',
]
df = orig_df[orig_df['featured_category_id'].isin(top_categories)]


In [2]:
# One Series of profit estimates per featured category, selected with an
# explicit row mask + column label.
no_category = df.loc[df['featured_category_id'] == 'No Category', 'profit_estimate']
fashion = df.loc[df['featured_category_id'] == 'Fashion', 'profit_estimate']
home_garden = df.loc[df['featured_category_id'] == 'Home & Garden', 'profit_estimate']
health_beauty = df.loc[df['featured_category_id'] == 'Health & Beauty', 'profit_estimate']
parts_accessories = df.loc[df['featured_category_id'] == 'Parts & Accessories', 'profit_estimate']
jewelry_watches = df.loc[df['featured_category_id'] == 'Jewelry & Watches', 'profit_estimate']

# Same order as `top_categories`, so the two lists can be zipped together.
categories = [
    no_category,
    fashion,
    home_garden,
    health_beauty,
    parts_accessories,
    jewelry_watches,
]

In [3]:
# Size of the smallest category group (never larger than the filtered frame
# itself); used as the common sample size for every category.
size = min([len(df)] + [len(series) for series in categories])
print(size)

418


In [4]:
# Downsample every category to `size` observations so that each box in the
# plot is built from the same number of points.
#
# The `categories` list is rebuilt from the samples: rebinding only the
# individual names would leave the list (which is what gets plotted) pointing
# at the original, unsampled Series.
# A fixed random_state makes the sample reproducible across kernel restarts.
categories = [series.sample(size, random_state=42) for series in categories]

# Keep the per-category names in sync with the sampled list.
(no_category, fashion, home_garden,
 health_beauty, parts_accessories, jewelry_watches) = categories

In [5]:
# Box labels (x) and the matching profit Series (y), in the same order.
x_data, y_data = top_categories, categories

In [6]:
# One colour per category, in the same order as `x_data` / `y_data`.
colors = ['blue', 'orange', 'green', 'red', 'purple', 'yellow']

# Build one box trace per category. The colour from `colors` is applied to the
# trace's marker so the palette defined above actually reaches the figure.
traces = [
    go.Box(
        y=profits,
        name=label,
        boxmean=True,                    # also draw the mean
        boxpoints='suspectedoutliers',   # only plot suspected outlier points
        marker=dict(color=color),
    )
    for label, profits, color in zip(x_data, y_data, colors)
]

In [7]:
# Figure-level styling for the box plot: light grey canvas, white grid lines
# and zero line on the y axis, no legend (each box is labelled on the x axis).
layout = go.Layout(
    title='Profit Distribution by Category',
    showlegend=False,
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)',
    margin={'l': 40, 'r': 30, 'b': 80, 't': 100},
    yaxis={
        'autorange': True,
        'showgrid': True,
        'gridcolor': 'rgb(255, 255, 255)',
        'gridwidth': 1,
        'zeroline': True,
        'zerolinecolor': 'rgb(255, 255, 255)',
        'zerolinewidth': 2,
    },
)

# Combine the per-category box traces with the layout.
fig = go.Figure(data=traces, layout=layout)

In [8]:
# Render the box plot to a local HTML file; the call returns its file:// URL.
po.plot(figure_or_data=fig)

'file:///Users/kevin/data_analysis/temp-plot.html'

In [40]:
# Per-group style overrides for the plotly `groupby` transform: each category
# value present in `df` is paired with one colour from `colors`.
#
# unique() is evaluated once, and zip() stops at the shorter sequence, so a
# frame containing fewer categories than there are colours no longer raises
# IndexError.
opts = [
    dict(target = category, value = dict(marker = dict(color = color)))
    for category, color in zip(df.featured_category_id.unique(), colors)
]

In [3]:
# Bubble chart trace: `days_as_user` on x, `profit_estimate` on y, one marker
# per row of `df`, marker area driven by the row's `orders` value.
data = [dict(
  type = 'scatter',
  mode = 'markers',
  x = df['days_as_user'],
  y = df['profit_estimate'],
  text = df['id'],
  # Hover labels show only the `text` value (the row id).
  # Note: `hovermode` is a layout-level attribute, not a scatter-trace
  # attribute, so it is not set on the trace.
  hoverinfo = 'text',
  opacity = 0.8,
  marker = dict(
      size = df['orders'],
      sizemode = 'area',
      # Plotly's recommended scaling for area-sized bubbles:
      # 2 * max(size) / (largest marker size in px) ** 2, here with 100 px.
      # Series.max() skips NaN; the builtin max() is not NaN-safe and could
      # yield a NaN sizeref if any `orders` value is missing.
      sizeref = 2. * df['orders'].max() / (100.**2),
  ),
  transforms = [
      dict(
        # Split the single trace into one group per featured category and
        # apply the per-category styles built in `opts`.
        type = 'groupby',
        groups = df['featured_category_id'],
        styles = opts
    )]
)]

In [4]:
# Layout for the bubble chart.
# `hovermode` is a figure-level (layout) setting, so it is configured here:
# 'closest' shows the hover label of the single nearest marker.
# Axis titles name the plotted columns so the figure can be read on its own.
layout = dict(
    hovermode = 'closest',
    xaxis = dict(
        title = 'days_as_user',
    ),
    yaxis = dict(
        title = 'profit_estimate',
    ),
)

In [5]:
# Render the bubble chart to a local HTML file. Validation is disabled, so the
# raw dict-based trace is passed through to plotly without schema checks.
po.plot(dict(data=data, layout=layout), validate=False)

'file:///Users/bonanza/data_analysis/temp-plot.html'