In [26]:
import pandas as pd, numpy as np, plotly as py
import plotly.graph_objs as go
import ipywidgets as widgets
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
py.offline.init_notebook_mode(connected=True)
from plotly import tools
from plotly.graph_objs import *
%matplotlib inline
from math import floor
import networkx as nx

This notebook focuses on the following types of plots using plotly:
- Bubble Charts
- Bar Charts
- Network & Drop-down Menus (This is the hardest part & plotly's documentation is not enough)
- Geo-Visualizations (Choropleths & Symbol Plots)

In [13]:
# Load the cleaned movie table and add `profit` as revenue minus budget.
data = pd.read_csv("data/cleaned_movie.csv").assign(
    profit=lambda movies: movies["revenue"] - movies["budget"]
)

In [14]:
def extract_decade(x):
    """Return the decade label for a year, e.g. 1994 -> "1990s".

    The year is rounded down to a multiple of ten and suffixed with "s".
    """
    decade_start = floor(x / 10) * 10
    return "{}s".format(decade_start)
# Label every movie with the decade of its release year.
data["decade"] = data["year"].map(extract_decade)

In [15]:
# Show the first three movies with columns as rows so wide fields stay readable.
data.head(3).T

Unnamed: 0,0,1,2
budget,237000000,300000000,245000000
genres,"['Action', 'Adventure', 'Fantasy', 'Science Fi...","['Adventure', 'Fantasy', 'Action']","['Action', 'Adventure', 'Crime']"
keywords,"['culture clash', 'future', 'space war', 'spac...","['ocean', 'drug abuse', 'exotic island', 'east...","['spy', 'based on novel', 'secret agent', 'seq..."
original_language,en,en,en
overview,"In the 22nd century, a paraplegic Marine is di...","Captain Barbossa, long believed to be dead, ha...",A cryptic message from Bond’s past sends him o...
popularity,150.438,139.083,107.377
production_companies,"['Ingenious Film Partners', 'Twentieth Century...","['Walt Disney Pictures', 'Jerry Bruckheimer Fi...","['Columbia Pictures', 'Danjaq', 'B24']"
production_countries,"['United States of America', 'United Kingdom']",['United States of America'],"['United Kingdom', 'United States of America']"
release_date,2009-12-10,2007-05-19,2015-10-26
revenue,2787965087,961000000,880674609


### Bar Plots - Vote Count for Each Decade

In [20]:
# Total vote count per decade; the first four decade rows are dropped by position.
df_by_vote = (
    data.groupby('decade')['vote_count']
    .sum()
    .reset_index()
    .iloc[4:]
)

In [21]:
# One bar per decade; bar height is the summed vote count for that decade.
bar_data = [go.Bar(x=df_by_vote['decade'], y=df_by_vote["vote_count"])]

bar_layout = {
    'title': 'Vote Count for each decade',
    'xaxis': {'title': 'Decade'},
    'yaxis': {'title': 'Total Votes'},
}

py.offline.iplot({'data': bar_data, 'layout': bar_layout})

### Violin Plots - Movie Ratings by Decade

In [23]:
# Build one violin trace per decade, in order of first appearance in `data`.
decades = pd.unique(data['decade'])

da = []
for decade in decades:
    in_decade = data['decade'] == decade  # boolean mask shared by x and y
    da.append({
        "type": 'violin',
        "x": data['decade'][in_decade],
        "y": data['vote_average'][in_decade],
        "name": decade,
        "box": {"visible": True},
        "meanline": {"visible": True},
    })

fig = {
    "data": da,
    "layout": {
        "title": "Average Movie Ratings by Decade",
        # NOTE(review): `autotick` looks like a legacy axis attribute; validate=False
        # below is presumably what lets it through — confirm.
        "xaxis": dict(title='Decade', autotick=False, showticklabels=True),
        "yaxis": dict(title='Average Rating'),
    },
}

iplot(fig, validate=False)

### Bubble Chart - Profit vs Budget (Colored by Rating)

In [25]:
# Bubble chart of profit against budget: one marker per movie, sized and
# coloured by its average rating.

# Only movies with a budget above 5000 are plotted; filter once and reuse.
budgeted = data[data.budget > 5000]

layout = go.Layout(
    title='Profit vs. Budget',
    xaxis=dict(
        title='Budget',
        gridcolor='rgb(255, 255, 255)',
        range=[0, 4e8],
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    ),
    yaxis=dict(
        title='Profit',
        gridcolor='rgb(255, 255, 255)',
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    )
)
data2 = [go.Scatter(
    x=budgeted.budget.values,   # Budget
    y=budgeted.profit.values,   # Profit
    mode='markers',
    text=budgeted.title.values,  # Movie titles shown on hover
    marker=dict(
        size=3 * budgeted.vote_average,
        sizeref=1.0,
        color=budgeted.vote_average.values,
        # A plain dict replaces the legacy ColorBar wrapper class.
        colorbar=dict(title='Average Rating<br> &nbsp;', tickvals=[0, 1.5, 3, 5, 7, 8.5]),
        showscale=True,
        colorscale='Viridis'
    ))]
fig = go.Figure(data=data2, layout=layout)
iplot(fig)

### Network Visualization

#### Subsetting the data to get important actors

In [27]:
# Per lead actor (actor1), count the movies that have a non-null year.
df_appearance = data.groupby('actor1')['year'].count().reset_index()

# Actors billed first in more than three movies.
most_prolific = df_appearance.loc[df_appearance['year'] > 3, 'actor1'].tolist()

# Keep only movies whose three billed actors are all in that list.
all_three_prolific = (
    data.actor1.isin(most_prolific)
    & data.actor2.isin(most_prolific)
    & data.actor3.isin(most_prolific)
)
subset1 = data[all_three_prolific].reset_index(drop=True)


#### Creating Network from Subset Data

In [38]:
# For every movie in the subset, record the three actor pairings
# (1-2, 1-3, 2-3) as tuples; these become the graph edges.
pair = []
for first, second, third in zip(subset1["actor1"], subset1["actor2"], subset1["actor3"]):
    pair.extend([(first, second), (first, third), (second, third)])

In [39]:
# Undirected co-appearance graph: actors are nodes, each pair is an edge.
G = nx.Graph()
G.add_edges_from(pair)
# G.nodes() instead of the G.node attribute, which newer networkx releases no longer provide.
nodes = list(G.nodes())
# Map each actor to their degree in G.
d = dict(nx.degree(G))
# Actors sorted by ascending degree, keeping everything from position 201 on.
# NOTE(review): 201 is tied to the size of this particular graph — confirm it
# still selects the intended top actors if the data changes.
imp_actors = sorted(d, key=d.get)[201:]

In [40]:
# Lay out G with the Kamada-Kawai algorithm; `pos` is indexed by node and
# yields that node's (x, y) coordinates, which the plotting cell reads.
pos=nx.kamada_kawai_layout(G)  

In [41]:
# Draw the actor network: grey edge lines underneath, one marker per actor
# coloured by number of connections, actor name on hover.

# Node coordinates, in the same order as `nodes`.
Xv = [pos[k][0] for k in nodes]
Yv = [pos[k][1] for k in nodes]

# Edge segments as flat coordinate lists; the None after each segment keeps
# consecutive edges from being joined into one line.
Xed = []
Yed = []
for edge in pair:
    Xed += [pos[edge[0]][0], pos[edge[1]][0], None]
    Yed += [pos[edge[0]][1], pos[edge[1]][1], None]

# Plain dicts replace the legacy Line/Marker/ColorBar/Font/XAxis/YAxis/Margin/Data
# wrapper classes from the star import.
trace3 = go.Scatter(
    x=Xed,
    y=Yed,
    mode='lines',
    line=dict(color='rgb(210,210,210)', width=1),
    hoverinfo='none',
)
trace4 = go.Scatter(
    x=Xv,
    y=Yv,
    mode='markers',
    name='net',
    marker=dict(
        symbol='circle',  # 'dot' is not a valid plotly marker symbol
        size=10,
        showscale=True,
        colorscale='Viridis',
        reversescale=False,
        colorbar=dict(
            title='Number of Connections<br> &nbsp;',
            tickvals=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24],
        ),
        # Look degrees up per node so colours always line up with Xv/Yv/text.
        color=[d[k] for k in nodes],
        line=dict(color='rgb(50,50,50)', width=0.5),
    ),
    text=nodes,
    hoverinfo='text',
)

layout = go.Layout(
    title="Leading Actors and their Connections",
    font=dict(size=12),
    showlegend=False,
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    margin=dict(l=40, r=40, b=85, t=100),
)

data1 = [trace3, trace4]
fig1 = go.Figure(data=data1, layout=layout)
iplot(fig1)

### Drop-Down Menu

In [42]:
# Recomputes the selection already made in the graph-building cell: keys of `d`
# sorted by ascending degree, keeping positions 201 onwards.
# NOTE(review): presumably this leaves the 15 best-connected actors (the frame
# built below is named `top15`) — confirm against the graph size.
imp_actors = sorted(d, key=d.get)[201:]

In [44]:
# One summary tuple per important actor:
# (name, mean profit, mean vote, mean popularity, degree in the network).
# NOTE(review): only actor1/actor2 billing is matched here, although the
# network was built from actor1..actor3 — confirm this is intended.
features = []
for actor in imp_actors:
    billed = data['actor1'].eq(actor) | data['actor2'].eq(actor)
    films = data[billed]
    summary = (
        actor,
        films['profit'].mean(),
        films['vote_average'].mean(),
        films['popularity'].mean(),
        d[actor],
    )
    features.append(summary)

In [45]:
# Tabulate the per-actor summaries and label the five columns.
top15_columns = [
    "Actor",
    "Avg. Profit",
    "Avg. Vote",
    "Avg. Popularity",
    "Connections with Other Important Actors)",
]
top15 = pd.DataFrame(features)
top15.columns = top15_columns
top15

Unnamed: 0,Actor,Avg. Profit,Avg. Vote,Avg. Popularity,Connections with Other Important Actors)
0,Scarlett Johansson,29165150.0,6.4375,33.564579,13
1,Russell Crowe,76908230.0,6.6,33.467027,13
2,Cameron Diaz,120614500.0,5.994118,38.642755,14
3,Tom Cruise,236936500.0,6.646154,48.156016,14
4,Brad Pitt,135349500.0,6.813793,48.674368,14
5,Matt Damon,98843630.0,6.636364,36.726481,14
6,Meryl Streep,83846530.0,6.482609,25.091077,14
7,Christian Bale,138891700.0,6.935,55.938757,15
8,Jude Law,59987340.0,6.371429,29.928448,15
9,Kate Winslet,144835600.0,7.007143,30.104512,15


In [47]:
# Axis label for the bar charts: actor name followed by the connection count in parentheses.
connection_counts = top15['Connections with Other Important Actors)'].astype('str')
top15['Actor1'] = top15['Actor'] + " (" + connection_counts + ")"

In [48]:
# Horizontal bar chart of the per-actor averages with a dropdown that shows
# one measure at a time. All three traces are visible until a choice is made
# (active=-1 means no button is pre-selected).

# (trace name, x values, bar colour, dropdown label, title shown when selected)
# NOTE(review): profit is shown in millions and the vote is multiplied by 10,
# presumably to bring the three measures onto comparable scales — confirm.
measures = [
    ('Profit', top15["Avg. Profit"] / 1000000, 'rgb(161,215,106)',
     'Average Profit (in Millions)', 'Average Profit'),
    ('Popularity', top15["Avg. Popularity"], 'rgb(37,52,148)',
     'Average Popularity', 'Average Popularity'),
    ('Vote', top15["Avg. Vote"] * 10, 'rgb(65,182,196)',
     'Average Vote', 'Average Vote'),
]

# The traces live in `bar_traces` rather than `data`, so the movie DataFrame
# named `data` is not overwritten and earlier cells stay re-runnable.
bar_traces = [
    go.Bar(
        y=top15["Actor1"],
        x=values,
        orientation='h',
        name=trace_name,
        marker=dict(color=colour),
    )
    for trace_name, values, colour, _, _ in measures
]
trace1, trace2, trace3 = bar_traces

# Button i shows only trace i and swaps the figure title.
buttons = [
    dict(
        label=menu_label,
        method='update',
        args=[{'visible': [j == i for j in range(len(measures))]},
              {'title': selected_title}],
    )
    for i, (_, _, _, menu_label, selected_title) in enumerate(measures)
]

updatemenus = [dict(active=-1, x=-0.3, buttons=buttons)]

layout = dict(title='Average Measures for Important Actors (Select from Dropdown)', showlegend=False,
              updatemenus=updatemenus)

fig = dict(data=bar_traces, layout=layout)

iplot(fig)

## Geo-Visualizations

### Processing Data for GeoVisualizations

In [2]:
# Build the table for the maps: 2017 happiness rows enriched with GDP and
# country code, the 2015 happiness score, and latitude/longitude.
df = pd.read_csv('2017.csv')
codes = pd.read_csv("2014_world_gdp_with_codes.csv")

codes.columns = ["Country","Gdp","Code"]
df = df.merge(codes, how="left", on=["Country"])

# Join the 2015 score by country name. Assigning the column directly aligns on
# row index, which pairs each 2017 row with whichever country sits on the same
# row of 2015.csv.
# NOTE(review): assumes 2015.csv has a `Country` column spelled like the one
# in 2017.csv — confirm.
scores_2015 = (
    pd.read_csv("2015.csv")[["Country", "Happiness Score"]]
    .rename(columns={"Happiness Score": "happiness score 2015"})
)
df = df.merge(scores_2015, how="left", on=["Country"])

lat = pd.read_csv("lat.csv")
lat.columns = ["code","latitude","longitude","Country"]
df = df.merge(lat[["latitude","longitude","Country"]], on=["Country"], how="left")

### Choropleths of Happiness Score by Country

In [4]:
# World choropleth of the 2017 happiness score, one region per country code.

# Colour stops written as (happiness score, colour). Plotly expects colorscale
# positions normalized to 0..1, so the scores are rescaled between the first
# and last stop; the z range itself is still taken from the data.
score_stops = [
    (2.7, "rgb(5, 10, 172)"),
    (3.6, "rgb(40, 60, 190)"),
    (4.5, "rgb(70, 100, 245)"),
    (5.4, "rgb(90, 120, 245)"),
    (6.3, "rgb(106, 137, 247)"),
    (7.2, "rgb(220, 220, 220)"),
]
lowest, highest = score_stops[0][0], score_stops[-1][0]
happiness_colorscale = [
    [(score - lowest) / (highest - lowest), colour] for score, colour in score_stops
]

# Named `choropleth_traces` rather than `data` so the name is not reused for a
# different kind of object.
choropleth_traces = [dict(
    type='choropleth',
    locations=df['Code'],
    z=df['Happiness.Score'],
    text=df['Country'],
    colorscale=happiness_colorscale,
    autocolorscale=False,
    reversescale=True,
    marker=dict(
        line=dict(
            color='rgb(180,180,180)',
            width=0.5)),
    colorbar=dict(
        autotick=False,
        title='Happiness Score'),
)]

layout = dict(
    title='Happiness Score',
    geo=dict(
        showframe=False,
        showcoastlines=False,
        # Projection type names are lowercase in plotly.
        projection=dict(type='mercator')
    )
)

fig = dict(data=choropleth_traces, layout=layout)
iplot(fig, validate=False)

### ScatterGeo Plots/Symbol Maps of Relationship between GDP & Happiness Rank

In [5]:
# Symbol map: one marker per country at its latitude/longitude, marker area
# proportional to GDP, one legend entry (and colour) per group of rows.
df['text'] = df['Country'] + '<br>GDP ' + (df['Gdp']).astype(str)+' billion'

# Inclusive (first, last) row positions of each group.
# NOTE(review): the groups are described as happiness ranks, which assumes df
# is still ordered by rank with one row per country — confirm.
limits = [(0,30),(31,60),(61,90),(91,120),(121,160)]
colors = ["blue","green","yellow","rgb(255,65,54)","rgb(133,20,75)"]
countries = []
scale = 10

for (first, last), colour in zip(limits, colors):
    # +1 because the bounds are inclusive; a plain first:last slice would skip
    # the rows at positions 30, 60, 90 and 120.
    df_sub = df.iloc[first:last + 1]
    city = dict(
        type='scattergeo',
        lon=df_sub['longitude'],
        lat=df_sub['latitude'],
        text=df_sub['text'],
        marker=dict(
            size=df_sub['Gdp']/scale,
            color=colour,
            line=dict(width=0.5, color='rgb(40,40,40)'),
            sizemode='area'
        ),
        name='{0} - {1}'.format(first, last))
    countries.append(city)

layout = dict(
        title='2017 World Happiness Rank (Size Proportional to GDP) <br>(Click legend to toggle traces)',
        showlegend=True,
        geo=dict(
            # Projection type names are lowercase in plotly.
            projection=dict(type='mercator'),
            showland=True,
            landcolor='rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"
        ),
    )

fig = dict(data=countries, layout=layout)
iplot(fig, validate=False)