In [1]:
import numpy as np
import pandas as pd
import plotly

from preprocessing import *

# Load the scraped movie records through the project's `preprocessing` helper.
# NOTE(review): presumably a dict keyed by IMDb id with one record per movie -- confirm.
toy_dict = load_obj("pop_movies")

# Transpose so that each key of `toy_dict` becomes one row.
df1 = pd.DataFrame(toy_dict).T

# rename columns
df1.columns = ['title', 'year', 'content_rating', 'length', 'genres', 'score', 'metascore',
               'vote_numbers', 'gross', 'director', 'actors', 'genre']

# drop columns that the rest of the notebook does not use
df1 = df1.drop(columns=['genres', 'metascore', 'actors'])

# Drop movies with any missing field; empty strings count as missing.
df1 = df1.replace('', np.nan).dropna(axis=0, how='any')

# Drop duplicated movies.
# The previous `df1.title.drop_duplicates(inplace=True)` only touched a column
# Series and never removed rows from the frame. Deduplicate the frame itself,
# after the NaN filter so a complete record is never discarded in favour of an
# incomplete one. `year` is part of the key so that remakes sharing a title
# are kept as separate movies.
df1 = df1.drop_duplicates(subset=['title', 'year'])

In [2]:
# delete min in length col (drops the last three characters of every value)
# The `.str` slice replaces `df1['length'][i]` with an integer `i`: the index
# holds string ids, so that lookup relied on pandas' positional fallback.
df1['length'] = df1['length'].str[:-3]
# delete '$' and 'M' in gross col (first and last character of every value)
df1['gross'] = df1['gross'].str[1:-1]
# delete non-integer in year col (keep only the digit characters)
df1['year'] = df1['year'].map(lambda raw: ''.join(filter(str.isdigit, raw)))

# Let pandas pick nullable dtypes first, then pin the numeric columns explicitly.
# NOTE(review): `vote_numbers` is not listed here and therefore stays a string
# column; confirm whether it should be converted to an integer as well.
df1 = df1.convert_dtypes()
df1 = df1.astype({'length': 'int64', 'gross': 'float', 'score': 'float', 'year': 'int32'})
df1.dtypes

title              string
year                int32
content_rating     string
length              int64
score             float64
vote_numbers       string
gross             float64
director           string
genre              string
dtype: object

In [3]:
# Summary statistics for the numeric columns of the cleaned frame.
df1.describe()

Unnamed: 0,year,length,score,gross
count,9502.0,9502.0,9502.0,9502.0
mean,1998.608503,107.635866,6.483109,30.544057
std,17.340524,20.90272,0.992997,58.335459
min,1914.0,9.0,1.4,0.0
25%,1990.0,94.0,5.9,0.77
50%,2003.0,104.0,6.6,8.725
75%,2012.0,117.0,7.2,35.1875
max,2020.0,566.0,9.3,936.66


In [6]:
# Preview the first rows of the cleaned frame.
df1.head()

Unnamed: 0,title,year,content_rating,length,score,vote_numbers,gross,director,genre
tt6723592,Tenet,2020,PG-13,150,7.7,158304,53.8,Christopher Nolan,action
tt4633694,Spider-Man: Into the Spider-Verse,2018,PG,117,8.4,357781,190.24,"Bob Persichetti, Peter Ramsey, Rodney Rothman",action
tt4154796,Avengers: Endgame,2019,PG-13,181,8.4,783537,858.37,"Anthony Russo, Joe Russo",action
tt1477834,Aquaman,2018,PG-13,143,6.9,361542,335.06,James Wan,action
tt2527338,Star Wars: Episode IX - The Rise of Skywalker,2019,PG-13,141,6.6,359823,515.2,J.J. Abrams,action


In [10]:
# Distinct release years present in the cleaned frame (unsorted).
df1['year'].unique()

array([2020, 2018, 2019, 2016, 2001, 1988, 2008, 2010, 2000, 2017, 1999,
       1965, 2006, 1984, 2012, 1977, 2014, 2003, 2009, 2005, 2007, 1997,
       2015, 1981, 1994, 1990, 2002, 1989, 1993, 1986, 2011, 2013, 1996,
       2004, 1982, 1987, 1967, 1980, 1991, 1995, 1983, 1979, 1992, 1998,
       1963, 1971, 1962, 1973, 1969, 1964, 1985, 1954, 1974, 1978, 1960,
       1976, 1959, 1968, 1972, 1956, 1970, 1961, 1927, 1966, 1975, 1938,
       1953, 1926, 1958, 1940, 1924, 1932, 1939, 1957, 1955, 1933, 1948,
       1934, 1951, 1925, 1943, 1942, 1952, 1937, 1950, 1941, 1946, 1928,
       1923, 1944, 1945, 1931, 1947, 1936, 1921, 1949, 1922, 1915, 1916,
       1935, 1929, 1917, 1930, 1914])

In [4]:
# Keep only the movies released in 2015 or later, then preview the result.
df2 = df1.loc[df1['year'].ge(2015)]
df2.head(10)

Unnamed: 0,title,year,content_rating,length,score,vote_numbers,gross,director,genre
tt6723592,Tenet,2020,PG-13,150,7.7,158304,53.8,Christopher Nolan,action
tt4633694,Spider-Man: Into the Spider-Verse,2018,PG,117,8.4,357781,190.24,"Bob Persichetti, Peter Ramsey, Rodney Rothman",action
tt4154796,Avengers: Endgame,2019,PG-13,181,8.4,783537,858.37,"Anthony Russo, Joe Russo",action
tt1477834,Aquaman,2018,PG-13,143,6.9,361542,335.06,James Wan,action
tt2527338,Star Wars: Episode IX - The Rise of Skywalker,2019,PG-13,141,6.6,359823,515.2,J.J. Abrams,action
tt1571234,Mortal Engines,2018,PG-13,128,6.1,104767,15.95,Christian Rivers,action
tt1431045,Deadpool,2016,R,108,8.0,892619,363.07,Tim Miller,action
tt5463162,Deadpool 2,2018,R,119,7.7,470113,324.59,David Leitch,action
tt4154756,Avengers: Infinity War,2018,PG-13,149,8.4,815703,678.82,"Anthony Russo, Joe Russo",action
tt7713068,Birds of Prey: And the Fantabulous Emancipatio...,2020,R,109,6.1,154955,84.16,Cathy Yan,action


In [7]:
# Rows of the cleaned frame whose genre label is exactly 'action'.
df1.loc[df1['genre'].eq('action')]

Unnamed: 0,title,year,content_rating,length,score,vote_numbers,gross,director,genre
tt6723592,Tenet,2020,PG-13,150,7.7,158304,53.80,Christopher Nolan,action
tt4633694,Spider-Man: Into the Spider-Verse,2018,PG,117,8.4,357781,190.24,"Bob Persichetti, Peter Ramsey, Rodney Rothman",action
tt4154796,Avengers: Endgame,2019,PG-13,181,8.4,783537,858.37,"Anthony Russo, Joe Russo",action
tt1477834,Aquaman,2018,PG-13,143,6.9,361542,335.06,James Wan,action
tt2527338,Star Wars: Episode IX - The Rise of Skywalker,2019,PG-13,141,6.6,359823,515.20,J.J. Abrams,action
...,...,...,...,...,...,...,...,...,...
tt4717402,MFKZ,2017,R,94,6.7,3342,0.23,"Shôjirô Nishimi, Guillaume Renard",action
tt0490181,Mutant Chronicles,2008,R,111,5.2,25542,0.01,Simon Hunter,action
tt1620933,Paan Singh Tomar,2012,Not Rated,135,8.2,32865,0.04,Tigmanshu Dhulia,action
tt6836936,Saaho,2019,Not Rated,170,5.2,15307,2.63,Sujeeth,action


In [10]:
# Distinct genre labels present in the cleaned frame.
df1['genre'].unique()

<StringArray>
[     'action',   'adventure',   'animation',   'biography',      'comedy',
       'crime', 'documentary',       'drama',      'family',     'fantasy',
   'film-noir',     'history',      'horror',       'music',     'musical',
     'mystery',     'romance',      'sci-fi',       'short',       'sport',
    'thriller',         'war',     'western']
Length: 23, dtype: string

In [None]:
# app.py
# -*- coding: utf-8 -*-

# !pip install dash

import dash
import dash_table
import dash_core_components as dcc
import dash_html_components as html
import plotly.express as px
import plotly.graph_objects as go
from dash.dependencies import Input, Output


# Stylesheet applied to the page; the commented-out sheet is kept as an alternative.
#external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
external_stylesheets = ['https://unpkg.com/wingcss']

app = dash.Dash(__name__, external_stylesheets=external_stylesheets)


# Movies released in 2015 or later; this is the frame the year callback filters.
# (A bare `df2.head(10)` used to follow here. In the middle of a cell its value
# is discarded, so it has been removed.)
df2 = df1[df1['year'] >= 2015]

# Genres that get their own tab in the length-vs-score section, in display order.
TAB_GENRES = ('action', 'adventure', 'animation', 'biography', 'comedy', 'crime',
              'drama', 'family', 'fantasy', 'horror', 'music', 'romance')


def _genre_tab(genre):
    """Build the tab that shows the length-vs-score scatter for one genre.

    Args:
        genre: Lower-case genre label as stored in ``df1['genre']``.

    Returns:
        A ``dcc.Tab`` labelled with the capitalised genre. It wraps a single
        ``dcc.Graph`` with box/violin marginals and an OLS trendline, built
        from the rows of ``df1`` that carry this genre.
    """
    return dcc.Tab(label=genre.capitalize(), children=[
        dcc.Graph(
            figure=px.scatter(df1[df1['genre'] == genre], x="length", y="score",
                              marginal_y="violin", marginal_x="box",
                              trendline="ols", template="simple_white")
        )
    ])


# App layout
app.layout = html.Div([

    html.H1("Movies", style={'text-align': 'center'}),

    # Left-hand panel: year picker, status line and the score-vs-votes chart.
    html.Div([
    html.Div(children='''
        Graph showing relationship between score and vote numbers of different genres of movies.
    '''),

    dcc.Dropdown(id="slct_year",
                 # One entry per year from 2015 to 2020 inclusive.
                 options=[{"label": str(year), "value": year} for year in range(2015, 2021)],
                 multi=False,
                 value=2015,
                 style={'width': "50%"}
                 ),
    html.Br(),
    html.Div(id='output_container', children=[]),
    html.Br(),

    dcc.Graph(id='movie_scatter', figure={})],style={'width': '75%', 'display': 'inline-block'}),

    # Right-hand panel, rendered beside the first block.
    # NOTE(review): no callback visible in this cell reads `input1` or writes
    # `output`, so the search box appears to be unwired -- confirm.
    html.Div(
    [
        html.I("Search a movie to view its score trend:"),
        html.Br(),
        dcc.Input(id="input1", type="text", placeholder="Your movie title..."),
        html.Div(id="output"),
    ],style={'width': '20%', 'display': 'inline-block', 'margin':'auto'}),

    html.Div(html.H2("Movie info per genre", style={'text-align': 'center'})),

    html.Div([
        # The row count is computed from the frame so the text cannot go stale.
        dcc.Markdown(f'''
        This table shows {len(df1)} trending movies from all genre.
        
        Use the filter box in each column to filter data.
        
        Genres include:
        *action, adventure, animation, biography, comedy, crime, 
        documentary, drama, family, fantasy, film-noir, history, horror, music, musical, 
        mystery, romance, sci-fi, short, sport, thriller, war, western*
        '''),
        dash_table.DataTable(
            id='datatable-interactivity',
            columns=[
                {"name": i, "id": i, "deletable": True, "selectable": True}
                for i in df1.columns
            ],

            data=df1.to_dict('records'),  # the contents of the table
            filter_action="native",     # allow filtering of data by user ('native') or not ('none')
            sort_action="native",       # enables data to be sorted per-column by user or not ('none')
            sort_mode="single",         # sort across 'multi' or 'single' columns
            row_deletable=True,         # choose if user can delete a row (True) or not (False)
            page_action="native",       # all data is passed to the table up-front or not ('none')
            page_current=0,             # page number that user is on
            page_size=8,                # number of rows visible per page
            style_cell={                # ensure adequate header width when text is shorter than cell's text
                'textAlign': 'left', 'minWidth': 95, 'maxWidth': 95, 'width': 95
            },
            style_data={                # overflow cells' content into multiple lines
                'whiteSpace': 'normal',
                'height': 'auto',
                'lineHeight': '15px'
            }
        ),
        html.Br()]),
    html.Hr(),

    # One tab per genre, each with its own length-vs-score scatter.
    html.Div([
        html.Br(),
        dcc.Tabs([_genre_tab(genre) for genre in TAB_GENRES])
    ])
])


# ------------------------------------------------------------------------------
# Connect the Plotly graphs with Dash Components
@app.callback(
    [Output(component_id='output_container', component_property='children'),
     Output(component_id='movie_scatter', component_property='figure')],
    [Input(component_id='slct_year', component_property='value')]
)
def update_graph(option_slctd):
    """Refresh the status line and the bubble chart for the selected year.

    Args:
        option_slctd: Value of the ``slct_year`` dropdown, or ``None`` when
            the user has cleared the selection.

    Returns:
        A ``(container, fig)`` pair: the "Year of ..." text for
        ``output_container`` and a scatter of ``score`` against
        ``vote_numbers`` (log-scaled x axis), sized by ``gross`` and
        coloured by ``genre``, for the rows of ``df2`` in that year.

    Raises:
        dash.exceptions.PreventUpdate: When no year is selected. Both outputs
            then keep their current content instead of showing
            "Year of None" and an empty chart.
    """
    if option_slctd is None:
        raise dash.exceptions.PreventUpdate

    container = "Year of {}".format(option_slctd)

    # Boolean filtering already yields a new frame, so no explicit copy is needed.
    dff = df2[df2["year"] == option_slctd]

    # Plotly Express
    fig = px.scatter(dff, x="vote_numbers", y="score",
                     size="gross", color="genre", hover_name="title",
                     log_x=True, size_max=60)

    return container, fig

# Start the Dash server when this code runs as the main module.
# The call keeps running until the server is interrupted.
if __name__ == '__main__':
    app.run_server()

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [03/Dec/2020 02:45:22] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Dec/2020 02:45:22] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Dec/2020 02:45:22] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Dec/2020 02:45:22] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Dec/2020 02:46:35] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Dec/2020 02:46:37] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Dec/2020 02:47:53] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Dec/2020 02:47:55] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Dec/2020 03:22:03] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Dec/2020 03:22:04] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Dec/2020 03:22:05] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -
127.0.

In [9]:
# Write the cleaned frame to `pop_movies.json` (path is relative to the working directory).
df1.to_json(r'pop_movies.json')