In [106]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import plotly.express as px
import warnings
#!pip install folium
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Movie dataset published in the project's GitHub repository (raw CSV).
url = "https://raw.githubusercontent.com/adibmenchali/DataVizProject/master/movies.csv"

# Load the CSV straight from the URL and preview the first rows.
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)

Unnamed: 0,Title,Genre,Year,Rating,Director,Cast,Duration,Language,Country of Origin,Revenue,Budget
0,Avatar: The Way of Water,Science Fiction,2022.0,7.752,James Cameron,"Sam Worthington, Zoe Saldaña, Sigourney Weaver...",192.0,en,United States of America,2310416000.0,460000000.0
1,Creed III,Drama,2023.0,7.29,Michael B. Jordan,"Michael B. Jordan, Tessa Thompson, Jonathan Ma...",116.0,en,United States of America,258000000.0,75000000.0
2,Winnie the Pooh: Blood and Honey,Horror,2023.0,5.913,Rhys Frake-Waterfield,"Craig David Dowsett, Chris Cordell, Amber Doig...",84.0,en,United Kingdom,3200000.0,100000.0
3,Mummies,Animation,2023.0,7.167,Juan Jesús García Galocha,"Óscar Barberán, Ana Esther Alborg, Luis Pérez ...",88.0,es,Spain,34200000.0,12300000.0
4,John Wick: Chapter 4,Action,2023.0,8.036,Chad Stahelski,"Keanu Reeves, Donnie Yen, Bill Skarsgård, Ian ...",169.0,en,United States of America,244878300.0,90000000.0


In [107]:
# Number of distinct genre labels (a missing value, if any, counts as one label).
df["Genre"].nunique(dropna=False)

20

## Average profit per Genre

In [108]:
# Keep only movies whose revenue AND budget are known; profit is undefined otherwise.
filtered_df = df.dropna(subset=['Revenue', 'Budget'])

# Average profit (revenue - budget) per genre, highest first.
# The series is named 'Profit' so the result column (and the plot axis) is not labelled "0".
benefit_by_genre = (
    (filtered_df['Revenue'] - filtered_df['Budget'])
    .groupby(filtered_df['Genre'])
    .mean()
    .rename('Profit')
    .reset_index()
    .sort_values('Profit', ascending=False)
)

# Create the plot
fig = px.line(benefit_by_genre, x='Genre', y='Profit', title='Average profit per Genre')
fig.show()


## Most profitable directors

In [109]:
# Keep only movies with both revenue and budget, and derive the profit of each movie.
# `.assign` builds a new frame, so no column is written onto a slice of `df`
# (the original in-place assignment triggered pandas' SettingWithCopyWarning).
filtered_df = (
    df[df['Revenue'].notna() & df['Budget'].notna()]
    .assign(Profit=lambda movies: movies['Revenue'] - movies['Budget'])
)

# Total profit per director, keeping the ten most profitable ones.
profit_by_director = (
    filtered_df.groupby('Director')['Profit']
    .sum()
    .reset_index()
    .sort_values('Profit', ascending=False)
    .head(10)
)

# Create the plot
fig = px.bar(profit_by_director, x='Director', y='Profit', title='Total Profit by Director (Top 10)')

fig.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [110]:
# Rename the country column, drop the financial columns (no longer needed after the
# profit charts) and discard rows with any missing value.
# Written as one chain without `inplace=True`; `errors="ignore"` lets the cell be
# re-run after Revenue/Budget have already been removed instead of raising KeyError.
df = (
    df.rename(columns={'Country of Origin': 'country'})
    .drop(columns=["Revenue", "Budget"], errors="ignore")
    .dropna()
)
print(f"Null values:\n{df.isnull().sum()}\n")
print(f"Duplicate values:\n{df.duplicated().sum()}")
df.head()

Null values:
Title       0
Genre       0
Year        0
Rating      0
Director    0
Cast        0
Duration    0
Language    0
country     0
dtype: int64

Duplicate values:
0


Unnamed: 0,Title,Genre,Year,Rating,Director,Cast,Duration,Language,country
0,Avatar: The Way of Water,Science Fiction,2022.0,7.752,James Cameron,"Sam Worthington, Zoe Saldaña, Sigourney Weaver...",192.0,en,United States of America
1,Creed III,Drama,2023.0,7.29,Michael B. Jordan,"Michael B. Jordan, Tessa Thompson, Jonathan Ma...",116.0,en,United States of America
2,Winnie the Pooh: Blood and Honey,Horror,2023.0,5.913,Rhys Frake-Waterfield,"Craig David Dowsett, Chris Cordell, Amber Doig...",84.0,en,United Kingdom
3,Mummies,Animation,2023.0,7.167,Juan Jesús García Galocha,"Óscar Barberán, Ana Esther Alborg, Luis Pérez ...",88.0,es,Spain
4,John Wick: Chapter 4,Action,2023.0,8.036,Chad Stahelski,"Keanu Reeves, Donnie Yen, Bill Skarsgård, Ian ...",169.0,en,United States of America


In [111]:
# The ten directors with the most movies in the dataset.
top_directors = df.groupby('Director').size().reset_index(name='counts').sort_values('counts', ascending=False).head(10)

# All movies of those directors, restricted to the columns used by the treemaps.
# `.copy()` makes `result` an independent frame: later cells add columns to it, which
# would otherwise be a write onto a slice of `df`.
result = df.loc[
    df['Director'].isin(top_directors['Director']),
    ['Director', 'Title', 'Cast', 'Genre', 'country', 'Year', 'Rating', 'Duration'],
].copy()

#Interactive Visualizations

In [112]:
from geopy.geocoders import Nominatim

# Three highest-rated movies of every country.
# The columns are selected with a list: selecting with a bare tuple of keys is the
# deprecated form that produced the warning in this cell's output.
top_movies = (
    df.groupby('country')[['country', 'Rating', 'Genre', 'Title', 'Director', 'Year']]
    .apply(lambda x: x.nlargest(3, columns=['Rating']))
    .reset_index(drop=True)
)
# The world-map frame used later is merged on a column called 'name'.
top_movies = top_movies.rename(columns={'country': 'name'})
# Create a geolocator object
geolocator = Nominatim(user_agent='my_app')

# Define a function to get the latitude and longitude for a given location
def get_lat_long(location):
    """Return the coordinates of a place name using the notebook's ``geolocator``.

    Parameters
    ----------
    location : str
        Place name to look up (here: a country name).

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the first match, or ``(None, None)`` when the
        lookup fails or finds nothing.
    """
    try:
        # Use the geolocator to get the latitude and longitude
        match = geolocator.geocode(location)
    except Exception:
        # Best-effort lookup: a failed request must not abort the whole column.
        # `Exception` (not a bare `except:`) keeps Ctrl-C / kernel interrupt working.
        return None, None
    if match is None:
        # The geocoder found no result for this name.
        return None, None
    return match.latitude, match.longitude

# Geocode every distinct country once (each country appears on up to three rows),
# then broadcast the coordinates back onto the rows. This cuts the number of remote
# lookups and also works when `top_movies` is empty.
coords_by_name = {name: get_lat_long(name) for name in top_movies['name'].unique()}
top_movies['latitude'] = top_movies['name'].map(lambda name: coords_by_name.get(name, (None, None))[0])
top_movies['longitude'] = top_movies['name'].map(lambda name: coords_by_name.get(name, (None, None))[1])


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



In [113]:
!pip install geopandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


##Highest rated movies by Country

In [114]:
import plotly.express as px
from geopy.geocoders import Nominatim
import geopandas as gpd
import folium
from folium.plugins import MarkerCluster, Search, MiniMap, MeasureControl

# Country polygons bundled with geopandas (low-resolution Natural Earth data).
world_map = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Attach the top-rated movies to their country's polygon (inner join on 'name').
merged_data = world_map.merge(top_movies, on='name')

# One (initially empty) folium FeatureGroup per genre present in the merged data.
genres = {
    genre_name: folium.FeatureGroup(name=genre_name)
    for genre_name in merged_data['Genre'].unique()
}

# Fill colour per genre. Every value is a valid CSS colour name, because the result is
# used as a Leaflet `fillColor` ('darkpurple' is a folium icon colour, not a CSS name).
_GENRE_COLORS = {
    'Action': 'red',
    'Adventure': 'orange',
    'Animation': 'pink',
    'Comedy': 'green',
    'Crime': 'purple',
    'Documentary': 'brown',
    'Drama': 'blue',
    'Family': 'lightblue',
    'Fantasy': 'indigo',
    'History': 'beige',
    'Horror': 'darkred',
    'Music': 'lightgreen',
    'Mystery': 'darkblue',
    'Romance': 'magenta',  # was 'pink', indistinguishable from Animation
    'Science Fiction': 'darkgreen',
    'Thriller': 'black',
    'War': 'gray'
}


def get_color(feature):
    """Return the fill colour for a GeoJSON feature based on its genre.

    Parameters
    ----------
    feature : dict
        GeoJSON feature whose ``feature['properties']['Genre']`` holds the genre name.

    Returns
    -------
    str
        CSS colour name for the genre; ``'gray'`` for genres not in the table.
    """
    genre = feature['properties']['Genre']
    return _GENRE_COLORS.get(genre, 'gray')

# Create a map object using Folium
m = folium.Map(location=[37.0902, -95.7129], zoom_start=2, max_bounds=True)

# Build one GeoJson layer per genre and place it INSIDE that genre's FeatureGroup.
# Previously the (empty) FeatureGroups and the GeoJson layers were both added directly
# to the map, so every genre was listed twice in the layer control and one of the two
# entries toggled nothing.
genre_layers = {}
for genre in merged_data['Genre'].unique():
    genre_layers[genre] = folium.GeoJson(
        merged_data[merged_data['Genre'] == genre],
        name=genre,
        style_function=lambda feature: {
            'fillColor': get_color(feature),
            'color': 'black',
            'weight': 2,
            'dashArray': '5, 2'
        },
        popup=folium.features.GeoJsonPopup(
            fields=['Title', 'Director', 'Rating'],
            aliases=['Movie', 'Director', 'Rating'],
            localize=True,
            labels=True,
            style="background-color: yellow;"
        )
    )
    genre_layers[genre].add_to(genres[genre])

# Add the populated FeatureGroups to the map: one toggle per genre.
for group in genres.values():
    group.add_to(m)

# Add LayerControl with checkboxes for each genre
folium.LayerControl(collapsed=False, overlay=False).add_to(m)

m
#m.save('map.html')



The geopandas.dataset module is deprecated and will be removed in GeoPandas 1.0. You can get the original 'naturalearth_lowres' data from https://www.naturalearthdata.com/downloads/110m-cultural-vectors/.



In [115]:
# Split the comma-separated Cast string into a list of actor names.
# Each name is stripped: a plain split on ',' leaves a leading space on every name
# after the first, so the same actor could appear as two different labels.
result['Actors'] = [
    [name.strip() for name in cast.split(',')]
    for cast in result['Cast']
]

# First and second billed actor (Actor2 is missing when the cast has a single name).
result['Actor1'] = result['Actors'].str[0]
result['Actor2'] = result['Actors'].str[1]

result.head()

Unnamed: 0,Director,Title,Cast,Genre,country,Year,Rating,Duration,Actors,Actor1,Actor2
0,James Cameron,Avatar: The Way of Water,"Sam Worthington, Zoe Saldaña, Sigourney Weaver...",Science Fiction,United States of America,2022.0,7.752,192.0,"[Sam Worthington, Zoe Saldaña, Sigourney Wea...",Sam Worthington,Zoe Saldaña
53,James Cameron,Avatar,"Sam Worthington, Zoe Saldaña, Sigourney Weaver...",Action,United States of America,2009.0,7.6,162.0,"[Sam Worthington, Zoe Saldaña, Sigourney Wea...",Sam Worthington,Zoe Saldaña
88,Sam Raimi,Doctor Strange in the Multiverse of Madness,"Benedict Cumberbatch, Elizabeth Olsen, Chiwete...",Fantasy,United States of America,2022.0,7.392,126.0,"[Benedict Cumberbatch, Elizabeth Olsen, Chiw...",Benedict Cumberbatch,Elizabeth Olsen
111,Anthony Russo,Avengers: Infinity War,"Robert Downey Jr., Chris Hemsworth, Mark Ruffa...",Adventure,United States of America,2018.0,8.263,149.0,"[Robert Downey Jr., Chris Hemsworth, Mark Ru...",Robert Downey Jr.,Chris Hemsworth
159,David Yates,Harry Potter and the Half-Blood Prince,"Daniel Radcliffe, Rupert Grint, Emma Watson, M...",Adventure,United Kingdom,2009.0,7.698,153.0,"[Daniel Radcliffe, Rupert Grint, Emma Watson...",Daniel Radcliffe,Rupert Grint


## Treemap of top directors and their movies

In [116]:
import plotly.express as px

# Treemap: Movies -> Genre -> Director -> Title, tile size proportional to the rating.
fig = px.treemap(result, path=[px.Constant("Movies"),'Genre', 'Director','Title'], values='Rating',
                  color='Genre', hover_data=['Year'],
                  color_continuous_scale='RdBu',
                  color_continuous_midpoint=np.average(result['Rating'], weights=result['Duration']),
                  custom_data=[result['Title'], result['Cast']])

fig.update_traces(textfont_size=16)

fig.show()

# Export the figure as an HTML fragment. The encoding is explicit because titles and
# cast names contain non-ASCII characters and the platform default may not be UTF-8.
fig_html = fig.to_html(full_html=False)
with open("movies_plot.html", "w", encoding="utf-8") as f:
    f.write(fig_html)

## Treemap of top directors and the actors they work with

In [117]:
import plotly.express as px

# Treemap: Actors -> Director -> first billed actor -> second billed actor,
# tile size and colour both driven by the movie rating.
fig = px.treemap(result, path=[px.Constant("Actors"),'Director','Actor1', 'Actor2'], values='Rating',
                  color='Rating', hover_data=['Year'],
                  color_continuous_scale='RdBu',
                  color_continuous_midpoint=np.average(result['Rating'], weights=result['Duration']))

fig.update_traces(textfont_size=16)

fig.show()

# Export the figure as an HTML fragment. The encoding is explicit because actor names
# contain non-ASCII characters and the platform default may not be UTF-8.
fig_html = fig.to_html(full_html=False)
with open("actors_plot.html", "w", encoding="utf-8") as f:
    f.write(fig_html)

In [118]:
# Label each movie "Short" (40 minutes or less) or "Normal" (anything else).
df["Type"] = np.where(df["Duration"] <= 40, "Short", "Normal")

##Top 10 Directors by number of movies

In [119]:
import plotly.express as px
import pandas as pd

# Movie counts per genre and the ten directors with the most movies.
genres = df['Genre'].value_counts()
directors = df['Director'].value_counts().head(10)

# Rows belonging to a top-10 director (and to a genre present in `genres`).
by_top_director = df['Director'].isin(directors.index)
by_known_genre = df['Genre'].isin(genres.index)
df_top = df[by_top_director & by_known_genre]

# Director x Genre table of movie counts.
grouped = df_top.groupby(['Director', 'Genre']).size().unstack()

# Order the directors by their total number of movies, largest first.
director_order = grouped.sum(axis=1).sort_values(ascending=False).index
grouped = grouped.loc[director_order]

# Stacked bar chart, one colour per genre.
colors = px.colors.qualitative.Pastel
fig = px.bar(grouped, barmode='stack', color_discrete_sequence=colors)
fig.update_layout(
    title='Top 10 Directors by Number of Movies and Genre',
    xaxis_title='Director',
    yaxis_title='Movies Directed',
    legend_title='Genre',
)

# Save the figure as an HTML file
#fig.write_html('top_10_directors.html')


##Top 10 Directors by average rating

In [120]:
# Only directors with at least two movies are considered.
df_filtered = df.groupby('Director').filter(lambda movies: len(movies) > 1)

# Mean rating per (director, genre) pair, best pairs first.
directors = (
    df_filtered.groupby(['Director', 'Genre'])['Rating']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

# For each director keep the first (i.e. best-rated) genre row, then take the ten
# directors with the highest such rating.
top_directors = (
    directors.groupby('Director')
    .first()
    .sort_values('Rating', ascending=False)
    .head(10)
)

# Bar chart coloured by the genre of the retained row.
fig = px.bar(
    top_directors,
    x=top_directors.index,
    y='Rating',
    color='Genre',
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

# Tilt the director names so they do not overlap.
fig.update_xaxes(tickangle=45)

# Title, axis labels and legend title (single layout call).
fig.update_layout(
    title='Top 10 Directors by Average Rating',
    xaxis_title='Director',
    yaxis_title='Average Rating',
    legend_title='Genre',
)

# Show the plot
fig.show()
# Save the figure as an HTML file
#fig.write_html('top_rated_directors.html')

## Average Rating by Genre

In [121]:
# Qualitative palette with enough distinct colours for every genre.
colors = px.colors.qualitative.Alphabet

# Per genre: mean rating ('mean') and number of movies ('count').
genre_ratings = (
    df.groupby('Genre')['Rating']
    .agg(['mean', 'count'])
    .reset_index()
)

# Bubble chart: x = number of movies, y = mean rating, bubble size = number of movies.
fig = px.scatter(
    genre_ratings,
    x='count',
    y='mean',
    color='Genre',
    size='count',
    hover_data=['Genre'],
    color_discrete_sequence=colors,
)

# Titles and a plain white background.
fig.update_layout(
    title='Average Rating by Genre',
    xaxis_title='Number of Movies',
    yaxis_title='Average Rating',
    plot_bgcolor='white',
)

# Show the plot
fig.show()

##Average Duration by Genre

In [122]:
import plotly.express as px

# Mean of every numeric column per genre, longest average duration first.
df_grouped = (
    df.groupby(["Genre"])
    .mean(numeric_only=True)
    .reset_index()
    .sort_values(by="Duration", ascending=False)
)

# Bar chart of the average duration of each genre.
fig = px.bar(
    df_grouped,
    x="Genre",
    y="Duration",
    title="Average Duration by Genre",
)
fig.show()


##Average Rating by Country of Origin

In [123]:
# Mean of every numeric column per country, best average rating first.
df_grouped = (
    df.groupby(["country"])
    .mean(numeric_only=True)
    .reset_index()
    .sort_values(by="Rating", ascending=False)
)

# Bar chart of the average rating of each country of origin.
fig = px.bar(
    df_grouped,
    x="country",
    y="Rating",
    title="Average Rating by Country of Origin",
)
fig.show()

##Average rating based on duration

In [124]:
# Duration buckets in minutes. The last bucket is open-ended: with the former upper
# edge of 240, movies longer than four hours fell outside every bin and were dropped,
# even though the label reads "More than 3 hours".
bins = [0, 60, 120, 180, np.inf]
labels = ["Less than an hour", "Between 1 and 2 hours", "Between 2 and 3 hours", "More than 3 hours"]
df["duration_group"] = pd.cut(df["Duration"], bins=bins, labels=labels)

# Mean rating per bucket. `observed=False` is passed explicitly so that empty
# buckets are kept in the result.
dfx = df.groupby("duration_group", observed=False)["Rating"].mean().reset_index()

# Markers for each bucket ...
fig = px.scatter(dfx, x="duration_group", y="Rating", title="Average Rating based on Duration")
fig.update_layout(plot_bgcolor="white")
fig.update_xaxes(title_text="Duration Group")
fig.update_yaxes(title_text="Average Rating")

# ... plus a connecting line; only the trace of this second figure is reused.
fig2 = px.line(dfx, x="duration_group", y="Rating", title="Average Rating vs Duration of Movie")
fig2.update_layout(plot_bgcolor="white")
fig.add_trace(fig2.data[0])

fig.show()

#Dash App

Libraries to install:

In [125]:
!pip install dash==2.3.1 
!pip install plotly
!pip install "notebook>=1.0" jupyterlab-dash==0.1.0a3
!pip install jupyter-dash

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [126]:
import dash
from dash import dcc
from dash import html
from jupyter_dash import JupyterDash
from dash.dependencies import Output, Input
import plotly.express as px
import plotly.graph_objs as go


# Stylesheet providing the 'row' / 'six columns' grid classes used by the layout.
# NOTE(review): an earlier definition in this cell also loaded the Bootswatch "darkly"
# theme, but it was overwritten a few lines later and its app instance discarded. Only
# the definition that was actually in effect is kept — re-add the theme here if wanted.
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

# Shared style for the regular-sized graphs.
graph_style = {'height': '450px', 'margin-bottom': '50px'}

# Create the app (once)
app = JupyterDash(__name__, external_stylesheets=external_stylesheets)

# Define the layout
app.layout = html.Div([
    dcc.Dropdown(
        id='dpdn1',
        value=['United States of America','United Kingdom'],
        multi=True,
        options=[{'label': x, 'value': x} for x in df.country.unique()]
    ),
    html.Div([
        dcc.RangeSlider(
            id='slider',
            value=[2000, 2023],
            min=df.Year.min(),
            max=df.Year.max(),
            marks={str(year): str(year) for year in df.Year.unique()},
            step=None,
            tooltip={'always_visible': True}
        )
    ], style={'position': 'sticky', 'top': 0, 'z-index': '1'}),
    html.Div([
        dcc.Graph(
            id='pie-graph',
            figure={},
            className='six columns pie-graph',
            style=graph_style
        ),
        dcc.Graph(
            id='line-graph',
            figure={},
            className='six columns line-graph',
            style={'height': '450px', 'margin-bottom': '50px'}
        )
    ], className='row'),
    dcc.Graph(
        id='map-graph',
        figure={},
        className='map-graph',
        style={'height': '80vh'}
    ),
   
    dcc.Graph(
        id='bar-graph',
        figure={},
        className='bar-graph',
        style=graph_style
    ),
     dcc.Dropdown(
        id='dpdn2',
        value='Action',
        options=[{'label': x, 'value': x} for x in df.Genre.unique()]
    ),
    dcc.Graph(
        id='bar-graph2',
        figure={},
        className='bar-graph',
        style=graph_style
    )
])



def _filter_movies(country_chosen, year_interval):
    """Return the rows of ``df`` for the selected countries and inclusive year range.

    Parameters
    ----------
    country_chosen : list of str, str or None
        Value of the country dropdown; ``None`` or an empty list (dropdown cleared)
        selects no rows instead of raising.
    year_interval : sequence of two numbers
        ``[first_year, last_year]`` from the range slider, both bounds included.
    """
    countries = country_chosen or []
    if isinstance(countries, str):
        countries = [countries]
    first_year, last_year = year_interval
    selected = df.country.isin(countries) & df.Year.between(first_year, last_year)
    return df[selected]


@app.callback(
    Output(component_id='pie-graph', component_property='figure'),
    Input(component_id='dpdn1', component_property='value'),
    Input(component_id='slider', component_property='value')
)
def update_pie_graph(country_chosen, year_interval):
    """Pie chart of the genre distribution for the selected countries and years."""
    dff = _filter_movies(country_chosen, year_interval)
    dff = dff.groupby(['Genre'])['country'].count().reset_index()
    dff = dff.rename(columns={'country': 'Count'})
    fig = px.pie(data_frame=dff, values='Count', names='Genre',
                 title='Genre distribution for selected countries and years')
    return fig


@app.callback(
    Output(component_id='line-graph', component_property='figure'),
    Input(component_id='dpdn1', component_property='value'),
    Input(component_id='slider', component_property='value')
)
def update_line_graph(country_chosen, year_interval):
    """Line chart of the number of movies per year, one line per selected country."""
    dff = _filter_movies(country_chosen, year_interval)
    # Name the counted column 'Count': it was previously left as 'Genre', which
    # labelled the y axis "Genre" although it shows a number of movies.
    dff = dff.groupby(['country', 'Year'])['Genre'].count().reset_index(name='Count')
    fig = px.line(data_frame=dff, x='Year', y='Count', color='country',
                  title='Number of movies produced per country and year')
    return fig


@app.callback(
    Output(component_id='map-graph', component_property='figure'),
    Input(component_id='dpdn1', component_property='value'),
    Input(component_id='slider', component_property='value')
)
def update_map_graph(country_chosen, year_interval):
    """Choropleth of the number of movies per selected country in the year range."""
    dff = _filter_movies(country_chosen, year_interval)
    dff = dff.groupby(['country']).size().reset_index(name='Count')
    fig = px.choropleth(data_frame=dff, locations='country', locationmode='country names',
                        color='Count', title='Number of movies produced per country',
                        color_continuous_scale=px.colors.sequential.Sunsetdark
                        )
    fig.update_layout(
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        coloraxis_colorbar=dict(
            xanchor='right',
            x=0.85,
            len=0.7
        ),
        geo=dict(
            showframe=False,
            showcoastlines=False,
            projection_type='equirectangular'
        ),
        height=500
    )
    return fig


# Define the callback function for the bar graph
@app.callback(
    Output(component_id='bar-graph', component_property='figure'),
    Input(component_id='dpdn2', component_property='value'),
    Input(component_id='dpdn1', component_property='value'),
    Input(component_id='slider', component_property='value')
)
def update_bar_graph(genre_chosen, country_chosen, year_interval):
    """Bar chart of the ten directors with the most movies of the chosen genre."""
    # Filter the data based on the selected countries, years and genre
    dff = _filter_movies(country_chosen, year_interval)
    dff = dff[dff.Genre == genre_chosen]

    # Count the number of movies each director has made
    director_counts = dff['Director'].value_counts().reset_index()
    director_counts.columns = ['Director', 'Count']

    # Get the top 10 directors (value_counts is already sorted, largest first)
    top_directors = director_counts.head(10)

    # Create the bar graph
    fig = go.Figure(
        go.Bar(
            x=top_directors['Director'],
            y=top_directors['Count']
        )
    )

    # Update the layout of the bar graph
    fig.update_layout(
        title=f'Top 10 directors for {genre_chosen} movies',
        xaxis_title='Director',
        yaxis_title='Number of movies',
        plot_bgcolor='white'
    )

    return fig


@app.callback(
    Output(component_id='bar-graph2', component_property='figure'),
    Input(component_id='dpdn2', component_property='value'),
    Input(component_id='dpdn1', component_property='value'),
    Input(component_id='slider', component_property='value')
)
def update_bar_graph2(genre_chosen, country_chosen, year_interval):
    """Bar chart of the ten best-rated directors (mean rating) for the chosen genre."""
    # Filter the data based on the selected countries, years and genre
    dff = _filter_movies(country_chosen, year_interval)
    dff = dff[dff.Genre == genre_chosen]

    # Group the data by director and calculate the average rating for each director
    director_ratings = dff.groupby('Director')['Rating'].mean().reset_index()
    director_ratings.columns = ['Director', 'Rating']

    # Get the top 10 directors
    top_directors = director_ratings.sort_values('Rating', ascending=False).head(10)

    # List of colours from the Cividis palette, passed as the bars' marker colours.
    colorscale = px.colors.sequential.Cividis

    # Create the bar graph
    fig = go.Figure(
        go.Bar(
            x=top_directors['Director'],
            y=top_directors['Rating'],
            marker=dict(
                color=colorscale
            )
        )
    )

    # Update the layout of the bar graph
    fig.update_layout(
        title=f'Top 10 directors for {genre_chosen} movies based on rating',
        xaxis_title='Director',
        yaxis_title='Average rating',
        plot_bgcolor='white'
    )

    return fig



# Launch the Dash app with debug mode disabled. The guard means the server is only
# started when this code runs as the main module, not when it is imported.
if __name__ == '__main__':
    app.run_server(debug=False)


 * Running on http://127.0.0.1:8050
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [06/May/2023 21:06:03] "GET /_alive_98db7516-ff8a-4f50-8e8c-79ddb8d44562 HTTP/1.1" 200 -


Dash app running on:


<IPython.core.display.Javascript object>