Import Pandas, Numpy, Matplotlib and Seaborn.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Read the csv file.

In [None]:
# Load the films dataset; the path is relative to the notebook's working directory.
df_movies = pd.read_csv(filepath_or_buffer='data/films.csv')
# Bare last expression so the notebook renders the loaded frame.
df_movies

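Define a helper function, `normalize_title`, that removes non-alphanumeric characters (punctuation and spaces) from a movie title and converts it to lower case. The function is only defined here; it is not applied to the data frame in this cell.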

In [None]:
def normalize_title(title):
    """Return ``title`` in lower case with every non-alphanumeric character removed.

    Characters are kept or dropped according to ``str.isalnum`` rather than a
    hand-maintained list of symbols, so hyphens, apostrophes, underscores and
    any other punctuation or whitespace are removed as well, while letters
    (including accented ones) and digits are preserved.

    Parameters
    ----------
    title : str
        The movie title to normalize.

    Returns
    -------
    str
        The lower-cased title containing only letters and digits.
    """
    return ''.join(ch for ch in title if ch.isalnum()).lower()
# Display the data frame. normalize_title is only defined in this cell and is
# not applied to df_movies here, so the titles shown are unchanged.
df_movies

Remove the rows whose genre is NaN using `notna`, and also drop the rows where the value _Chad S. Taylor_ was misplaced into the _Genres_ column.

In [None]:
# Keep only the rows that have a genre at all.
df_movies = df_movies[df_movies.Genres.notna()]
# Drop the rows whose Genres value contains the misplaced 'Chad S. Taylor' text.
# regex=False makes the '.' match literally instead of acting as a regex
# wildcard. .copy() detaches the result from the loaded frame, so columns
# assigned later are not written into a slice (SettingWithCopyWarning).
df_movies = df_movies[~df_movies.Genres.str.contains('Chad S. Taylor', regex=False)].copy()
df_movies

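Convert the Rotten Tomatoes scores to the same scale as the IMDb rating: the trailing `%` is removed with `rstrip`, the value is cast to float with `astype`, and the result is multiplied by 0.1 (so `85%` becomes `8.5`). The new _Av_Rating_ column is the mean of both ratings, ignoring missing values.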

In [None]:
# Build the columns with assign() so a new frame is produced instead of
# assigning into df_movies, which at this point is a filtered selection of the
# loaded data (direct column assignment there triggers SettingWithCopyWarning).
# NOTE: this cell can only run once per kernel session; after it runs the
# 'Rotten Tomatoes' column holds floats and no longer supports .str.
df_movies = df_movies.assign(
    **{
        # '85%' -> '85' -> 85.0 -> 8.5
        'Rotten Tomatoes': lambda frame: frame['Rotten Tomatoes'].str.rstrip('%').astype(float) * 0.1
    }
).assign(
    # Row-wise mean of both ratings; a missing rating is skipped rather than
    # turning the average into NaN.
    Av_Rating=lambda frame: frame[['IMDb', 'Rotten Tomatoes']].mean(axis=1, skipna=True)
)
df_movies

From here on, the movies data frame is split into two separate data frames, `french_movies` and `non_french_movies`. Rows are assigned based on whether their _Country_ field contains the string _France_, regardless of any other countries present (as is the case for co-productions). Rows with NaN in the _Country_ column are removed beforehand using `notna`.

In [None]:
# French productions and co-productions: first keep the rows that have a
# Country value, then keep those whose Country mentions 'France'.
french_movies = (
    df_movies
    .loc[lambda frame: frame['Country'].notna()]
    .loc[lambda frame: frame['Country'].str.contains('France')]
)
french_movies.head()

In [None]:
# Everything else: first keep the rows that have a Country value, then keep
# those whose Country does not mention 'France'.
non_french_movies = (
    df_movies
    .loc[lambda frame: frame['Country'].notna()]
    .loc[lambda frame: ~frame['Country'].str.contains('France')]
)
non_french_movies.head()

Create a histogram with Matplotlib comparing runtime from french_movies and non_french_movies

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(11,15)

# Draw on the Axes created above. density=True normalises each histogram, so
# the two groups can be compared even though they differ in size.
# The legend text comes from label=, so no separate label list is needed.
ax.hist(non_french_movies.Runtime, label='Non French', bins=50, alpha=0.7, color='blue', density=True)
ax.hist(french_movies.Runtime, label='French', bins=50, alpha=0.7, color='orange', density=True)
ax.set_xlabel('Runtime')
ax.set_ylabel('Density')
fig.suptitle('Fig1: Comparison French and non French movies by runtime',fontweight='bold')
ax.legend(loc='upper right')
plt.show()

Create a histogram with Matplotlib comparing rating from french_movies and non_french_movies

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(11,15)

# Draw on the Axes created above. density=True normalises each histogram, so
# the two groups can be compared even though they differ in size.
# The legend text comes from label=, so no separate label list is needed.
ax.hist(non_french_movies.Av_Rating, label='Non French', bins=50, alpha=0.7, color='blue', density=True)
ax.hist(french_movies.Av_Rating, label='French', bins=50, alpha=0.7, color='orange', density=True)
ax.set_xlabel('Average rating')
ax.set_ylabel('Density')
fig.suptitle('Fig2: Comparison French and non French movies by rating',fontweight='bold')
ax.legend(loc='upper right')
plt.show()

Pie chart with Matplotlib using colors from seaborn. The function `get_genre_df` takes a movies data frame and produces a new one with one genre per row, plus a fixed _Count_ column used as input to the pie chart.

In [None]:
def get_genre_df(movies_df):
    """Return a frame with one row per (movie, genre) pair.

    Each movie's ``Genres`` value is converted to ``str`` and split on ``','``
    (no whitespace stripping), and every resulting piece becomes one output row.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Frame with a ``Genres`` column of comma-separated genre names.

    Returns
    -------
    pandas.DataFrame
        Columns ``Genre`` and ``Count`` (always 1), with a fresh 0..n-1 index.
    """
    # Collect all rows first and build the frame once; appending to a
    # DataFrame row by row makes the whole function quadratic.
    genres = [
        genre
        for genres_value in movies_df['Genres']
        for genre in str(genres_value).split(',')
    ]
    return pd.DataFrame(
        {'Genre': genres, 'Count': [1] * len(genres)},
        columns=['Genre', 'Count'],
    )

# One row per genre, indexed by genre name; Count is the number of
# (movie, genre) pairs for that genre.
french_genre_df = get_genre_df(french_movies).groupby(['Genre']).count()
# Take the labels straight from the index so that label i describes wedge i.
# Passing the index through set() would lose that order and could attach
# genre names to the wrong wedges.
genres = list(french_genre_df.index)

colors = sns.color_palette('pastel')

fig, ax = plt.subplots()
fig.set_size_inches(10,10)
ax.pie(french_genre_df.Count, labels=genres, colors=colors, autopct='%.02f%%')
fig.suptitle('Fig 3a: Percentage French Films by genre',fontweight='bold')
plt.show()

In [None]:
def get_genre_df(movies_df):
    """Return a frame with one row per (movie, genre) pair.

    NOTE(review): a function with this name is also defined in the previous
    cell; this later definition is the one in effect from here on.

    Each movie's ``Genres`` value is converted to ``str`` and split on ``','``
    (no whitespace stripping), and every resulting piece becomes one output row.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Frame with a ``Genres`` column of comma-separated genre names.

    Returns
    -------
    pandas.DataFrame
        Columns ``Genre`` and ``Count`` (always 1), with a fresh 0..n-1 index.
    """
    # Collect all rows first and build the frame once; appending to a
    # DataFrame row by row makes the whole function quadratic.
    genres = [
        genre
        for genres_value in movies_df['Genres']
        for genre in str(genres_value).split(',')
    ]
    return pd.DataFrame(
        {'Genre': genres, 'Count': [1] * len(genres)},
        columns=['Genre', 'Count'],
    )

# One row per genre, indexed by genre name; Count is the number of
# (movie, genre) pairs for that genre.
non_french_genre_df = get_genre_df(non_french_movies).groupby(['Genre']).count()
# Take the labels straight from the index so that label i describes wedge i.
# Passing the index through set() would lose that order and could attach
# genre names to the wrong wedges.
genres = list(non_french_genre_df.index)

colors = sns.color_palette('pastel')

fig, ax = plt.subplots()
fig.set_size_inches(10,10)
ax.pie(non_french_genre_df.Count, labels=genres, colors=colors, autopct='%.02f%%')
fig.suptitle('Fig 3b: Percentage non French Films by genre',fontweight='bold')
plt.show()

Distribution by genres using Kdeplot in seaborn. 

In [None]:
def get_genre_timeline_df(movies_df):
    """Return a frame with one row per (movie, genre) pair and the movie's year.

    Each movie's ``Genres`` value is converted to ``str`` and split on ``','``
    (no whitespace stripping); every piece becomes one output row that carries
    the movie's ``Year``.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Frame with ``Genres`` and ``Year`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``Genre`` and ``Year``, with a fresh 0..n-1 index.
    """
    # Collect all rows first and build the frame once; appending to a
    # DataFrame row by row makes the whole function quadratic.
    records = [
        (genre, year)
        for genres_value, year in zip(movies_df['Genres'], movies_df['Year'])
        for genre in str(genres_value).split(',')
    ]
    return pd.DataFrame(records, columns=['Genre', 'Year'])

movies_genre_timeline_df = get_genre_timeline_df(df_movies)

pal = sns.cubehelix_palette(10, rot=-.25, light=.7)
# One thin facet row per genre, coloured by genre. The overall title is added
# further down as a figure title: per-axes titles are reset by g.map() and
# cleared by g.set_titles(""), so a title set on the axes here would not show.
g = sns.FacetGrid(movies_genre_timeline_df, row="Genre", hue="Genre", aspect=15, height=.5, palette=pal)

# Draw the densities in a few steps
g.map(sns.kdeplot, "Year",
      bw_adjust=.5, clip_on=False,
      fill=True, alpha=1, linewidth=1.5)
g.map(sns.kdeplot, "Year", clip_on=False, color="w", lw=2, bw_adjust=.5)

# passing color=None to refline() uses the hue mapping
g.refline(y=0, linewidth=2, linestyle="-", color=None, clip_on=False)


# Define and use a simple function to label the plot in axes coordinates
def label(x, color, label):
    """Write the facet's hue label at the left edge of the current axes.

    Called by ``FacetGrid.map`` once per facet; ``x`` (the mapped "Year" data)
    is required by that calling convention but not used.
    """
    ax = plt.gca()
    ax.text(0, .2, label, fontsize='x-large', fontweight="bold", color=color, ha="left", va="center", transform=ax.transAxes)

g.map(label, "Year")

# Set the subplots to overlap (currently disabled)
# g.figure.subplots_adjust(hspace=0)
g.figure.set_size_inches(10, 20)
# Figure-level title, so it survives the set_titles("") call below.
g.figure.suptitle('Distribution genres by timeline')
g.figure.tight_layout()

# Remove axes details that don't play well with overlap
g.set_titles("")
g.set(yticks=[], ylabel="")
g.despine(bottom=True, left=True)

Set of boxenplot figures in seaborn, showing a number of quantiles defined as letter values. `boxenplot_by_genre` is a generic function that draws a boxenplot chart of a given column with one box per genre, which allows us to generate different charts without having to repeat similar code each time. Diamonds represent the outliers. Also, genres have been sorted alphabetically.

In [None]:
def get_info_by_genre(movies_df, column_name):
    """Return a frame pairing every genre of every movie with one of its columns.

    Each movie's ``Genres`` value is converted to ``str`` and split on ``','``
    (no whitespace stripping); every piece becomes one output row that carries
    the movie's value of ``column_name``.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Frame with a ``Genres`` column and a ``column_name`` column.
    column_name : str
        Name of the column whose values are repeated for each genre.

    Returns
    -------
    pandas.DataFrame
        Columns ``Genre`` and ``column_name``, with a fresh 0..n-1 index.
    """
    # Collect all rows first and build the frame once; appending to a
    # DataFrame row by row makes the whole function quadratic.
    records = [
        (genre, value)
        for genres_value, value in zip(movies_df['Genres'], movies_df[column_name])
        for genre in str(genres_value).split(',')
    ]
    return pd.DataFrame(records, columns=['Genre', column_name])

def boxenplot_by_genre(movies_df, column_name, title='', color='b'):
    """Draw a horizontal boxenplot of ``column_name`` with one box per genre.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Movies frame; passed to ``get_info_by_genre`` to get one row per
        (movie, genre) pair.
    column_name : str
        Column plotted on the x axis.
    title : str, optional
        Axes title.
    color : str, optional
        Colour passed to ``sns.boxenplot``.
    """
    column_by_genre_df = get_info_by_genre(movies_df, column_name)

    # Genres are listed alphabetically. (An ordering by mean value used to be
    # computed here and was immediately overwritten; that dead step is gone.)
    genres = sorted(set(column_by_genre_df.Genre.values))

    # NOTE: sns.set changes global settings, so this figure size also applies
    # to figures created after this call.
    sns.set(rc={"figure.figsize":(20, 15)})
    sns.boxenplot(data=column_by_genre_df, y="Genre", x=column_name, color=color, k_depth=4, order=genres, orient='h', scale="linear").set(title=title)
   
    

In [None]:
# The plotted column is Runtime, so the title names runtime (it said "rating").
boxenplot_by_genre(df_movies, 'Runtime', title='Fig4: Distribution of films by genre and runtime', color='r')

In [None]:
# Release year per genre for the whole dataset (default colour).
boxenplot_by_genre(
    df_movies,
    'Year',
    title='Fig5: Distribution of all the films genre by Year',
)

In [None]:
# Release year per genre, French films only.
boxenplot_by_genre(
    french_movies,
    'Year',
    title='Fig6: Distribution of French films genre by Year',
    color='g',
)


In [None]:
# Average rating per genre, French films only.
boxenplot_by_genre(
    french_movies,
    'Av_Rating',
    title='Fig7.1: Distribution of French films genre by Average rating',
    color='y',
)

In [None]:
# Average rating per genre, non-French films only.
boxenplot_by_genre(
    non_french_movies,
    'Av_Rating',
    title='Fig7.2: Distribution of non French films genre by Average rating',
    color='r',
)

Seeking correlation between Runtime and Average rating with Bokeh. Films are coloured by French and non French.

In [None]:
# The stray `from turtle import color` is removed: nothing used it, and
# importing turtle pulls in tkinter, which is often missing on headless
# notebook servers.
from bokeh.models import HoverTool
from bokeh.plotting import figure, output_notebook, show

# NOTE(review): output_notebook is imported but never called. Without it,
# show() does not render inline in the notebook; confirm which is intended.

tools = "hover,save,pan,box_zoom,reset,wheel_zoom,help"
# width/height replace plot_width/plot_height, which were removed in Bokeh 3.0.
p = figure(width=1500, height=800, tools=tools, title= "Fig8: Correlation between Runtime and Rating")
# One scatter layer per group so each gets its own colour and legend entry.
p.scatter("Runtime", "Av_Rating", source=french_movies, color="orange", legend_label='French Movies')
p.scatter("Runtime", "Av_Rating", source=non_french_movies, color="blue", legend_label='Non-French movies')
p.xaxis.axis_label = 'Runtime'
p.yaxis.axis_label = 'Rating'
p.legend.location = "top_right"

# Configure the hover tool created by the "hover" entry in `tools`.
hover = p.select(dict(type=HoverTool))
hover.tooltips = [
    ("Title", "@{Title}"),
    ("Runtime", "@{Runtime}"),
    ("Rating", "@{Av_Rating}"),
]

show(p)