# Beginning of Roger's Code

In [None]:
# Importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Alfred Hitchcock once said: “The length of a film should be directly related to the endurance of the human bladder.”

Loading in and renaming the required datasets

In [None]:
# Load the raw IMDB title-basics table and the Rotten Tomatoes movie-info table.
# Forward slashes are used for the whole relative path so it resolves on every
# operating system; the earlier '..\zippedData' spelling only worked on Windows
# and depended on '\z' being an unrecognised (deprecated) escape sequence.
imdb_title_basics_df = pd.read_csv('../zippedData/imdb.title.basics.csv.gz')
imdb_title = imdb_title_basics_df  # shorter alias for the same DataFrame object
rt_movie_info_df = pd.read_csv('../zippedData/rt.movie_info.tsv.gz', delimiter='\t')
rt_movie = rt_movie_info_df  # shorter alias for the same DataFrame object

# DataFrame.info() prints its summary and returns None, so each call gets its
# own statement instead of producing a (None, None) tuple as the cell output.
imdb_title.info()
rt_movie.info()

Since the IMDB columns that hold objects (rather than ints or floats) have few or no null items, let's check them for duplicates.

The delta from the 75th percentile to the 100th percentile is 6,893.5 values, which shows that there may be many occurrences of just a handful of runtimes.

In [None]:
# Tally how often each distinct value occurs in the descriptive IMDB columns;
# any count above 1 means that value appears in more than one row.
columns_to_check = ['primary_title', 'original_title', 'tconst', 'genres']
tuple(imdb_title[column].value_counts() for column in columns_to_check)

We see here that within 'primary_title' and 'original_title' there are duplicate names, which we want to clean up. It is not necessary to touch genres, since the genre value is a description of the film rather than an interfering value.

In [None]:
# Remove duplicate titles from BOTH title columns.
# The two de-duplication passes are chained so the second one operates on the
# result of the first; previously the second pass started again from
# `imdb_title`, which silently threw away the 'original_title' de-duplication.
# The trailing .copy() gives later cleaning steps an independent frame to work on.
imdb_title2 = (
    imdb_title
    .drop_duplicates(subset=['original_title'], keep='first')
    .drop_duplicates(subset=['primary_title'], keep='first')
    .copy()
)

# Re-check the frequency tables: every title should now occur exactly once.
imdb_title2['primary_title'].value_counts(), imdb_title2['original_title'].value_counts(), imdb_title2['runtime_minutes'].value_counts()

We see here nearly all duplicates have been removed

Let's now see how the value counts of the runtimes are dispersed.

In [None]:
# Quartiles of the runtime frequency table, i.e. of "how many titles share
# each distinct runtime". The table is built once and queried four times.
imdb_count_per_runtime = imdb_title2["runtime_minutes"].value_counts()

q_1_Imdb = imdb_count_per_runtime.quantile(.25)
q_2_Imdb = imdb_count_per_runtime.quantile(.5)
q_3_Imdb = imdb_count_per_runtime.quantile(.75)
q_4_Imdb = imdb_count_per_runtime.quantile(1)  # 100th percentile = largest count

q_1_Imdb , q_2_Imdb , q_3_Imdb, q_4_Imdb

The delta from the 75th percentile to the 100th percentile is 6,404 values, which shows that there may be many occurrences of just a handful of runtimes.

In [None]:
# Keep only rows that have both a release year and a runtime, then discard the
# descriptive columns the runtime analysis does not use.
# The frame is re-assigned rather than mutated in place, and errors='ignore'
# tolerates columns that are already gone, so re-running this cell is harmless
# instead of raising KeyError on the second pass.
imdb_title2 = (
    imdb_title2
    .dropna(subset=['start_year', 'runtime_minutes'])
    .drop(columns=['primary_title', 'original_title', 'genres'], errors='ignore')
)

imdb_title2.info()

In [None]:
# Find the runtime that occurs most often: the label carrying the largest
# count in the runtime frequency table.
runtime_frequency = imdb_title2["runtime_minutes"].value_counts()
most_common = runtime_frequency.idxmax()
most_common

In [None]:
# Compare the average release year of titles that run longer than, shorter
# than, and exactly 90 minutes (the most common runtime).
imdb_runtime = imdb_title2['runtime_minutes']
G_90_Imdb = imdb_title2.loc[imdb_runtime > 90, ['start_year']].mean()
B_90_Imdb = imdb_title2.loc[imdb_runtime < 90, ['start_year']].mean()
Is_90_Imdb = imdb_title2.loc[imdb_runtime == 90, ['start_year']].mean()
G_90_Imdb, B_90_Imdb, Is_90_Imdb
# Interpretation: newer movies tend to run longer than 90 minutes; the average
# release year is pushed higher for runtimes above the most common value than
# for runtimes at or below it.

To get a visual of how the dataset's runtime values are distributed, we will plot the 70 most common runtime values.

In [None]:
# Build the runtime frequency table once and reuse it for both the bar heights
# (the 70 largest counts) and the bar positions (the matching runtime labels).
imdb_runtime_freq = imdb_title2['runtime_minutes'].value_counts()
IMDB_Runtime_Numbers = imdb_runtime_freq.nlargest(70)
IMDB_Runtime_Numbers_Index = imdb_runtime_freq.index[0:70]

# Bar chart: x = runtime in minutes, y = number of titles with that runtime.
fig_IMBD, ax = plt.subplots(figsize=(6, 5))
ax.set_facecolor('white')
ax.bar(IMDB_Runtime_Numbers_Index, IMDB_Runtime_Numbers, color='grey')
ax.set_title("Most Common Movie Lengths")
ax.set_ylabel("Frequency Of Run Time Numbers")
ax.set_xlabel("Run Times in Minutes")


The graph shows how often the common value of 90 minutes is used

We can move onto the Rotten Tomatoes dataset to see runtimes from movies throughout history

In [None]:
# Inspect the Python type of the 'runtime' value stored in the row labelled 0.
type(rt_movie.loc[0, 'runtime'])

We see above the values in the columns are strings and must be changed into floats to be worked on

In [None]:
# Look up the 'runtime' entry at label 0 and report which Python type it is.
first_runtime = rt_movie['runtime'][0]
type(first_runtime)

In [None]:
# Turn each runtime string into a bare float: split on whitespace, keep the
# first token, cast to float.
# NOTE(review): presumably the raw values look like '104 minutes' - confirm.
rt_movie['runtime_minutes'] = (
    rt_movie['runtime']
    .str.split()
    .str[0]
    .astype(float)
)

No column holds just the year, so we can grab it from the theater_date column.

In [None]:
# Pull the release year out of 'theater_date': split on whitespace, keep the
# third token, cast to float.
# NOTE(review): assumes dates are formatted like 'Jul 24, 1971' - confirm.
rt_movie['start_year'] = (
    rt_movie['theater_date']
    .str.split()
    .str[2]
    .astype(float)
)


In [None]:
# Discard every row that is missing a value in 'start_year' or 'runtime_minutes'.
rt_movie.dropna(axis=0, subset=['start_year', 'runtime_minutes'], inplace=True)

In [None]:
# Is the standard 90-minute feature length associated with more recent movies?
# Average the release year over the rows whose runtime is exactly 90 minutes.
is_ninety_minutes = rt_movie['runtime_minutes'] == 90
rt_movie.loc[is_ninety_minutes, ['start_year']].mean()
# Recorded result: the average release year for 90-minute movies was 1987.

In [None]:
# Remove the columns that play no part in the runtime analysis, in one call.
unused_rt_columns = ['runtime', 'studio', 'currency', 'writer']
rt_movie.drop(columns=unused_rt_columns, inplace=True)

In [None]:
# Typical runtimes for newer movies, here meaning release year 2015 or later.
recent_runtimes = rt_movie.loc[rt_movie['start_year'] >= 2015, ['runtime_minutes']]
print(recent_runtimes.mean())
print(recent_runtimes.max())
print(recent_runtimes.min())
# Recorded results:
#   average - 16 minutes longer than the mode of the dataset
#   maximum - 148 minutes
#   minimum - 80 minutes
# This shows that more modern movies (2015 to present) run longer than the
# feature-length standard.

In [None]:
# Reusable form of the string-to-float conversion: keep only the leading number.
def string_to_float(df, series):
    """Convert a column of strings such as '90 minutes' to floats, in place.

    Each value is split on whitespace and its first token is cast to float.
    Missing values stay NaN. A column that is already numeric is only cast to
    float, so calling the function a second time on the same column is safe
    (previously that raised AttributeError from the ``.str`` accessor).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; the column is overwritten with the result.
    series : str
        Name of the column to convert.

    Returns
    -------
    pandas.Series
        The converted column, as stored back in ``df``.

    Raises
    ------
    ValueError
        If the first token of some value cannot be parsed as a float.
    """
    column = df[series]
    # The .str accessor only exists for string-like data; skip the token
    # extraction when the values are already numbers.
    if not pd.api.types.is_numeric_dtype(column):
        column = column.str.split().str[0]
    df[series] = column.astype(float)
    return df[series]
    

In [None]:
# Quartiles of the Rotten Tomatoes runtime frequency table (how many movies
# share each distinct runtime), built once and queried four times.
rt_count_per_runtime = rt_movie["runtime_minutes"].value_counts()

q_1_rt = rt_count_per_runtime.quantile(.25)
q_2_rt = rt_count_per_runtime.quantile(.5)
q_3_rt = rt_count_per_runtime.quantile(.75)
q_4_rt = rt_count_per_runtime.quantile(1)  # 100th percentile = largest count
q_1_rt , q_2_rt, q_3_rt, q_4_rt
# The range is broad; it may be worth ignoring anything below q_3.

In [None]:
# Grab the 70 most common runtimes in the Rotten Tomatoes file: the runtime
# labels and their counts both come from a single frequency table.
rt_runtime_freq = rt_movie['runtime_minutes'].value_counts()
Top_70_RT = rt_runtime_freq.nlargest(70)
Top_70_RT_Indexes = rt_runtime_freq.index[:70]

Let's plot a chart to get a visual of runtimes using a dataset that covers a longer span of history.

In [None]:
# Bar chart of the 70 most common Rotten Tomatoes runtimes:
# x = runtime in minutes, y = number of movies with that runtime.
fig_rt, ax = plt.subplots(figsize=(8, 6))
ax.set_facecolor('white')
ax.bar(Top_70_RT_Indexes, Top_70_RT, color='grey')
ax.set(
    title='Most Common Movie Lengths',
    xlabel="Run Times in Minutes",
    ylabel="Frequency Of Runtime Numbers",
)
ax.legend(['As Per Rotten Tomatoes'])


We still see the heavy grouping around the common 90-minute runtime.

In [None]:
# Frequency of runtimes in the Rotten Tomatoes file restricted to the past
# 10 years (release year 2011 onward), mirroring the IMDB dataset's time span.
released_since_2011 = rt_movie['start_year'] >= 2011
recent_rt_runtimes = rt_movie.loc[released_since_2011, ['runtime_minutes']]
P10_Years_RT = recent_rt_runtimes.value_counts().nlargest(50)
ten = P10_Years_RT.index[0:50]
# DataFrame.value_counts() yields a MultiIndex; take its first level and cast
# to float so the runtimes can serve as x positions in a plot.
P10_Index = ten.get_level_values(0).astype(float)

In [None]:
# Bar chart of the most frequent Rotten Tomatoes runtimes over the past 10 years.
fig_Rotten_Modern, ax = plt.subplots(figsize=(8, 6))
ax.bar(P10_Index, P10_Years_RT)
ax.set(
    title='Past 10 Years',
    xlabel='Runtimes in Minutes ',
    ylabel='Frequency',
)
ax.legend(['As Per Rotten Tomatoes'])

Let's get a far better visual of how runtimes have been increasing or decreasing over time.

In [None]:
# Mode, median and mean runtime for successive eras of release year.
# Each era is described once as a boolean filter and every statistic is derived
# from the same list of filtered frames, replacing 21 hand-copied selections
# whose year boundaries had to be kept in sync manually.
# Idea for later: collect the 5 or 10 most common runtimes of each decade in a
# list or dict (year -> most common runtime) and draw them as a line chart.
release_year = rt_movie['start_year']
era_filters = [
    release_year < 1941,
    (release_year >= 1941) & (release_year < 1961),
    (release_year >= 1961) & (release_year < 1981),
    (release_year >= 1981) & (release_year < 2001),
    (release_year >= 2001) & (release_year < 2011),
    release_year >= 2011,
    release_year == 2018,  # single-year snapshot; also contained in the era above
]
era_runtimes = [rt_movie.loc[era, ['runtime_minutes']] for era in era_filters]

# The numbered names are kept so the values stay available individually.
mode_1, mode_2, mode_3, mode_4, mode_5, mode_6, mode_7 = [
    runtimes.mode() for runtimes in era_runtimes
]
median_1, median_2, median_3, median_4, median_5, median_6, median_7 = [
    runtimes.median() for runtimes in era_runtimes
]
mean_1, mean_2, mean_3, mean_4, mean_5, mean_6, mean_7 = [
    runtimes.mean() for runtimes in era_runtimes
]

print(mode_1, mode_2, mode_3, mode_4, mode_5, mode_6, mode_7)
print(mean_1, mean_2, mean_3, mean_4, mean_5, mean_6, mean_7)
print(median_1, median_2, median_3, median_4, median_5, median_6, median_7)
# Recorded results, in era order:
#   modes   - 80, 90, 95, 95, 102, 100, 105 (mode_5 and mode_7 had several
#             modes, so the average of them was taken)
#   medians - 94, 98, 104, 103, 102, 104, 107
#   means   - 100, 102, 110, 107, 104, 107, 111

In [None]:
# Per-era summary statistics, hard-coded as rounded numbers; each list has one
# entry per label in `Time`.
mode_values = [80, 90, 95, 95, 102, 100, 105]
Time = ['1921', '1941', '1961', '1981', '2001', '2011', '2018' ]
median_values = [94,98,104,103,102,104,107]
mean_values = [100, 102, 110, 107, 104, 107, 111]

# Line graph to show a non-technical viewer that movie lengths have been
# trending longer since 1921.
sns.set_theme(style="white")
fig_trend, ax = plt.subplots()
# x and y are passed by keyword: seaborn 0.12+ rejects them positionally.
# Each line carries its own label so the legend cannot drift out of order.
sns.lineplot(x=Time, y=mode_values, label='Mode of Movie Length', ax=ax)
sns.lineplot(x=Time, y=median_values, label='Median of Time Points', ax=ax)
sns.lineplot(x=Time, y=mean_values, label='Mean of Time Points', ax=ax)
ax.legend()
ax.set_xlabel('Timeline')
ax.set_ylabel('Run Times')
ax.set_title('The Gradual Increase Of Movie Runtimes Through History ')

The last graph shows how, over time, runtimes have been growing longer than the industry standard, which was introduced back in the 1920s to make sure movies weren't overbearing for the public. Now that movies are getting longer and straying from the 90-minute mark, it is safe to say that following the trend of breaking the industry standard is a safe bet: it frees the studio from being constricted to the 90-minute mark, which could otherwise limit what can be put in the film.