In [None]:
import pandas as pd
import ast
import matplotlib.pyplot as plt
import numpy as np
import missingno as msno
import seaborn as sns
import scipy.stats as stats
import warnings # For handling error messages.
warnings.simplefilter(action="ignore", category=FutureWarning)

In [None]:
# Load the combined movie dataset and store director ids as a categorical column.
movie_data = pd.read_csv('data/movie_data_combined.csv')
movie_data['director_id'] = movie_data['director_id'].astype('category')

# Keep only the rows whose runtime is above 60. The explicit .copy() makes the result
# an independent frame, so the column assignments in later cells do not write to a
# slice of the frame read above and do not trigger SettingWithCopyWarning.
movie_data = movie_data.loc[movie_data['runtime'] > 60].copy()

In [None]:
# Inspect the available column names.
movie_data.columns

In [None]:
# Discard the listed columns; the result is rebound to movie_data.
columns_to_remove = ['cast', 'release_date', 'vote_count', 'original_language', 'popularity']
movie_data = movie_data.drop(columns=columns_to_remove)

In [None]:
# List the distinct values found in the genre_ids column.
movie_data['genre_ids'].unique()

In [None]:
# Number of distinct values in the director_id column.
len(movie_data['director_id'].unique())

In [None]:
# Show the dtype of every column in movie_data.
movie_data.dtypes

In [None]:
# Report how many distinct director ids appear in the data
director_count = movie_data['director_id'].nunique()
print(f"Number of unique numbers in the 'director_id' column: {director_count}")

In [None]:
# Convert string representations of lists to actual lists of integers.
# Values that are not strings (i.e. already parsed) are passed through unchanged, so
# re-running this cell does not make ast.literal_eval raise on a list.
movie_data['genre_ids'] = movie_data['genre_ids'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

# Flatten the per-movie lists into a single list of genre ids
flattened_list = [genre_id for genre_list in movie_data['genre_ids'] for genre_id in genre_list]

# Distinct genre ids present in the data, and how many there are
unique_genres = set(flattened_list)
unique_count = len(unique_genres)

print(f"Number of unique numbers in the 'genre' column: {unique_count}")

In [None]:
# Display the set of distinct genre ids.
unique_genres

In [None]:
# Number of rows in movie_data for each value of year.
movie_data.groupby('year').size()

In [None]:
# Count the missing entries of every column within each year, then keep only the
# budget and revenue counts, transposed so that each of them is a row indexed by year.
null_values = movie_data.groupby('year').agg(lambda column: column.isna().sum())
null_values_series = null_values.loc[:, ['budget', 'revenue']].transpose()
null_values_series

In [None]:
# Per-year count of missing budget values (the 'budget' row of the table above).
null_values_series.loc['budget']

In [None]:
# Plot, per year, how many budget and revenue values are missing.
# Each row of null_values_series is a Series indexed by year, so the year is already
# the x axis. Series.plot does not use x=/y= (they only apply to DataFrames), so those
# arguments are dropped and both lines are drawn on one explicit Axes instead.
fig, ax = plt.subplots()
null_values_series.loc['budget'].plot(kind='line', ax=ax)
null_values_series.loc['revenue'].plot(kind='line', ax=ax,
                                       title='Missing values for budget/revenue')

ax.set_xlabel('year')
ax.set_ylabel('Number of missing values')
ax.legend(['Budget', 'Revenue'])
plt.show()

In [None]:
# Load a second movie dataset; it is used below to fill missing budget/revenue values.
kaggle_movies = pd.read_csv('data/kaggle_movies.csv')

In [None]:
# Preview the first rows of the Kaggle dataset.
kaggle_movies.head()

In [None]:
# Column dtypes of movie_data, for comparison with the Kaggle dataset.
movie_data.dtypes

In [None]:
# Column dtypes of the Kaggle dataset.
kaggle_movies.dtypes

In [None]:
# Rename the Kaggle columns: 'name' becomes 'title' (a merge key below), and the money
# columns get a 'kag_' prefix so they stay distinct from movie_data's budget/revenue.
kaggle_column_names = {
    'name': 'title',
    'budget': 'kag_budget',
    'gross': 'kag_revenue',
}
kaggle_movies = kaggle_movies.rename(columns=kaggle_column_names)

In [None]:
# Keep only the merge keys (title, year) and the two Kaggle money columns.
merge_columns = ['title', 'year', 'kag_budget', 'kag_revenue']
kaggle_merge = kaggle_movies[merge_columns]
kaggle_merge

In [None]:
# Left-join the Kaggle budget/revenue figures onto movie_data by title and year.
# A key that appears more than once on the Kaggle side would duplicate the matching
# movie_data rows in a left merge, so only the first Kaggle row per (title, year) is
# kept; validate= then makes pandas raise if the right-hand keys are still not unique.
kaggle_unique = kaggle_merge.drop_duplicates(subset=['title', 'year'])
movie_data = movie_data.merge(kaggle_unique, how='left', on=['title', 'year'],
                              validate='many_to_one')

In [None]:
# Display the merged frame (now including the kag_budget / kag_revenue columns).
movie_data

In [None]:
# Missing-value count per column before filling from the Kaggle columns.
movie_data.isna().sum()

In [None]:
# Where budget / revenue is missing, take the Kaggle value when one exists.
# fillna aligns on the index and leaves a value missing when both sources lack it.
movie_data['budget'] = movie_data['budget'].fillna(movie_data['kag_budget'])
movie_data['revenue'] = movie_data['revenue'].fillna(movie_data['kag_revenue'])

In [None]:
# The helper Kaggle columns have served their purpose; remove them again.
kaggle_helper_columns = ['kag_budget', 'kag_revenue']
movie_data = movie_data.drop(columns=kaggle_helper_columns)

In [None]:
# Missing-value count per column after filling from the Kaggle columns.
movie_data.isna().sum()

Filled in a few missing values, but not many.

In [None]:
# Display the frame after the fill step.
movie_data

In [None]:
# Genre name -> numeric genre id as used in the genre_ids lists.
# The names also serve as the per-genre indicator column names created below.
# NOTE(review): the ids look like TMDB genre ids — confirm against the data source.
genre_dict = {'Action':28, 'Adventure':12, 'Animation':16, 'Comedy':35, 'Crime':80, 'Documentary':99,
              'Drama':18, 'Family':10751, 'Fantasy':14, 'History':36, 'Horror':27, 'Music':10402,
              'Mystery':9648, 'Romance':10749, 'Science_Fiction':878, 'TV_Movie':10770, 'Thriller':53,
              'War':10752, 'Western':37}

In [None]:
# Peek at the genre_ids entry stored under index label 0.
movie_data.genre_ids[0]

In [None]:
# Reverse lookup: genre id -> genre name
id_to_genre = {v: k for k, v in genre_dict.items()}

# Add one 0/1 indicator column per genre: 1 when the genre's id occurs in the movie's
# genre_ids list, 0 otherwise. Each column is built in a single pass over genre_ids
# instead of writing individual cells with iterrows()/.at, which is much slower on a
# large frame. Ids that are not in genre_dict are ignored, exactly as before.
for genre, genre_id in genre_dict.items():
    movie_data[genre] = (
        movie_data['genre_ids']
        .map(lambda ids, wanted=genre_id: wanted in ids)
        .astype(int)
    )

In [None]:
# (rows, columns) after adding the genre indicator columns.
movie_data.shape

In [None]:
# Keep only the rows that are not flagged as TV movies.
movie_data = movie_data.loc[movie_data.TV_Movie != 1]
# NOTE(review): drop() returns a new frame and the result is not assigned, so this line
# only displays the frame without TV_Movie; movie_data itself keeps the column.
# Later cells look up a column for every key in genre_dict, TV_Movie included, so
# check them before changing this line to reassign — confirm which behavior is intended.
movie_data.drop(columns='TV_Movie')

In [None]:
# Save the filtered frame to CSV, then display it.
# NOTE(review): the inputs above are read from 'data/...' while this writes to
# '../data/...' — confirm the output directory is intended. Because index=False is not
# passed, the index is written as an extra leading column.
movie_data.to_csv('../data/filtered_data.csv')
movie_data

In [None]:
# Mean vote_average over all remaining movies.
average_rating = movie_data['vote_average'].mean()

In [None]:
# Histogram of vote_average in 20 bins, with the overall mean as a red dashed line.
ratings = movie_data['vote_average']
_ = plt.hist(ratings, bins=20)
_ = plt.axvline(average_rating, linestyle='--', color='r')

In [None]:
# Map each genre name to the integer position of its indicator column in movie_data.
# The position is looked up by column name rather than assumed to start at a fixed
# offset, so the mapping stays correct if columns are added, dropped or reordered
# upstream. Genres that have no column in movie_data are left out.
genre_col = {
    genre: movie_data.columns.get_loc(genre)
    for genre in genre_dict
    if genre in movie_data.columns
}
genre_col

In [None]:
# Build one sub-frame per genre holding the movies flagged with that genre.
# The indicator column is selected by name, so the result does not depend on a stored
# positional index staying in sync with the current column order. A genre whose
# column is absent from movie_data is skipped.
filtered_dfs = {
    genre: movie_data[movie_data[genre] == 1]
    for genre in genre_col
    if genre in movie_data.columns
}

In [None]:
# Print the mean vote_average of each genre's sub-frame.
for genre_name, genre_frame in filtered_dfs.items():
    print(genre_name, genre_frame['vote_average'].mean())

In [None]:
# Columns entering the correlation matrix: the numeric movie attributes followed by
# the genre indicator columns (TV_Movie is not part of the list).
correlation_columns = [
    'vote_average', 'year', 'month', 'budget', 'revenue', 'runtime',
    'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary',
    'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music',
    'Mystery', 'Romance', 'Science_Fiction', 'Thriller', 'War', 'Western',
]
movie_col = movie_data[correlation_columns]
correlation = movie_col.corr()
correlation

In [None]:
# Heatmap of the pairwise correlation matrix.
_ = sns.heatmap(correlation)

In [None]:
# Correlation of every column with vote_average, strongest positive first.
correlation['vote_average'].sort_values(ascending=False)

### Are dramas rated more highly?

In [None]:
# Split the ratings into movies flagged as Drama and all the others.
is_drama = movie_data['Drama'] == 1
drama_rating = movie_data.loc[is_drama, 'vote_average']
not_drama_rating = movie_data.loc[~is_drama, 'vote_average']

In [None]:
def ecdf(data):
    """Return the empirical cumulative distribution function of 1-D data.

    Parameters
    ----------
    data : one-dimensional sequence or array of measurements

    Returns
    -------
    (x, y) : the values of ``data`` in ascending order, and for each of them the
        cumulative fraction ``k / n`` for ``k = 1 .. n``, where ``n = len(data)``.
    """
    sorted_values = np.sort(data)
    count = len(data)
    cumulative_fraction = np.arange(1, count + 1) / count
    return sorted_values, cumulative_fraction

In [None]:
# Compute the ECDF of the ratings for drama and for non-drama movies
x_drama, y_drama = ecdf(drama_rating)
x_not_drama, y_not_drama = ecdf(not_drama_rating)

# Draw both ECDFs as unconnected points on the same axes
_ = plt.plot(x_drama, y_drama, marker='.', linestyle='none')
_ = plt.plot(x_not_drama, y_not_drama, marker='.', linestyle='none')

# Label the axes; the x axis is the rating (vote_average) being compared,
# which replaces the former placeholder label
_ = plt.xlabel('Vote average')
_ = plt.ylabel('ECDF')
_ = plt.legend(['Drama','Not Drama'])
_ = plt.title('Empirical Cumulative Distribution Function: Drama films')

# Display the plot
plt.show()

#### Visually, it seems clear that dramatic films are rated more highly than other films.
We can also demonstrate this through null hypothesis significance testing.

Null hypothesis: There is no difference in ratings between dramas and non-dramatic films.

In [None]:
# Permutation test under the null hypothesis that the drama label is unrelated to the
# rating: pool all ratings, shuffle them, treat the first len(drama_rating) values as
# a pseudo "drama" group and record that group's mean.
N_PERMUTATIONS = 10000
rng = np.random.default_rng(42)  # fixed seed so the replicates are reproducible across runs

all_rating = np.concatenate((drama_rating, not_drama_rating))
n_drama = len(drama_rating)

mean_replicates = [
    np.mean(rng.permutation(all_rating)[:n_drama])
    for _ in range(N_PERMUTATIONS)
]

In [None]:
# Average of the permutation replicates (the expected group mean under the null).
np.mean(mean_replicates)

In [None]:
# Observed mean rating of the drama movies.
drama_mean = drama_rating.mean()
drama_mean

In [None]:
# Distribution of the permutation means, with the observed drama mean as a red dashed line.
_ = sns.histplot(mean_replicates)
_ = plt.axvline(drama_mean, linestyle='--', color='r')

In [None]:
# Count how many permutation replicates exceed the observed drama mean, and compute the
# one-sided empirical p-value that the conclusion below refers to.
mean_replicates_array = np.array(mean_replicates)
count_greater_than_drama_mean = np.sum(mean_replicates_array > drama_mean)

# Share of replicates that are at least as large as the observed mean
p_value = np.mean(mean_replicates_array >= drama_mean)

print(count_greater_than_drama_mean)
print(f"Empirical one-sided p-value: {p_value}")

After taking 10000 permutations of the data, none of the replicates had an average rating anywhere close to the observed average rating of drama films. We have therefore shown visually and computationally that drama films are rated more highly than the average of all films in the dataset, with a very small p-value (fewer than 1 in 10,000 permutations reached the observed mean).