In [1]:
import pandas as pd
import re
import ast
import matplotlib.pyplot as plt
import seaborn as sns

# Data Preprocessing

Import the data

In [2]:
# Every raw file lives in the same folder, so its location is defined once.
# (The previous per-file literals repeated the folder and contained a stray '//'.)
DATA_DIR = '../../data/MovieSummaries'

# Tab-separated metadata tables; header=None because the column names are
# assigned manually further down.
movies = pd.read_csv(DATA_DIR + '/movie.metadata.tsv', sep='\t', header=None)
characters = pd.read_csv(DATA_DIR + '/character.metadata.tsv', sep='\t', header=None)

# Importing txt files (tab-separated as well)
name_clusters = pd.read_csv(DATA_DIR + '/name.clusters.txt', sep='\t', header=None)
summaries = pd.read_csv(DATA_DIR + '/plot_summaries.txt', sep='\t', header=None)
tv_tropes = pd.read_csv(DATA_DIR + '/tvtropes.clusters.txt', sep='\t', header=None)

We inspect the content of each dataframe

In [None]:
# Peek at five randomly drawn rows of the movies table
movies.sample(n=5)

In [None]:
# Peek at five randomly drawn rows of the characters table
characters.sample(n=5)

In [None]:
# Peek at five randomly drawn rows of the name clusters table
name_clusters.sample(n=5)

In [None]:
# Peek at five randomly drawn rows of the plot summaries table
summaries.sample(n=5)

In [None]:
# Peek at five randomly drawn rows of the TV tropes table
tv_tropes.sample(n=5)

We see that the data files do not contain column names. We add them according to the column descriptions in the README.

In [8]:
# Label the header-less tables. Names are listed in file column order.
movies.columns = [
    'wikipedia_movie_id',
    'freebase_movie_id',
    'movie_name',
    'movie_release_date',
    'movie_box_office_revenue',
    'movie_runtime',
    'movie_languages',
    'movie_countries',
    'movie_genres',
]

# NOTE(review): the dataset README appears to describe the second column of
# name.clusters.txt as a Freebase character/actor map id rather than a movie
# id -- confirm before joining on 'freebase_movie_id'.
name_clusters.columns = ['name', 'freebase_movie_id']

summaries.columns = ['wikipedia_movie_id', 'plot_summary']

characters.columns = [
    'wikipedia_movie_id',
    'freebase_movie_id',
    'movie_release_date',
    'character_name',
    'actor_birth',
    'actor_gender',
    'actor_height',
    'actor_ethnicity',
    'actor_name',
    'actor_age',
    'freebase_character_map',
    'freebase_character_id',
    'freebase_actor_id',
]


For the movies dataframe, we notice that the languages, genres and countries columns contain dictionaries keyed by Freebase ids. We convert them into plain lists of their values.

In [None]:
# Distinct raw entries found in the countries column
movies.movie_countries.unique()

In [10]:
# Function to extract from a dict to a list
def extract_dict_to_list(entry):
    """Return the values of a dictionary (or of its string form) as a list.

    Parameters
    ----------
    entry : str, dict, list or missing value
        Usually the string representation of a dictionary, e.g.
        '{"/m/09c7w0": "United States of America"}'.

    Returns
    -------
    list
        The dictionary values in their original order. An already-converted
        list is returned as a copy, so applying the function twice (for
        instance when the cell is re-run) is harmless. Missing values
        (None, NaN, blank strings) give an empty list.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a non-blank string is not a
        valid Python literal.
    """
    # Already converted: hand back a copy instead of failing in literal_eval.
    if isinstance(entry, list):
        return list(entry)
    # A real dictionary needs no parsing.
    if isinstance(entry, dict):
        return list(entry.values())
    # Anything that is not text at this point (None, NaN, ...) is treated as missing.
    if not isinstance(entry, str) or not entry.strip():
        return []
    # Convert the string representation of the dictionary to an actual dictionary
    entry_dict = ast.literal_eval(entry)
    # Keep only the values (the human-readable names), dropping the id keys
    return list(entry_dict.values())

# Replace each dictionary-style column with the list of its values
for dict_column in ('movie_countries', 'movie_genres', 'movie_languages'):
    movies[dict_column] = movies[dict_column].apply(extract_dict_to_list)

In [None]:
# Peek at five random rows to check the list conversion
movies.sample(n=5)

In [None]:
# Count repeated values in each column that could identify a movie
duplicate_checks = (
    ("number of duplicated according to Wikipedia id : ", 'wikipedia_movie_id'),
    ("number of duplicated according to freebase movie id : ", 'freebase_movie_id'),
    ("number of duplicated according to title of the movie : ", 'movie_name'),
)
for message, id_column in duplicate_checks:
    print(message, movies[id_column].duplicated().sum())

# Look closer at the movies sharing a name: keep every occurrence of a
# repeated title and sort so that identical titles sit next to each other
shares_title = movies['movie_name'].duplicated(keep=False)
movies[shares_title].sort_values('movie_name')

# Movies with the same name do not have the same release date, so we keep them as they are.
# They likely represent different versions of the same movie.

We bring the dates into a uniform format by parsing them as datetimes. We also noticed previously that some dates contain only the year, so we decide to keep only the year, as we will only perform yearly analyses.

In [None]:
# Function to extract the release year from the date
def extract_release_year(date_str):
    """Return the year contained in a date value, or None when there is none.

    Parameters
    ----------
    date_str : str, number or missing value
        A date such as 'YYYY-MM-DD' or 'YYYY'. A number is taken to be a year
        already, which keeps the function idempotent when the cell is re-run
        on a column that has been converted before.

    Returns
    -------
    int or None
        The year as an integer, or None for missing or unparseable input.
    """
    if date_str is None:
        return None

    # Numbers must not reach pd.to_datetime: it reads them as offsets from the
    # Unix epoch, so a year such as 2001.0 would silently come back as 1970.
    if pd.api.types.is_number(date_str):
        try:
            return int(date_str)
        except (ValueError, OverflowError, TypeError):
            # NaN, infinities and complex numbers carry no usable year
            return None

    try:
        # Attempt to extract the year from the 'YYYY-MM-DD' format
        parsed = pd.to_datetime(date_str)
        # Inputs that parse to NaT (e.g. an empty string) count as missing
        if pd.isna(parsed):
            return None
        return int(parsed.year)
    except (ValueError, TypeError):
        try:
            # Attempt to extract the year from 'YYYY' format
            return int(date_str)
        except (ValueError, TypeError):
            return None  # Return None for invalid or missing dates

# Reduce both date columns to the year only (missing or invalid dates become NaN)
movies['movie_release_date'] = movies.movie_release_date.apply(extract_release_year)
characters['actor_birth'] = characters.actor_birth.apply(extract_release_year)

In [None]:
# Summary statistics of the extracted release years
movies['movie_release_date'].describe()

# Data Exploration

We see a few NaNs in the dataframes, so we quantify how much data is missing.

In [None]:
# Share of NaN values per column, in percent
total_movies = movies.shape[0]
print(movies.isna().sum() / total_movies * 100)

# movie_languages, movie_genres and movie_countries hold lists, so for these
# columns we report the percentage of empty lists
empty_list_checks = (
    ("Percentage of empty languages: ", 'movie_languages'),
    ("Percentage of empty genres: ", 'movie_genres'),
    ("Percentage of empty countries: ", 'movie_countries'),
)
for message, list_column in empty_list_checks:
    has_no_entry = movies[list_column].apply(len) == 0
    print(message, has_no_entry.sum() / total_movies * 100)

We see that we have a considerable amount of missing values. The most striking column is the box office revenue: since almost 90% of this data is missing, it is hard to impute this feature, and keeping it in the dataset might add noise.  

We still impute values for the movie runtime, and release year.

In [None]:
# 2x2 grid: one row per variable, histogram on the left and boxplot on the right
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# (column, colour, histogram title, histogram x-label or None, boxplot title)
panel_specs = [
    ('movie_runtime', 'blue',
     'Distribution of Movie Runtime', 'Runtime (minutes)',
     'Boxplot of Movie Runtime'),
    ('movie_release_date', 'green',
     'Distribution of Movie Release Year', 'Release Year',
     'Boxplot of Movie Release Year'),
]

for row, (column, colour, hist_title, hist_xlabel, box_title) in enumerate(panel_specs):
    # NaNs are dropped once and the same values feed both panels of the row
    values = movies[column].dropna()
    hist_ax, box_ax = axes[row]

    # Left panel: histogram with a KDE overlay
    sns.histplot(values, kde=True, ax=hist_ax, color=colour)
    hist_ax.set_title(hist_title)
    hist_ax.set_xlabel(hist_xlabel)

    # Right panel: horizontal boxplot of the same values
    sns.boxplot(x=values, ax=box_ax, color=colour)
    box_ax.set_title(box_title)

# Tidy the spacing between panels, then render the figure
plt.tight_layout()
plt.show()