# Data Filtering and Selection

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import zscore
from textblob import TextBlob
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import sys
sys.path.append('../utils')
import functions

In [None]:
# Load the cleaned films dataset from the local data folder and preview
# its first five rows.
movie_df = pd.read_csv(filepath_or_buffer='../data/local/clean/films_19to24.csv')
movie_df.head(n=5)

### Remove Rows with Irrelevant information
- Rows with missing or incomplete data
- Rows with outliers or extreme values
- Rows with zero or negative revenue and budget
- Rows with zero or extremely low ratings
- Duplicates
- Rows with low votes
- Rows with a runtime of 40 minutes or less

In [None]:
# Work on an independent (deep) copy so the filters below never touch movie_df.
cleaned_df = movie_df.copy(deep=True)

In [None]:
# # Step 1: Remove rows with critical missing values (imdb_rating, imdb_votes, imdb_id)
# initial_row_count = len(cleaned_df)
# cleaned_df = cleaned_df.dropna(subset=['imdb_rating', 'imdb_votes', 'imdb_id'])
# rows_removed_step_1 = initial_row_count - len(cleaned_df)
# print(f'Rows removed after dropping rows with missing critical values: {rows_removed_step_1}')

In [None]:
# # Step 2: Remove rows with extreme z-scores for budget, revenue, or popularity
# initial_row_count = len(cleaned_df)
# cleaned_df = cleaned_df[(cleaned_df['budget_zscore'].abs() < 3) & 
#                         (cleaned_df['revenue_zscore'].abs() < 3) &
#                         (cleaned_df['popularity_zscore'].abs() < 3)]
# rows_removed_step_2 = initial_row_count - len(cleaned_df)
# print(f'Rows removed after filtering extreme z-scores: {rows_removed_step_2}')

In [None]:
# # Step 3: Remove rows with zero or negative revenue and budget
# initial_row_count = len(cleaned_df)
# cleaned_df = cleaned_df[(cleaned_df['revenue'] > 0) & (cleaned_df['budget'] > 0)]
# rows_removed_step_3 = initial_row_count - len(cleaned_df)
# print(f'Rows removed after filtering zero or negative revenue/budget: {rows_removed_step_3}')

In [None]:
# Keep only films that run longer than 40 minutes. Rows with a runtime of
# exactly 40 or with a missing runtime fail the comparison and are dropped too.
initial_row_count = cleaned_df.shape[0]
cleaned_df = cleaned_df.loc[cleaned_df['runtime'].gt(40)]
rows_removed_step_4 = initial_row_count - cleaned_df.shape[0]
print(f'Rows removed after filtering runtime under 40 minutes: {rows_removed_step_4}')

In [None]:
# Keep only films rated above 1 on BOTH TMDB and IMDb. A rating of exactly 1
# or a missing rating on either site fails the comparison and drops the row.
initial_row_count = cleaned_df.shape[0]
cleaned_df = cleaned_df.loc[
    cleaned_df['tmdb_rating'].gt(1) & cleaned_df['imdb_rating'].gt(1)
]
rows_removed_step_5 = initial_row_count - cleaned_df.shape[0]
print(f'Rows removed after filtering low ratings: {rows_removed_step_5}')

In [None]:
# Drop repeated films, keeping the first occurrence. A row only counts as a
# duplicate when tmdb_id, imdb_id AND title all match an earlier row.
initial_row_count = cleaned_df.shape[0]
cleaned_df = cleaned_df.drop_duplicates(
    subset=['tmdb_id', 'imdb_id', 'title'],
    keep='first',
)
rows_removed_step_6 = initial_row_count - cleaned_df.shape[0]
print(f'Rows removed after dropping duplicates: {rows_removed_step_6}')

In [None]:
# Require at least 10 votes on both TMDB and IMDb. Rows with a missing vote
# count on either site fail the comparison and are dropped as well.
initial_row_count = cleaned_df.shape[0]
cleaned_df = cleaned_df.loc[
    cleaned_df['tmdb_votes'].ge(10) & cleaned_df['imdb_votes'].ge(10)
]
rows_removed_step_7 = initial_row_count - cleaned_df.shape[0]
print(f'Rows removed after filtering low vote count: {rows_removed_step_7}')

In [None]:
# Report how many rows survived all of the filters above.
final_row_count = cleaned_df.shape[0]
print(f'Final number of rows in the cleaned dataset: {final_row_count}')

In [None]:
# Store IMDb vote counts as whole numbers: round to zero decimals first, then
# cast to int. The cast would fail on missing values; the vote-count filter
# above has already removed rows where imdb_votes is missing.
rounded_imdb_votes = cleaned_df['imdb_votes'].round(decimals=0)
cleaned_df['imdb_votes'] = rounded_imdb_votes.astype(int)

In [None]:
# Preview the first five rows of the cleaned dataset.
cleaned_df.head(n=5)

#### Create sample df
- Keep only rows where 'revenue', 'tmdb_votes', 'imdb_votes' and 'budget' are all non-zero

In [None]:
# Build the sample: keep rows in which none of the four columns equals 0.
# A missing value compares as "not equal to 0", so such rows are kept.
nonzero_columns = ['revenue', 'tmdb_votes', 'imdb_votes', 'budget']
all_nonzero = cleaned_df[nonzero_columns].ne(0).all(axis=1)
movie_sample_df = cleaned_df[all_nonzero]

display(movie_sample_df)

Create .csv

In [None]:
# Persist the sample next to the other cleaned datasets, without the index column.
movie_sample_df.to_csv(path_or_buf='../data/local/clean/movie_sample.csv', index=False)

### Data Overview

#### Descriptive Stats

In [None]:
# Summary statistics for the numerical columns
print("Descriptive Statistics:")
print(cleaned_df.describe())

# Data types and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("\nData Types and Non-null Counts:")
cleaned_df.info()

# Number of distinct values in each categorical column
print("\nUnique Values in Categorical Columns:")
print(cleaned_df[['genres', 'director', 'language', 'release_year']].nunique())


#### Check missing values

In [None]:
# Count the missing entries in every column.
print("\nMissing Values in Columns:")
missing_per_column = cleaned_df.isna().sum()
print(missing_per_column)


Language distribution

In [None]:
# Films per language (most frequent first) and each language's share, in
# percent, of all films that have a language value.
language_counts = cleaned_df['language'].value_counts()
language_percentages = cleaned_df['language'].value_counts(normalize=True).mul(100)

# One table with a 1-based rank followed by language, count and percentage.
language_stats = pd.DataFrame({
    'Rank': range(1, len(language_counts) + 1),
    'Language': language_counts.index,
    'Count': language_counts.values,
    'Percentage': language_percentages.values,
})

# Print the full table without the positional index.
print(language_stats.to_string(index=False))


#### Visualize Distributions

In [None]:
# Boxplots for a better understanding of the distributions.
# Each column gets its own subplot with an independent y-axis: drawing all six
# on one shared axis lets the largest-scale column set the range and flattens
# the others. (Presumably budget/revenue are raw monetary amounts while the
# ratings are single-digit scores — confirm against the data.)
boxplot_columns = ['budget', 'revenue', 'popularity', 'runtime', 'tmdb_rating', 'imdb_rating']

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
for column, ax in zip(boxplot_columns, axes.flatten()):
    sns.boxplot(y=cleaned_df[column], ax=ax)
    ax.set_title(column)
    ax.set_ylabel('')  # the subplot title already names the column

fig.suptitle('Boxplot of Numerical Columns')
plt.tight_layout()
plt.show()

In [None]:
# Map every plotted column to the title shown above its histogram.
histogram_titles = {
    'budget': 'Budget',
    'revenue': 'Revenue',
    'popularity': 'Popularity',
    'runtime': 'Runtime',
    'tmdb_rating': 'TMDB Rating',
    'imdb_rating': 'IMDB Rating',
}
columns = list(histogram_titles)
custom_labels = list(histogram_titles.values())

# 2 x 3 grid, flattened so it can be walked alongside the columns.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))
axes = axes.ravel()

for ax, (col, label) in zip(axes, histogram_titles.items()):
    cleaned_df[col].hist(ax=ax, bins=30, color='gold')
    ax.set_title(label)

plt.tight_layout()
plt.show()

#### Trends Over Time

In [None]:
# Group data by release year and calculate mean values
yearly_trends = cleaned_df.groupby('release_year').agg({
    'budget': 'mean',
    'revenue': 'mean',
    'popularity': 'mean',
    'tmdb_rating': 'mean',
    'imdb_rating': 'mean',
}).reset_index()

# Plot trends over time.
# Budget/revenue and the ratings are drawn on separate y-axes: on one shared
# axis the larger-scale series set the range and the rating lines collapse
# onto the baseline. (Presumably budget/revenue are raw monetary amounts —
# confirm against the data.)
years = yearly_trends['release_year']
fig, money_ax = plt.subplots(figsize=(12, 8))
rating_ax = money_ax.twinx()  # shares the x-axis, independent y-axis

money_ax.plot(years, yearly_trends['budget'], label='Budget', marker='o', color='darkcyan')
money_ax.plot(years, yearly_trends['revenue'], label='Revenue', marker='o', color='crimson')
rating_ax.plot(years, yearly_trends['tmdb_rating'], label='TMDB Rating', marker='o', color='gold')
rating_ax.plot(years, yearly_trends['imdb_rating'], label='IMDb Rating', marker='o', color='darkslateblue')

money_ax.set_xlabel('Year')
money_ax.set_ylabel('Average Budget / Revenue')
rating_ax.set_ylabel('Average Rating')
money_ax.set_title("Trends' Average Over Time (by Year)")

# Merge the entries of both axes into one legend; it is attached to the twin
# axis, which is drawn last, so the lines do not cover it.
money_handles, money_labels = money_ax.get_legend_handles_labels()
rating_handles, rating_labels = rating_ax.get_legend_handles_labels()
rating_ax.legend(money_handles + rating_handles, money_labels + rating_labels)
plt.show()


In [None]:
# Per-year averages of the numeric film metrics.
trend_columns = ['budget', 'revenue', 'popularity', 'tmdb_rating', 'imdb_rating', 'runtime']
yearly_trends = cleaned_df.groupby('release_year', as_index=False)[trend_columns].mean()

# Draw one line per metric: (column, legend label, line colour).
plotted_series = (
    ('popularity', 'Popularity', 'indianred'),
    ('tmdb_rating', 'TMDB Rating', 'gold'),
    ('imdb_rating', 'IMDb Rating', 'darkslateblue'),
)

plt.figure(figsize=(12, 8))
for column, label, color in plotted_series:
    plt.plot(yearly_trends['release_year'], yearly_trends[column],
             label=label, marker='o', color=color)
plt.xlabel('Year')
plt.ylabel('Value')
plt.title("Trends' Average Over Time (by Year)")
plt.legend()
plt.show()


#### Popularity and Revenue Relationship

In [None]:
# Scatter popularity against revenue and then against budget, one figure each,
# with points coloured by release year.
for target_column, target_label in (('revenue', 'Revenue'), ('budget', 'Budget')):
    plt.figure(figsize=(10, 6))
    sns.scatterplot(
        data=cleaned_df,
        x='popularity',
        y=target_column,
        hue='release_year',
        palette='viridis',
    )
    plt.title(f'Popularity vs {target_label}')
    plt.xlabel('Popularity')
    plt.ylabel(target_label)
    plt.show()


#### Rating Distribution by Genre

In [None]:
# # Take a 10% sample of the DataFrame for easier visualization
# df_sample = cleaned_df.sample(frac=0.1, random_state=42)

# # Split genres into separate rows for analysis
# df_genre = df_sample[['genres', 'tmdb_rating', 'imdb_rating']].explode('genres')

# # Plot TMDB Rating Distribution by Genre
# plt.figure(figsize=(12, 8))
# sns.boxplot(x='genres', y='tmdb_rating', data=df_genre)
# plt.title('TMDB Rating Distribution by Genre (Sampled Data)')
# plt.xticks(rotation=90)
# plt.show()

# # Plot IMDb Rating Distribution by Genre
# plt.figure(figsize=(12, 8))
# sns.boxplot(x='genres', y='imdb_rating', data=df_genre)
# plt.title('IMDb Rating Distribution by Genre (Sampled Data)')
# plt.xticks(rotation=90)
# plt.show()

#### Analyze Directors

In [None]:
# Per-director averages of the key numeric metrics.
director_metrics = ['budget', 'revenue', 'popularity', 'tmdb_rating', 'imdb_rating', 'runtime']
director_stats = cleaned_df.groupby('director', as_index=False)[director_metrics].mean()

# The ten directors with the highest average revenue, best first.
revenue_ranking = director_stats.sort_values(by='revenue', ascending=False)
top_directors = revenue_ranking.head(10)
print("\nTop 10 Directors by Average Revenue:")
print(top_directors[['director', 'revenue']])

# Horizontal bar chart of those ten directors.
plt.figure(figsize=(12, 8))
sns.barplot(data=top_directors, x='revenue', y='director', color='lightseagreen')
plt.title('Top 10 Directors by Average Revenue')
plt.xlabel('Average Revenue')
plt.ylabel('Director')
plt.show()

#### Sentiment Analysis

In [None]:
# Define a function to calculate sentiment polarity
def get_sentiment(text):
    """Return the TextBlob sentiment polarity of *text*.

    Args:
        text: The string to analyse.

    Returns:
        The polarity reported by ``TextBlob(text).sentiment.polarity``, or NaN
        when *text* is not a string (e.g. a missing title loaded as NaN), so
        that missing values stay missing instead of raising.
    """
    # TextBlob only accepts strings; a NaN/None cell would raise TypeError.
    if not isinstance(text, str):
        return float('nan')
    return TextBlob(text).sentiment.polarity

# Score every film title and store the polarity in a new 'sentiment' column.
title_polarity = cleaned_df['title'].map(get_sentiment)
cleaned_df['sentiment'] = title_polarity

# Histogram (30 bins) of the title polarities with a KDE curve on top.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(cleaned_df['sentiment'], bins=30, kde=True, ax=ax)
ax.set_title('Sentiment Distribution')
ax.set_xlabel('Sentiment Polarity')
ax.set_ylabel('Frequency')
plt.show()

Votes vs Languages

In [None]:
# Total TMDB and IMDb votes per language.
vote_columns = ['tmdb_votes', 'imdb_votes']
language_votes = cleaned_df.groupby('language')[vote_columns].sum()

# Number of films per language, as a one-column DataFrame.
language_counts = cleaned_df['language'].value_counts().rename('language_count').to_frame()

# Combine vote totals and film counts on the shared language index.
language_stats = language_votes.join(language_counts, how='inner')

# Correlate each site's vote total with the number of films per language.
films_per_language = language_stats['language_count']
correlation_tmdb = language_stats['tmdb_votes'].corr(films_per_language)
correlation_imdb = language_stats['imdb_votes'].corr(films_per_language)

print(f'Correlation between TMDB votes and number of films per language: {correlation_tmdb:.2f}')
print(f'Correlation between IMDb votes and number of films per language: {correlation_imdb:.2f}')


In [None]:
# Bar chart of the 15 languages with the most TMDB votes (kept to 15 for readability).
tmdb_votes_ranked = language_stats['tmdb_votes'].sort_values(ascending=False)
top_languages_tmdb = tmdb_votes_ranked.head(15)
ax = top_languages_tmdb.plot(kind='bar', figsize=(12, 8), color='cadetblue')
ax.set_title('Total TMDB Votes by Language (Top 15)')
ax.set_xlabel('Language')
ax.set_ylabel('Total TMDB Votes')
plt.xticks(rotation=45)
plt.show()

In [None]:


# Bar chart of the 15 languages with the most IMDb votes (kept to 15 for readability).
imdb_votes_ranked = language_stats['imdb_votes'].sort_values(ascending=False)
top_languages_imdb = imdb_votes_ranked.head(15)
ax = top_languages_imdb.plot(kind='bar', figsize=(12, 8), color='salmon')
ax.set_title('Total IMDb Votes by Language (Top 15)')
ax.set_xlabel('Language')
ax.set_ylabel('Total IMDb Votes')
plt.xticks(rotation=45)
plt.show()

In [None]:
# One point per language: number of films (x) against total TMDB votes (y).
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=language_stats, x='language_count', y='tmdb_votes', ax=ax)
ax.set_title('TMDB Votes vs. Number of Films per Language')
ax.set_xlabel('Number of Films per Language')
ax.set_ylabel('Total TMDB Votes')
plt.show()

In [None]:
# One point per language: number of films (x) against total IMDb votes (y).
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=language_stats, x='language_count', y='imdb_votes', ax=ax)
ax.set_title('IMDb Votes vs. Number of Films per Language')
ax.set_xlabel('Number of Films per Language')
ax.set_ylabel('Total IMDb Votes')
plt.show()

Same analysis, excluding English-language films

In [None]:
# Repeat the votes-per-language analysis without English-language films.
# NOTE(review): assumes the 'language' column stores full names such as
# 'English' rather than ISO codes like 'en' — confirm against the data.
df_non_english = cleaned_df[cleaned_df['language'] != 'English']

# Aggregate votes by language for both TMDB and IMDb (excluding English)
language_votes = df_non_english.groupby('language')[['tmdb_votes', 'imdb_votes']].sum()

# Calculate the number of films per language (excluding English) and convert to DataFrame
language_counts = df_non_english['language'].value_counts().to_frame(name='language_count')

# Merge the language_counts with the vote data on the shared language index
language_stats = language_votes.merge(language_counts, left_index=True, right_index=True)

# Calculate correlation between the votes and the number of films per language
correlation_tmdb = language_stats['tmdb_votes'].corr(language_stats['language_count'])
correlation_imdb = language_stats['imdb_votes'].corr(language_stats['language_count'])

# Print the correlations
print(f'Correlation between TMDB votes and number of films per language (excluding English): {correlation_tmdb:.2f}')
print(f'Correlation between IMDb votes and number of films per language (excluding English): {correlation_imdb:.2f}')

# Visualize the distribution of TMDB votes by language (top 15 for readability)
top_languages_tmdb = language_stats['tmdb_votes'].sort_values(ascending=False).head(15)
top_languages_tmdb.plot(kind='bar', figsize=(12, 8), color='cadetblue')
plt.title('Total TMDB Votes by Language (Top 15) - Excluding English')
plt.xlabel('Language')
plt.ylabel('Total TMDB Votes')
plt.xticks(rotation=45)
plt.show()

# Visualize the distribution of IMDb votes by language (top 15 for readability).
# Axis labels and plt.show() mirror the TMDB chart above so both figures are
# labelled consistently and the cell does not end by echoing the return value
# of plt.xticks().
top_languages_imdb = language_stats['imdb_votes'].sort_values(ascending=False).head(15)
top_languages_imdb.plot(kind='bar', figsize=(12, 8), color='salmon')
plt.title('Total IMDb Votes by Language (Top 15) - Excluding English')
plt.xlabel('Language')
plt.ylabel('Total IMDb Votes')
plt.xticks(rotation=45)
plt.show()

