# Data Filtering and Selection
- Read existing DataFrames
- Perform basic inspection, show missing information
- Generate sample datasets for further analysis

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import sys
sys.path.append('../utils')
import functions

In [31]:
# movie_df = pd.read_csv('../data/local/clean/films_19to24.csv')
# movie_df.head()

In [None]:
# Load the films_before19 CSV into movie_df and render it for a first look.
films_before19_path = '../data/local/clean/films_before19.csv'
movie_df = pd.read_csv(films_before19_path)
display(movie_df)

### Remove Rows with Irrelevant information
- Rows with missing or incomplete data
- Rows with outliers or extreme values
- Rows with zero or negative revenue and budget
- Rows with zero or extremely low ratings
- Duplicates
- Rows with low votes

In [33]:
# Work on an independent deep copy so the filtering steps never modify movie_df.
cleaned_df = movie_df.copy(deep=True)

In [None]:
# Keep only rows whose 'runtime' is strictly greater than 40 minutes
# (a runtime of exactly 40, or a missing runtime, is dropped as well).
initial_row_count = cleaned_df.shape[0]
is_long_enough = cleaned_df['runtime'].gt(40)
cleaned_df = cleaned_df.loc[is_long_enough]
rows_removed_step_4 = initial_row_count - cleaned_df.shape[0]
print(f'Rows removed after filtering runtime under 40 minutes: {rows_removed_step_4}')

In [None]:
# Keep only rows rated strictly above 1 in both 'tmdb_rating' and
# 'imdb_rating'; a value of exactly 1, or a missing value, in either column
# removes the row.
initial_row_count = cleaned_df.shape[0]
tmdb_rating_ok = cleaned_df['tmdb_rating'].gt(1)
imdb_rating_ok = cleaned_df['imdb_rating'].gt(1)
cleaned_df = cleaned_df.loc[tmdb_rating_ok & imdb_rating_ok]
rows_removed_step_5 = initial_row_count - cleaned_df.shape[0]
print(f'Rows removed after filtering low ratings: {rows_removed_step_5}')

In [None]:
# Drop rows that repeat an earlier row's (tmdb_id, imdb_id, title)
# combination, keeping the first occurrence of each.
initial_row_count = cleaned_df.shape[0]
identity_columns = ['tmdb_id', 'imdb_id', 'title']
cleaned_df = cleaned_df.drop_duplicates(subset=identity_columns, keep='first')
rows_removed_step_6 = initial_row_count - cleaned_df.shape[0]
print(f'Rows removed after dropping duplicates: {rows_removed_step_6}')

In [None]:
# Require at least 10 votes in both 'tmdb_votes' and 'imdb_votes'; rows that
# fall short (or have a missing count) in either column are dropped.
initial_row_count = cleaned_df.shape[0]
min_votes = 10
has_enough_votes = (
    cleaned_df['tmdb_votes'].ge(min_votes)
    & cleaned_df['imdb_votes'].ge(min_votes)
)
cleaned_df = cleaned_df.loc[has_enough_votes]
rows_removed_step_7 = initial_row_count - cleaned_df.shape[0]
print(f'Rows removed after filtering low vote count: {rows_removed_step_7}')

In [None]:
# Report how many rows remain in cleaned_df at this point.
final_row_count = cleaned_df.shape[0]
print(f'Final number of rows in the cleaned dataset: {final_row_count}')

In [42]:
# Round 'imdb_votes' to the nearest whole number and store it as integers.
# cleaned_df is the result of boolean-mask filtering, so assigning a column
# in place can raise pandas' SettingWithCopyWarning. assign() builds a new
# DataFrame instead, and rebinding the name avoids the ambiguous write.
cleaned_df = cleaned_df.assign(
    imdb_votes=cleaned_df['imdb_votes'].round(0).astype(int)
)

In [None]:
# Render cleaned_df to inspect the result of the cleaning steps.
display(cleaned_df)

#### Create sample df
- Keep only rows where 'revenue', 'tmdb_votes', 'imdb_votes' and 'budget' are all non-zero

In [None]:
# Build the sample from rows in which none of the listed columns is exactly 0.
required_nonzero = ['revenue', 'tmdb_votes', 'imdb_votes', 'budget']
all_nonzero = cleaned_df[required_nonzero].ne(0).all(axis=1)
movie_sample_df = cleaned_df.loc[all_nonzero]

display(movie_sample_df)

Create .csv

In [45]:
# movie_sample_df.to_csv('../data/local/clean/movie_sample.csv', index=False)

### Data Overview

#### Descriptive Stats

In [None]:
# Summary statistics as produced by DataFrame.describe().
print('Descriptive Statistics (numerical columns):')
print(cleaned_df.describe())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
print('\nData Types and Non-null Counts:')
cleaned_df.info()

# Number of distinct values in each of the selected columns.
print('\nUnique Values in (categorical columns):')
print(cleaned_df[['genres', 'director', 'language', 'release_year']].nunique())

#### Check missing values

In [None]:
# Count the null entries in every column of cleaned_df.
print('\nMissing Values in Columns:')
missing_per_column = cleaned_df.isna().sum()
print(missing_per_column)