In [None]:
# Attempting to Split out original uncleaned Primary D/S but FIRST REDUCING the Hydra-Movies dataset TO 1000 ROWS

In [2]:
import numpy as np
import pandas as pd
import chardet
import scipy.stats as stats

# Load the dataset
file_path = 'Hydra-Movie-Scrape.csv'

# Read file in binary to detect the encoding
with open(file_path, 'rb') as file:
    raw_data = file.read(100000)  # Read only the first 100,000 bytes or we risk exceeding data rate limit

# Detect the encoding used
result = chardet.detect(raw_data)
encoding = result['encoding']

# Load the CSV file and apply the detected encoding
df = pd.read_csv(file_path, encoding=encoding)

# Examine the dataset to identify columns with missing values and outliers.
# Display the first few rows of the pandas dataframe
print(df.head(15))

                                                Title  Year  \
0                         Patton Oswalt: Annihilation  2017   
1                                       New York Doll  2005   
2   Mickey's Magical Christmas: Snowed in at the H...  2001   
3                          Mickey's House of Villains  2001   
4                                       And Then I Go  2017   
5                            An Extremely Goofy Movie  2000   
6                                        Peter Rabbit  2018   
7                                          Love Songs  2007   
8                                                  89  2017   
9                                      The Foster Boy  2011   
10                                    Forever My Girl  2018   
11                            Tom Segura: Disgraceful  2018   
12      The Secret Rules of Modern Living: Algorithms  2015   
13                                Secrets in the Fall  2015   
14                                       Silent Night  

In [3]:
print('\nThe dataset shape before splitting:')
df.shape


The dataset shape before splitting:


(3940, 13)

In [4]:
#Reduce Dataset down to 1000 rows:

# Step 1: Sample 1000 random rows from the original dataset
df_sampled = df.sample(n=1000, random_state=42)  # random_state ensures reproducibility

print('\nThe dataset shape after randomly reducing:')
print(df_sampled.shape)



The dataset shape after randomly reducing:
(1000, 13)


**Perform Data Cleaning on the Reduced Hydra-Movie dataset(1000 rows)**

In [5]:
# Display info about the dataframe
df_sampled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 3575 to 879
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Title            1000 non-null   object 
 1   Year             1000 non-null   int64  
 2   Summary          998 non-null    object 
 3   Short Summary    1000 non-null   object 
 4   Genres           1000 non-null   object 
 5   IMDB ID          1000 non-null   object 
 6   Runtime          1000 non-null   int64  
 7   YouTube Trailer  985 non-null    object 
 8   Rating           1000 non-null   float64
 9   Movie Poster     1000 non-null   object 
 10  Director         1000 non-null   object 
 11  Writers          995 non-null    object 
 12  Cast             991 non-null    object 
dtypes: float64(1), int64(2), object(10)
memory usage: 109.4+ KB


**Handle Data Inconsistencies**

Normalize the field "IMDB ID" to remove leading "tt" or "tt0" chars so that primary dataset will match same field in secondary dataset

In [6]:
# Normalize the field "IMDB ID"

#to remove leading "tt" or "tt0" chars so that primary dataset will match same field 
# in secondary dataset

# Remove leading "tt" or "tt0" from IMDB ID
df_sampled['IMDB ID'] = df_sampled['IMDB ID'].str.replace(r'^tt0?', '', regex=True)

# Verify the update to IMDB ID
print(df_sampled[['IMDB ID']].head())

      IMDB ID
3575   466875
733   1422020
196   2321405
869   1741273
2717  1591479


**Handle Missing Data**

In [7]:
# Examing missing values:

# Print missing values percentages:
missing_ratio = df_sampled.isnull().mean()
print("\nPercentage Missing Values:")
print(missing_ratio)


Percentage Missing Values:
Title              0.000
Year               0.000
Summary            0.002
Short Summary      0.000
Genres             0.000
IMDB ID            0.000
Runtime            0.000
YouTube Trailer    0.015
Rating             0.000
Movie Poster       0.000
Director           0.000
Writers            0.005
Cast               0.009
dtype: float64


> <ins>Note:</ins> Since there are virutally no Nan's in the dataset we will move on.

**Handle Duplicates**

In [8]:
# Check if duplicates exist

# Identify duplicate rows
duplicates = df_sampled.duplicated()

# Count the number of duplicates
num_duplicates = duplicates.sum()

# Display the number of duplicates
print(f'Number of duplicate rows: {num_duplicates}')

Number of duplicate rows: 0


> <ins>Note:</ins> Since there are no duplicates, we will move on.

**Handle Outliers**

In [9]:
# Set the global display format for floating-point numbers
pd.options.display.float_format = '{:.2f}'.format

# Examine the distributions
print("\nData Distributions:")
print(df_sampled.describe())
print("\nObjects-Categorical Variables:")
print(df_sampled.describe(include=['object','string']))


Data Distributions:
         Year  Runtime  Rating
count 1000.00  1000.00 1000.00
mean  2012.08   100.96    6.60
std      4.87    30.43    0.88
min   2000.00     0.00    1.70
25%   2009.00    92.00    6.10
50%   2013.00   102.00    6.60
75%   2016.00   115.00    7.20
max   2018.00   338.00    9.00

Objects-Categorical Variables:
                   Title                                            Summary  \
count               1000                                                998   
unique              1000                                                998   
top     Left in Darkness  A young woman, whose mother died giving birth ...   
freq                   1                                                  1   

                                            Short Summary Genres IMDB ID  \
count                                                1000   1000    1000   
unique                                               1000    384    1000   
top     A young woman, whose mother died giv

> <ins>Note:</ins> Descriptive Stats shows us that the Min value for Runtime is 0 which should not be the case. Further
investigation shows 193 records where Runtime = 0 :

In [10]:
# Outlier Work

# Ensure the 'Runtime' column is numeric
df_sampled['Runtime'] = pd.to_numeric(df_sampled['Runtime'], errors='coerce')

# Calculate the number of records where Runtime is 0
num_runtime_zero = (df_sampled['Runtime'] == 0).sum()

# Display the number of records with Runtime = 0
print(f'The number of records where Runtime is 0: {num_runtime_zero}')

The number of records where Runtime is 0: 50


**Use Imputation to update those records with Average Runtime instead:**

In [11]:
# Since Runtime is an int, calculate the average runtime of non-zero values and round to the nearest integer
average_runtime_non_zero = round(df_sampled[df_sampled['Runtime'] != 0]['Runtime'].mean())

# Display the rounded average runtime
print(f'The rounded average runtime of all non-zero movies is: {average_runtime_non_zero} minutes')

The rounded average runtime of all non-zero movies is: 106 minutes


In [12]:
# Then we Impute records with zero Runtime values with the average Runtime value:
df_sampled.loc[df_sampled['Runtime'] == 0, 'Runtime'] = average_runtime_non_zero

# Verify the update
num_runtime_zero_after_imputation = (df_sampled['Runtime'] == 0).sum()

# Print verification results
print(f'The number of records where Runtime is 0 after imputation: {num_runtime_zero_after_imputation}')
print(f'The average runtime of all movies is now: {df_sampled["Runtime"].mean()} minutes')

The number of records where Runtime is 0 after imputation: 0
The average runtime of all movies is now: 106.256 minutes


In [13]:
# Lastly, we force the imputed values to go into Runtime as int:
df_sampled['Runtime'] = df_sampled['Runtime'].astype(int)

**Expand/Explode Genre and Cast into the sampled dataset**

Here we expand the dataframe creating new rows for the movies, one new row for each genre and cast member combination.

In [14]:
# Step 1: Expand the `Genres` and `Cast` columns in the sampled data

# Split and explode the Genres column
df_sampled['Genres'] = df_sampled['Genres'].str.split('|')
df_expanded = df_sampled.explode('Genres')

# Split and explode the Cast column
df_expanded['Cast'] = df_expanded['Cast'].str.split('|')
df_expanded = df_expanded.explode('Cast')

# Display the shape of the expanded dataframe
print('\nThe reduced dataset shape splitting:')
print(df_expanded.shape)

# Continue with further analysis (e.g., one-hot encoding, predictive modeling)


The reduced dataset shape splitting:
(8571, 13)


In [20]:
df_expanded

Unnamed: 0,Title,Year,Summary,Short Summary,Genres,IMDB ID,Runtime,YouTube Trailer,Rating,Movie Poster,Director,Writers,Cast
3575,Left in Darkness,2006,"A young woman, whose mother died giving birth ...","A young woman, whose mother died giving birth ...",Horror,466875,88,eABWfWuSYpc,5.00,https://hydramovies.com/wp-content/uploads/201...,Steven R. Monroe,Philip Daay,David Anders
3575,Left in Darkness,2006,"A young woman, whose mother died giving birth ...","A young woman, whose mother died giving birth ...",Horror,466875,88,eABWfWuSYpc,5.00,https://hydramovies.com/wp-content/uploads/201...,Steven R. Monroe,Philip Daay,Jessica Stroup
3575,Left in Darkness,2006,"A young woman, whose mother died giving birth ...","A young woman, whose mother died giving birth ...",Horror,466875,88,eABWfWuSYpc,5.00,https://hydramovies.com/wp-content/uploads/201...,Steven R. Monroe,Philip Daay,Monica Keena
3575,Left in Darkness,2006,"A young woman, whose mother died giving birth ...","A young woman, whose mother died giving birth ...",Thriller,466875,88,eABWfWuSYpc,5.00,https://hydramovies.com/wp-content/uploads/201...,Steven R. Monroe,Philip Daay,David Anders
3575,Left in Darkness,2006,"A young woman, whose mother died giving birth ...","A young woman, whose mother died giving birth ...",Thriller,466875,88,eABWfWuSYpc,5.00,https://hydramovies.com/wp-content/uploads/201...,Steven R. Monroe,Philip Daay,Jessica Stroup
...,...,...,...,...,...,...,...,...,...,...,...,...,...
879,Steve Jobs,2015,His passion and ingenuity have been the drivin...,Steve Jobs takes us behind the scenes of the d...,Drama,2080374,122,aEr6K1bwIVs,7.20,https://hydramovies.com/wp-content/uploads/201...,Danny Boyle,Aaron Sorkin,Michael Fassbender
879,Steve Jobs,2015,His passion and ingenuity have been the drivin...,Steve Jobs takes us behind the scenes of the d...,Drama,2080374,122,aEr6K1bwIVs,7.20,https://hydramovies.com/wp-content/uploads/201...,Danny Boyle,Aaron Sorkin,Seth Rogen
879,Steve Jobs,2015,His passion and ingenuity have been the drivin...,Steve Jobs takes us behind the scenes of the d...,History,2080374,122,aEr6K1bwIVs,7.20,https://hydramovies.com/wp-content/uploads/201...,Danny Boyle,Aaron Sorkin,Kate Winslet
879,Steve Jobs,2015,His passion and ingenuity have been the drivin...,Steve Jobs takes us behind the scenes of the d...,History,2080374,122,aEr6K1bwIVs,7.20,https://hydramovies.com/wp-content/uploads/201...,Danny Boyle,Aaron Sorkin,Michael Fassbender


**Save an output CSV before one-hot encoding process, to simplify the visualizations for the EDA process.**

In [19]:
# Write the cleaned data to an output CSV file with UTF-8 BOM encoding 

output_file_path = 'Cleaned-Reduced-Movie-Primary_forEDA.csv'

# Display info about the dataframe
df_expanded.to_csv(output_file_path, index=False, encoding='utf-8-sig')

print(f'The cleaned data (for EDA process) has been written to {output_file_path}')

The cleaned data (for EDA process) has been written to Cleaned-Reduced-Movie-Primary_forEDA.csv


**Apply One-Hot Encoding to the reduced and exploded Movies dataset to allow for further predictive analysis**

In [15]:
# Apply one-hot encoding to the 'Genres' and 'Cast' columns => tag the various exploded Genre and Cast columns with 
df_encoded = pd.get_dummies(df_expanded, columns=['Genres', 'Cast'], prefix=['Genre', 'Cast'])

# Display the first few rows of the transformed dataframe
df_encoded.head()


Unnamed: 0,Title,Year,Summary,Short Summary,IMDB ID,Runtime,YouTube Trailer,Rating,Movie Poster,Director,...,Cast_Zack Gold,Cast_Zaira Wasim,Cast_Zane Austin,Cast_Zita Hanrot,Cast_Zoe Kazan,Cast_Zoe Saldana,Cast_Zooey Deschanel,Cast_Zoë Kravitz,Cast_Zuleikha Robinson,Cast_Élodie Bouchez
3575,Left in Darkness,2006,"A young woman, whose mother died giving birth ...","A young woman, whose mother died giving birth ...",466875,88,eABWfWuSYpc,5.0,https://hydramovies.com/wp-content/uploads/201...,Steven R. Monroe,...,False,False,False,False,False,False,False,False,False,False
3575,Left in Darkness,2006,"A young woman, whose mother died giving birth ...","A young woman, whose mother died giving birth ...",466875,88,eABWfWuSYpc,5.0,https://hydramovies.com/wp-content/uploads/201...,Steven R. Monroe,...,False,False,False,False,False,False,False,False,False,False
3575,Left in Darkness,2006,"A young woman, whose mother died giving birth ...","A young woman, whose mother died giving birth ...",466875,88,eABWfWuSYpc,5.0,https://hydramovies.com/wp-content/uploads/201...,Steven R. Monroe,...,False,False,False,False,False,False,False,False,False,False
3575,Left in Darkness,2006,"A young woman, whose mother died giving birth ...","A young woman, whose mother died giving birth ...",466875,88,eABWfWuSYpc,5.0,https://hydramovies.com/wp-content/uploads/201...,Steven R. Monroe,...,False,False,False,False,False,False,False,False,False,False
3575,Left in Darkness,2006,"A young woman, whose mother died giving birth ...","A young woman, whose mother died giving birth ...",466875,88,eABWfWuSYpc,5.0,https://hydramovies.com/wp-content/uploads/201...,Steven R. Monroe,...,False,False,False,False,False,False,False,False,False,False


In [16]:
print('\nThe reduced dataset shape after splitting and one-hot encoding:')
df_encoded.shape


The reduced dataset shape after splitting and one-hot encoding:


(8571, 1794)

> <ins>Note:</ins> As we can see, one-hot encoding has vastly increased the number of columns. 

In [17]:
# Display info about the dataframe
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8571 entries, 3575 to 879
Columns: 1794 entries, Title to Cast_Élodie Bouchez
dtypes: bool(1783), float64(1), int32(1), int64(1), object(8)
memory usage: 15.3+ MB


**Write cleaned data to an output CSV file:**

In [18]:
# Write the cleaned data to an output CSV file with UTF-8 BOM encoding

output_file_path = 'Cleaned-Reduced-Movie-Primary.csv'

# Display info about the dataframe
df_encoded.to_csv(output_file_path, index=False, encoding='utf-8-sig')

print(f'The cleaned data has been written to {output_file_path}')

The cleaned data has been written to Cleaned-Reduced-Movie-Primary.csv


**Our Target Variable is Movie Rating (column: Rating).**