In [3]:
# Import necessary libraries and classes
import os

import pandas as pd

from dataPreprocessing.categorical_encoder import CategoricalEncoder
from dataPreprocessing.data_type_converter import DataTypeConverter
from dataPreprocessing.datetime_handler import DateTimeHandler
from dataPreprocessing.feature_engineer import FeatureEngineer
from dataPreprocessing.missing_value_handler import MissingValueHandler
from dataPreprocessing.outlier_handler import OutlierHandler
from dataPreprocessing.scaler import Scaler
from dataPreprocessing.text_cleaner import TextCleaner


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alper\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alper\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
# Load the dataset
# The CSV location can be overridden with the SAMPLE_DATA_CSV environment
# variable so the notebook is not tied to a single machine. When the variable
# is unset or empty, the original local path is used unchanged.
file_path = (
    os.environ.get("SAMPLE_DATA_CSV")
    or r'C:\Users\Alper\OneDrive\Masaüstü\synthetic_sample_data.csv'
)
data = pd.read_csv(file_path)

# NOTE(review): the recorded output shows an 'Unnamed: 0' column, which looks
# like a previously saved index - confirm whether it should be dropped (or read
# with index_col=0) before it reaches the numeric processing steps.

# Display the first few rows of the dataset
print("Initial Data:")
print(data.head())



Initial Data:
   Unnamed: 0   Movie Id        Genre Release Date    Rating  \
0          44  TOYHANVUR       Horror   07/07/1985  9.750176   
1          47  CORCILSLF  Documentary   05/08/1999  7.912876   
2          64  UBWUVIHEL    Adventure   08/05/1996  3.116053   
3          67  CGHMHKJHH      Fantasy   21/07/2018  8.811783   
4          67  WMLWHTAMN      Fantasy   28/10/2011  5.250991   

                                             Summary Shooting Location  \
0  A group of college students get more than they...          New York   
1  A documentary that sheds light on the devastat...             Paris   
2  A lost civilization is rediscovered deep withi...            Sydney   
3  A young inventor builds a time machine and emb...             Paris   
4  A young inventor builds a time machine and emb...            Sydney   

   Budget in USD  Awards  Popular  
0   1.035408e+07       2        1  
1   5.722105e+06       1        1  
2   5.601372e+07       0        0  
3   4.422451

In [19]:
# Handling Missing Values
# Each fill strategy is applied to the 'Rating' column in turn (mean, then
# median, then mode), and a preview is printed after every step.
# NOTE(review): presumably the first fill leaves nothing for the later
# strategies to act on - confirm against MissingValueHandler.
fill_strategies = (
    ("mean", MissingValueHandler.fill_with_mean),
    ("median", MissingValueHandler.fill_with_median),
    ("mode", MissingValueHandler.fill_with_mode),
)
for strategy_name, fill_rating in fill_strategies:
    data = fill_rating(data, 'Rating')
    print(f"\nData after handling missing values ({strategy_name}):")
    print(data.head())

# Finally, drop any rows that still have a missing 'Rating'
data = MissingValueHandler.drop_missing(data, 'Rating')
print("\nData after dropping missing values:", data.head(), sep="\n")



Data after handling missing values (mean):
   Unnamed: 0   Movie Id        Genre Release Date    Rating  \
0          44  TOYHANVUR       Horror   07/07/1985  9.750176   
1          47  CORCILSLF  Documentary   05/08/1999  7.912876   
2          64  UBWUVIHEL    Adventure   08/05/1996  3.116053   
3          67  CGHMHKJHH      Fantasy   21/07/2018  8.811783   
4          67  WMLWHTAMN      Fantasy   28/10/2011  5.250991   

                                             Summary Shooting Location  \
0  A group of college students get more than they...          New York   
1  A documentary that sheds light on the devastat...             Paris   
2  A lost civilization is rediscovered deep withi...            Sydney   
3  A young inventor builds a time machine and emb...             Paris   
4  A young inventor builds a time machine and emb...            Sydney   

   Budget in USD  Awards  Popular  
0   1.035408e+07       2        1  
1   5.722105e+06       1        1  
2   5.601372e+07  

In [20]:
# Handling Outliers
# Build an IQR-based handler (threshold 1.5) and run it over the whole frame.
# NOTE(review): in the recorded outputs 'Popular' reads 1 for rows 0 and 1
# before this step and 0 afterwards - confirm whether flag/identifier columns
# should be excluded from outlier handling.
outlier_handler = OutlierHandler(threshold=1.5, method="iqr")
data = outlier_handler.fit_transform(data)
print("\nData after handling outliers (IQR method):", data.head(), sep="\n")



Data after handling outliers (IQR method):
   Unnamed: 0   Movie Id        Genre Release Date    Rating  \
0          44  TOYHANVUR       Horror   07/07/1985  9.750176   
1          47  CORCILSLF  Documentary   05/08/1999  7.912876   
2          64  UBWUVIHEL    Adventure   08/05/1996  4.749800   
3          67  CGHMHKJHH      Fantasy   21/07/2018  8.811783   
4          67  WMLWHTAMN      Fantasy   28/10/2011  5.250991   

                                             Summary Shooting Location  \
0  A group of college students get more than they...          New York   
1  A documentary that sheds light on the devastat...             Paris   
2  A lost civilization is rediscovered deep withi...            Sydney   
3  A young inventor builds a time machine and emb...             Paris   
4  A young inventor builds a time machine and emb...            Sydney   

   Budget in USD  Awards  Popular  
0   1.035408e+07       2        0  
1   5.722105e+06       1        0  
2   5.601372e+07  

In [21]:
# Data Scaling
# Standard scaling is applied first; min-max scaling is then applied on top of
# the already standardized frame. `scaler` ends up bound to the min-max scaler.
for scaling_method, display_name in (("standard", "standard"), ("minmax", "min-max")):
    scaler = Scaler(method=scaling_method)
    data = scaler.fit_transform(data)
    print(f"\nData after scaling ({display_name}):")
    print(data.head())



Data after scaling (standard):
   Unnamed: 0   Movie Id        Genre Release Date    Rating  \
0   -0.208069  TOYHANVUR       Horror   07/07/1985  1.024764   
1   -0.106241  CORCILSLF  Documentary   05/08/1999 -0.138680   
2    0.470787  UBWUVIHEL    Adventure   08/05/1996 -2.141655   
3    0.572615  CGHMHKJHH      Fantasy   21/07/2018  0.430540   
4    0.572615  WMLWHTAMN      Fantasy   28/10/2011 -1.824283   

                                             Summary Shooting Location  \
0  A group of college students get more than they...          New York   
1  A documentary that sheds light on the devastat...             Paris   
2  A lost civilization is rediscovered deep withi...            Sydney   
3  A young inventor builds a time machine and emb...             Paris   
4  A young inventor builds a time machine and emb...            Sydney   

   Budget in USD    Awards  Popular  
0      -1.258172  1.160412      0.0  
1      -1.415478 -0.055953      0.0  
2       0.292471 -1.2723

In [22]:
# Text Cleaning
# Configure a cleaner with stop-word removal and lemmatization switched on
text_cleaner = TextCleaner(lemmatize=True, remove_stopwords=True)

# Run the cleaner over every 'Summary' entry and store the result back
cleaned_summaries = data['Summary'].apply(text_cleaner.clean)
data['Summary'] = cleaned_summaries
print("\nData after text cleaning:", data[['Summary']].head(), sep="\n")



Data after text cleaning:
                                             Summary
0  group college student get bargained spend week...
1  documentary shed light devastating effect clim...
2  lost civilization rediscovered deep within ama...
3  young inventor build time machine embarks jour...
4  young inventor build time machine embarks jour...


In [9]:
### Categorical Encoding ###
# Label-encode 'Genre'. The call returns the updated frame plus a second
# object kept as `le` (presumably the fitted encoder - confirm).
genre_encoding = CategoricalEncoder.label_encode(data, 'Genre')
data, le = genre_encoding
print("\nData after label encoding 'Genre':", data.head(), sep="\n")



Data after label encoding 'Genre':
   Unnamed: 0   Movie Id  Genre Release Date    Rating  \
0   -0.208069  TOYHANVUR     11   07/07/1985  1.024764   
1   -0.106241  CORCILSLF      5   05/08/1999 -0.138680   
2    0.470787  UBWUVIHEL      1   08/05/1996 -2.141655   
3    0.572615  CGHMHKJHH      8   21/07/2018  0.430540   
4    0.572615  WMLWHTAMN      8   28/10/2011 -1.824283   

                                             Summary Shooting Location  \
0  group college student get bargained spend week...          New York   
1  documentary shed light devastating effect clim...             Paris   
2  lost civilization rediscovered deep within ama...            Sydney   
3  young inventor build time machine embarks jour...             Paris   
4  young inventor build time machine embarks jour...            Sydney   

   Budget in USD    Awards  Popular  
0      -1.258172  1.160412      0.0  
1      -1.415478 -0.055953      0.0  
2       0.292471 -1.272318      0.0  
3      -1.459616 -

In [10]:
# One-hot encode 'Shooting Location'. The call returns the updated frame plus
# a second object kept as `ohe` (presumably the fitted encoder - confirm).
location_encoding = CategoricalEncoder.one_hot_encode(data, 'Shooting Location')
data, ohe = location_encoding
print("\nData after one-hot encoding 'Shooting Location':", data.head(), sep="\n")



Data after one-hot encoding 'Shooting Location':
   Unnamed: 0   Movie Id  Genre Release Date    Rating  \
0   -0.208069  TOYHANVUR     11   07/07/1985  1.024764   
1   -0.106241  CORCILSLF      5   05/08/1999 -0.138680   
2    0.470787  UBWUVIHEL      1   08/05/1996 -2.141655   
3    0.572615  CGHMHKJHH      8   21/07/2018  0.430540   
4    0.572615  WMLWHTAMN      8   28/10/2011 -1.824283   

                                             Summary  Budget in USD    Awards  \
0  group college student get bargained spend week...      -1.258172  1.160412   
1  documentary shed light devastating effect clim...      -1.415478 -0.055953   
2  lost civilization rediscovered deep within ama...       0.292471 -1.272318   
3  young inventor build time machine embarks jour...      -1.459616 -0.055953   
4  young inventor build time machine embarks jour...       2.340832 -1.272318   

   Popular  Shooting Location_London  Shooting Location_Los Angeles  \
0      0.0                       0.0       

In [23]:
# Date and Time Manipulation
# Parse 'Release Date' using the day/month/year format
data = DateTimeHandler.convert_to_datetime(data, 'Release Date', format='%d/%m/%Y')
print("\nData after date conversion:", data.head(), sep="\n")

# Extract the year, month and day from 'Release Date', one component at a time
for component in ('year', 'month', 'day'):
    data = DateTimeHandler.extract_date_component(data, 'Release Date', component)
    print(f"\nData after extracting {component} from 'Release Date':")
    print(data.head())



Data after date conversion:
   Unnamed: 0   Movie Id        Genre Release Date    Rating  \
0        0.44  TOYHANVUR       Horror   1985-07-07  0.955072   
1        0.47  CORCILSLF  Documentary   1999-08-05  0.604148   
2        0.64  UBWUVIHEL    Adventure   1996-05-08  0.000000   
3        0.67  CGHMHKJHH      Fantasy   2018-07-21  0.775839   
4        0.67  WMLWHTAMN      Fantasy   2011-10-28  0.095728   

                                             Summary Shooting Location  \
0  group college student get bargained spend week...          New York   
1  documentary shed light devastating effect clim...             Paris   
2  lost civilization rediscovered deep within ama...            Sydney   
3  young inventor build time machine embarks jour...             Paris   
4  young inventor build time machine embarks jour...            Sydney   

   Budget in USD  Awards  Popular  
0       0.071021     1.0      0.0  
1       0.039214     0.5      0.0  
2       0.384560     0.0      0.0

In [12]:
# Feature Engineering
# Each entry pairs a FeatureEngineer operation with its column arguments; the
# last argument is always the name of the column being added.
#   - difference of 'Budget in USD' and 'Awards'
#   - product of 'Rating' and 'Popular'
#   - sum of 'Rating' and 'Awards'
#   - square of 'Rating'
feature_steps = [
    (FeatureEngineer.add_difference, ('Budget in USD', 'Awards', 'Budget_Awards_Diff')),
    (FeatureEngineer.add_product, ('Rating', 'Popular', 'Rating_Times_Popular')),
    (FeatureEngineer.add_sum, ('Rating', 'Awards', 'Rating_Plus_Awards')),
    (FeatureEngineer.add_square, ('Rating', 'Rating_Squared')),
]
for add_feature, column_args in feature_steps:
    data = add_feature(data, *column_args)
    print(f"\nData after adding '{column_args[-1]}':")
    print(data.head())



Data after feature engineering:
   Unnamed: 0   Movie Id  Genre Release Date    Rating  \
0   -0.208069  TOYHANVUR     11   1985-07-07  1.024764   
1   -0.106241  CORCILSLF      5   1999-08-05 -0.138680   
2    0.470787  UBWUVIHEL      1   1996-05-08 -2.141655   
3    0.572615  CGHMHKJHH      8   2018-07-21  0.430540   
4    0.572615  WMLWHTAMN      8   2011-10-28 -1.824283   

                                             Summary  Budget in USD    Awards  \
0  group college student get bargained spend week...      -1.258172  1.160412   
1  documentary shed light devastating effect clim...      -1.415478 -0.055953   
2  lost civilization rediscovered deep within ama...       0.292471 -1.272318   
3  young inventor build time machine embarks jour...      -1.459616 -0.055953   
4  young inventor build time machine embarks jour...       2.340832 -1.272318   

   Popular  Shooting Location_London  Shooting Location_Los Angeles  \
0      0.0                       0.0                        

In [13]:
# Data Type Conversion
# Convert 'Rating' to numeric, then 'Genre' to categorical, previewing the
# frame after each conversion.
dtype_conversions = (
    (DataTypeConverter.to_numeric, 'Rating', 'numeric'),
    (DataTypeConverter.to_categorical, 'Genre', 'categorical'),
)
for convert_column, column_name, target_kind in dtype_conversions:
    data = convert_column(data, column_name)
    print(f"\nData after converting '{column_name}' to {target_kind}:")
    print(data.head())



Data after data type conversion:
   Unnamed: 0   Movie Id Genre Release Date    Rating  \
0   -0.208069  TOYHANVUR    11   1985-07-07  1.024764   
1   -0.106241  CORCILSLF     5   1999-08-05 -0.138680   
2    0.470787  UBWUVIHEL     1   1996-05-08 -2.141655   
3    0.572615  CGHMHKJHH     8   2018-07-21  0.430540   
4    0.572615  WMLWHTAMN     8   2011-10-28 -1.824283   

                                             Summary  Budget in USD    Awards  \
0  group college student get bargained spend week...      -1.258172  1.160412   
1  documentary shed light devastating effect clim...      -1.415478 -0.055953   
2  lost civilization rediscovered deep within ama...       0.292471 -1.272318   
3  young inventor build time machine embarks jour...      -1.459616 -0.055953   
4  young inventor build time machine embarks jour...       2.340832 -1.272318   

   Popular  Shooting Location_London  Shooting Location_Los Angeles  \
0      0.0                       0.0                            0

In [14]:
# Show a preview of the fully processed dataset
print("\nFinal Processed Data:", data.head(), sep="\n")



Final Processed Data:
   Unnamed: 0   Movie Id Genre Release Date    Rating  \
0   -0.208069  TOYHANVUR    11   1985-07-07  1.024764   
1   -0.106241  CORCILSLF     5   1999-08-05 -0.138680   
2    0.470787  UBWUVIHEL     1   1996-05-08 -2.141655   
3    0.572615  CGHMHKJHH     8   2018-07-21  0.430540   
4    0.572615  WMLWHTAMN     8   2011-10-28 -1.824283   

                                             Summary  Budget in USD    Awards  \
0  group college student get bargained spend week...      -1.258172  1.160412   
1  documentary shed light devastating effect clim...      -1.415478 -0.055953   
2  lost civilization rediscovered deep within ama...       0.292471 -1.272318   
3  young inventor build time machine embarks jour...      -1.459616 -0.055953   
4  young inventor build time machine embarks jour...       2.340832 -1.272318   

   Popular  Shooting Location_London  Shooting Location_Los Angeles  \
0      0.0                       0.0                            0.0   
1    