In [4]:
# Import necessary libraries and classes
import pandas as pd
from dataPreprocessing.categorical_encoder import CategoricalEncoder
from dataPreprocessing.data_type_converter import DataTypeConverter
from dataPreprocessing.datetime_handler import DateTimeHandler
from dataPreprocessing.feature_engineer import FeatureEngineer
from dataPreprocessing.missing_value_handler import MissingValueHandler
from dataPreprocessing.outlier_handler import OutlierHandler
from dataPreprocessing.scaler import Scaler
from dataPreprocessing.text_cleaner import TextCleaner

def _report(title, frame):
    """Print *title*, then the first rows of *frame* (``frame.head()``)."""
    print(title)
    print(frame.head())


# --- Load -------------------------------------------------------------------
# NOTE(review): absolute, machine-specific path; the script only runs where
# this exact file exists.
file_path = r'C:\Users\Alper\OneDrive\Masaüstü\synthetic_sample_data.csv'
data = pd.read_csv(file_path)
_report("Initial Data:", data)

# --- Missing values ---------------------------------------------------------
# Mean-impute the 'Rating' column.
data = MissingValueHandler.fill_with_mean(data, 'Rating')
_report("\nData after handling missing values:", data)

# --- Outliers ---------------------------------------------------------------
# IQR-based handling with a 1.5 threshold, applied to the whole frame.
# NOTE(review): the frame still holds text columns here; presumably the
# handler restricts itself to numeric columns -- confirm.
iqr_handler = OutlierHandler(method="iqr", threshold=1.5)
data = iqr_handler.fit_transform(data)
_report("\nData after handling outliers:", data)

# --- Scaling ----------------------------------------------------------------
# Standard scaling of the whole frame.
# NOTE(review): this runs before feature engineering, so if the scaler
# transforms 'Budget in USD', 'Awards', 'Rating' and 'Popular', the derived
# features below are built from scaled values -- confirm this is intended.
standard_scaler = Scaler(method="standard")
data = standard_scaler.fit_transform(data)
_report("\nData after scaling:", data)

# --- Text cleaning ----------------------------------------------------------
# Run every 'Summary' entry through the cleaner (stop-word removal and
# lemmatization are both switched on).
text_cleaner = TextCleaner(remove_stopwords=True, lemmatize=True)
cleaned_summaries = data['Summary'].apply(text_cleaner.clean)
data['Summary'] = cleaned_summaries
_report("\nData after text cleaning:", data[['Summary']])

# --- Categorical encoding ---------------------------------------------------
# Both encoders return the updated frame together with a second object
# (presumably the fitted encoder -- verify against CategoricalEncoder).
data, le = CategoricalEncoder.label_encode(data, 'Genre')
_report("\nData after label encoding 'Genre':", data)

data, ohe = CategoricalEncoder.one_hot_encode(data, 'Shooting Location')
_report("\nData after one-hot encoding 'Shooting Location':", data)

# --- Dates ------------------------------------------------------------------
# Parse 'Release Date' as day/month/year, then extract its year component.
data = DateTimeHandler.convert_to_datetime(
    data, 'Release Date', format='%d/%m/%Y'
)
data = DateTimeHandler.extract_date_component(data, 'Release Date', 'year')
_report("\nData after date conversion and extraction:", data)

# --- Feature engineering ----------------------------------------------------
# 'Budget_Awards_Diff': difference of 'Budget in USD' and 'Awards'.
data = FeatureEngineer.add_difference(
    data, 'Budget in USD', 'Awards', 'Budget_Awards_Diff'
)
# 'Rating_Times_Popular': product of 'Rating' and 'Popular'.
data = FeatureEngineer.add_product(
    data, 'Rating', 'Popular', 'Rating_Times_Popular'
)
_report("\nData after feature engineering:", data)

# --- Type conversion --------------------------------------------------------
# NOTE(review): 'Rating' is converted to numeric only after it has already
# been imputed and multiplied above, and 'Genre' is made categorical after
# label encoding -- confirm this ordering is deliberate.
data = DataTypeConverter.to_numeric(data, 'Rating')
data = DataTypeConverter.to_categorical(data, 'Genre')
_report("\nData after data type conversion:", data)

# --- Final result -----------------------------------------------------------
# Same frame as the previous printout, shown again under a closing title.
_report("\nFinal Processed Data:", data)


Initial Data:
   Unnamed: 0   Movie Id        Genre Release Date    Rating  \
0          44  TOYHANVUR       Horror   07/07/1985  9.750176   
1          47  CORCILSLF  Documentary   05/08/1999  7.912876   
2          64  UBWUVIHEL    Adventure   08/05/1996  3.116053   
3          67  CGHMHKJHH      Fantasy   21/07/2018  8.811783   
4          67  WMLWHTAMN      Fantasy   28/10/2011  5.250991   

                                             Summary Shooting Location  \
0  A group of college students get more than they...          New York   
1  A documentary that sheds light on the devastat...             Paris   
2  A lost civilization is rediscovered deep withi...            Sydney   
3  A young inventor builds a time machine and emb...             Paris   
4  A young inventor builds a time machine and emb...            Sydney   

   Budget in USD  Awards  Popular  
0   1.035408e+07       2        1  
1   5.722105e+06       1        1  
2   5.601372e+07       0        0  
3   4.422451