In [4]:
import pandas as pd
from AutoDetectColumnTypes import auto_detect_column_types
from MissingValueHandler import impute_mean, impute_median, impute_constant, delete_missing
from OutlierHandler import remove_outliers_iqr
from Scaler import standardize, normalize
from TextCleaner import remove_stopwords, to_lowercase, remove_punctuation, lemmatize, clean_text_columns
from FeatureEngineer import create_interaction_feature, create_polynomial_feature
from DataTypeConverter import to_numeric, to_categorical
from CategoricalEncoder import one_hot_encode, label_encode
from DatetimeHandler import to_datetime, extract_date_part
from FileWriter import write_to_file_xls, write_to_file_csv

In [5]:
# Load the data
file_path = 'C:\\Users\\Asmz\\OneDrive\\Desktop\\uni\\Python\\propject\\synthetic_sample_data.csv'
df = pd.read_csv(file_path)
df_a = pd.read_csv(file_path)


In [11]:
# Test auto_detect_column_types method
print("Auto Detect Column Types:")
column_types = auto_detect_column_types(df)
print(column_types)

Auto Detect Column Types:
{'numeric': ['Unnamed: 0', 'Rating', 'Budget in USD', 'Awards', 'Popular'], 'text': ['Movie Id', 'Genre', 'Release Date', 'Summary', 'Shooting Location'], 'datetime': []}


In [12]:
# Test missing value handler methods
print("\nMissing Value Handler:")
print("Before handling missing values:")
print(df.isnull().sum())

impute_mean(df)
print("After imputing mean:")
print(df.isnull().sum())

df = pd.read_csv(file_path)  # Reload data
impute_median(df)
print("After imputing median:")
print(df.isnull().sum())

df = pd.read_csv(file_path)  # Reload data
impute_constant(df, constant=0)
print("After imputing constant:")
print(df.isnull().sum())

df = pd.read_csv(file_path)  # Reload data
df = delete_missing(df)
print("After deleting missing values:")
print(df.isnull().sum())


Missing Value Handler:
Before handling missing values:
Unnamed: 0            0
Movie Id              0
Genre                 0
Release Date          0
Rating                0
Summary               0
Shooting Location     0
Budget in USD        20
Awards                0
Popular               0
dtype: int64
After imputing mean:
Unnamed: 0           0
Movie Id             0
Genre                0
Release Date         0
Rating               0
Summary              0
Shooting Location    0
Budget in USD        0
Awards               0
Popular              0
dtype: int64
After imputing median:
Unnamed: 0           0
Movie Id             0
Genre                0
Release Date         0
Rating               0
Summary              0
Shooting Location    0
Budget in USD        0
Awards               0
Popular              0
dtype: int64
After imputing constant:
Unnamed: 0           0
Movie Id             0
Genre                0
Release Date         0
Rating               0
Summary              

In [13]:
# Test outlier handler method
print("\nOutlier Handler:")
print("Before removing outliers:")
print(df.describe())

df = remove_outliers_iqr(df)
print("After removing outliers:")
print(df.describe())


Outlier Handler:
Before removing outliers:
       Unnamed: 0      Rating  Budget in USD      Awards     Popular
count  980.000000  980.000000   9.800000e+02  980.000000  980.000000
mean    50.182653    8.045466   4.763700e+07    1.040816    0.238776
std     29.488460    1.767818   3.036190e+07    0.821299    0.426553
min      0.000000    3.023367   1.149800e+04    0.000000    0.000000
25%     25.000000    7.458505   2.196342e+07    0.000000    0.000000
50%     50.000000    8.636286   4.773131e+07    1.000000    0.000000
75%     76.000000    9.276481   7.143330e+07    2.000000    0.000000
max    100.000000    9.985402   1.964558e+08    2.000000    1.000000
After removing outliers:
       Unnamed: 0      Rating  Budget in USD      Awards     Popular
count  886.000000  886.000000   8.860000e+02  886.000000  886.000000
mean    50.533860    8.441283   4.662976e+07    1.138826    0.259594
std     29.340712    1.248742   2.775294e+07    0.792567    0.438659
min      0.000000    4.736549   1.

In [14]:
# Test scaler methods
print("\nScaler:")
print("Before scaling:")
print(df.head())

standardize(df)
print("After standardizing:")
print(df.head())

df = pd.read_csv(file_path)  # Reload data
normalize(df)
print("After normalizing:")
print(df.head())


Scaler:
Before scaling:
   Unnamed: 0   Movie Id        Genre Release Date    Rating  \
0          44  TOYHANVUR       Horror   07/07/1985  9.750176   
1          47  CORCILSLF  Documentary   05/08/1999  7.912876   
3          67  CGHMHKJHH      Fantasy   21/07/2018  8.811783   
4          67  WMLWHTAMN      Fantasy   28/10/2011  5.250991   
5           9  REKCPGLOJ       Comedy   04/06/2013  8.676586   

                                             Summary Shooting Location  \
0  A group of college students get more than they...          New York   
1  A documentary that sheds light on the devastat...             Paris   
3  A young inventor builds a time machine and emb...             Paris   
4  A young inventor builds a time machine and emb...            Sydney   
5  A heartwarming story about a dog who brings tw...       Los Angeles   

   Budget in USD  Awards  Popular  
0   1.035408e+07       2        1  
1   5.722105e+06       1        1  
3   4.422451e+06       1        1  
4

In [15]:
# Test text cleaner method
print("\nText Cleaner:")
print("Before cleaning text columns:")
print(df.head())

clean_text_columns(df)
print("After cleaning text columns:")
print(df.head())


Text Cleaner:
Before cleaning text columns:
   Unnamed: 0   Movie Id        Genre Release Date    Rating  \
0        0.44  TOYHANVUR       Horror   07/07/1985  0.966213   
1        0.47  CORCILSLF  Documentary   05/08/1999  0.702310   
2        0.64  UBWUVIHEL    Adventure   08/05/1996  0.013313   
3        0.67  CGHMHKJHH      Fantasy   21/07/2018  0.831426   
4        0.67  WMLWHTAMN      Fantasy   28/10/2011  0.319967   

                                             Summary Shooting Location  \
0  A group of college students get more than they...          New York   
1  A documentary that sheds light on the devastat...             Paris   
2  A lost civilization is rediscovered deep withi...            Sydney   
3  A young inventor builds a time machine and emb...             Paris   
4  A young inventor builds a time machine and emb...            Sydney   

   Budget in USD  Awards  Popular  
0       0.052649     1.0      1.0  
1       0.029070     0.5      1.0  
2       0.285079 

In [16]:
# Test feature engineer methods
print("\nFeature Engineer:")
print("Before creating interaction feature:")
print(df.head())

create_interaction_feature(df_a, 'Rating', 'Awards',"tyr1")
print("After creating interaction feature:")
print(df.head())

#df_a = pd.read_csv(file_path)  # Reload data
print("Before creating polynomial feature:")
print(df_a.head())

create_polynomial_feature(df_a, 'tyr1', degree=2, new_column_name='polynomial_feature')
print("After creating polynomial feature:")
print(df_a.head())


Feature Engineer:
Before creating interaction feature:
   Unnamed: 0   Movie Id        Genre Release Date    Rating  \
0        0.44  toyhanvur       horror     07071985  0.966213   
1        0.47  corcilslf  documentary     05081999  0.702310   
2        0.64  ubwuvihel    adventure     08051996  0.013313   
3        0.67  cghmhkjhh      fantasy     21072018  0.831426   
4        0.67  wmlwhtamn      fantasy     28102011  0.319967   

                                             Summary Shooting Location  \
0  group college student get bargained spend week...          new york   
1  documentary shed light devastating effect clim...             paris   
2  lost civilization rediscovered deep within ama...            sydney   
3  young inventor build time machine embarks jour...             paris   
4  young inventor build time machine embarks jour...            sydney   

   Budget in USD  Awards  Popular  
0       0.052649     1.0      1.0  
1       0.029070     0.5      1.0  
2     

In [17]:
# Test data type converter methods
print("\nData Type Converter:")
print("Before converting to numeric:")
print(df.dtypes)

to_numeric(df)
print("\nAfter converting to numeric:")
print(df.dtypes)

to_categorical(df)
print("After converting to categorical:")
print(df.dtypes)


Data Type Converter:
Before converting to numeric:
Unnamed: 0           float64
Movie Id              object
Genre                 object
Release Date          object
Rating               float64
Summary               object
Shooting Location     object
Budget in USD        float64
Awards               float64
Popular              float64
dtype: object

After converting to numeric:
Unnamed: 0           float64
Movie Id             float64
Genre                float64
Release Date           int64
Rating               float64
Summary              float64
Shooting Location    float64
Budget in USD        float64
Awards               float64
Popular              float64
dtype: object
After converting to categorical:
Unnamed: 0           category
Movie Id             category
Genre                category
Release Date         category
Rating               category
Summary              category
Shooting Location    category
Budget in USD        category
Awards               category
Popular

In [18]:
# Test categorical encoder methods
print("\nCategorical Encoder:")
df = pd.read_csv(file_path)  # Reload data
print("Before one-hot encoding:")
print(df.head())

one_hot_encode(df)
print("After one-hot encoding:")
print(df.head())

df = pd.read_csv(file_path)  # Reload data
print("Before label encoding:")
print(df.head())

label_encode(df)
print("After label encoding:")
print(df.head())


Categorical Encoder:
Before one-hot encoding:
   Unnamed: 0   Movie Id        Genre Release Date    Rating  \
0          44  TOYHANVUR       Horror   07/07/1985  9.750176   
1          47  CORCILSLF  Documentary   05/08/1999  7.912876   
2          64  UBWUVIHEL    Adventure   08/05/1996  3.116053   
3          67  CGHMHKJHH      Fantasy   21/07/2018  8.811783   
4          67  WMLWHTAMN      Fantasy   28/10/2011  5.250991   

                                             Summary Shooting Location  \
0  A group of college students get more than they...          New York   
1  A documentary that sheds light on the devastat...             Paris   
2  A lost civilization is rediscovered deep withi...            Sydney   
3  A young inventor builds a time machine and emb...             Paris   
4  A young inventor builds a time machine and emb...            Sydney   

   Budget in USD  Awards  Popular  
0   1.035408e+07       2        1  
1   5.722105e+06       1        1  
2   5.601372e+0

In [19]:
# Test datetime handler methods
print("\nDatetime Handler:")
df = pd.read_csv(file_path)  # Reload data
print("Before converting to datetime:")
print(df.head())

to_datetime(df)
print("After converting to datetime:")
print(df.head())

print("Extracted date parts:")
extract_date_part(df, part='year')
extract_date_part(df, part='month')
extract_date_part(df, part='day')
extract_date_part(df, part='hour')
extract_date_part(df, part='minute')
extract_date_part(df, part='second')
print(df.head())


Datetime Handler:
Before converting to datetime:
   Unnamed: 0   Movie Id        Genre Release Date    Rating  \
0          44  TOYHANVUR       Horror   07/07/1985  9.750176   
1          47  CORCILSLF  Documentary   05/08/1999  7.912876   
2          64  UBWUVIHEL    Adventure   08/05/1996  3.116053   
3          67  CGHMHKJHH      Fantasy   21/07/2018  8.811783   
4          67  WMLWHTAMN      Fantasy   28/10/2011  5.250991   

                                             Summary Shooting Location  \
0  A group of college students get more than they...          New York   
1  A documentary that sheds light on the devastat...             Paris   
2  A lost civilization is rediscovered deep withi...            Sydney   
3  A young inventor builds a time machine and emb...             Paris   
4  A young inventor builds a time machine and emb...            Sydney   

   Budget in USD  Awards  Popular  
0   1.035408e+07       2        1  
1   5.722105e+06       1        1  
2   5.601372

In [20]:
# Test file writer methods
write_to_file_xls(df, fileNumber='1')
write_to_file_csv(df, fileNumber='2')

Data has been written to c:\data1.xlsx as xls
Data has been written to c:\data2.csv as csv
