In [None]:
import matplotlib.pyplot as plotter
from transformations import normalize_dataset, standardize_dataset, box_cox, yeo_johnson
from pandas import DataFrame, read_csv
from scipy.stats import pearsonr

In [None]:
# Initialize Variables

# Plotting variables
size = 1000  # NOTE(review): not referenced by any cell visible in this notebook — confirm before removing
plot_hist_color = 'cyan'  # bar colour passed to every plotter.hist call
plot_hist_alpha = 0.63  # bar opacity passed to every plotter.hist call

# Problem Specific Variables
# Column labels used to index the loaded DataFrame. Meanings noted below are
# inferred from the variable names only — verify against the CSV header.
output_key = 'OUTPUT'  # column every feature variant is correlated against
milage_key = 'MIL'  # presumably mileage
year_key = 'YEAR'
fuel_consumption_key = 'FCON'  # currently the value assigned to chosen_key
condition_key = 'COND'

# Data Path
file_path = "data/carprices.csv"  # relative path: resolved against the kernel's working directory

In [None]:
# Column examined by every stats, histogram, transform and correlation cell below;
# point this at another *_key constant to analyse a different feature.
chosen_key = fuel_consumption_key

In [None]:
# Read the CSV into the untouched reference frame, then preview it

# df_raw is never reassigned afterwards; every later variant derives from it
df_raw = read_csv(file_path)

# First five rows, rendered as the cell output
df_raw.head(5)

In [None]:
# Stats, Raw Data
# mean() excludes NaN from both the total and the count. The previous
# sum() / len() dropped NaN from the total but still counted those rows,
# which pulls the average towards zero whenever values are missing.

print("Average: " + str(df_raw[chosen_key].mean()))
print("Min: " + str(df_raw[chosen_key].min()))
print("Max: " + str(df_raw[chosen_key].max()))

In [None]:
# Density histogram (25 bins) of the raw values of the chosen column

plotter.hist(df_raw[chosen_key], bins=25, density=True,
             color=plot_hist_color, alpha=plot_hist_alpha);

In [None]:
# Build the normalized variant of the whole raw frame
# NOTE(review): all_pos=True is forwarded to the project helper; presumably it keeps
# values positive for the Box Cox cells — confirm in transformations.py.

df_norm = normalize_dataset(df_raw, all_pos=True)

In [None]:
# Show Normalized Data
# Preview of df_norm, rendered as the cell output
# (assumes normalize_dataset returns a pandas DataFrame — confirm).

df_norm.head()

In [None]:
# Stats, Normalized Data
# mean() excludes NaN from both the total and the count. The previous
# sum() / len() dropped NaN from the total but still counted those rows,
# which pulls the average towards zero whenever values are missing.

print("Average: " + str(df_norm[chosen_key].mean()))
print("Min: " + str(df_norm[chosen_key].min()))
print("Max: " + str(df_norm[chosen_key].max()))

In [None]:
# Density histogram (25 bins) of the normalized values of the chosen column

plotter.hist(df_norm[chosen_key], bins=25, density=True,
             color=plot_hist_color, alpha=plot_hist_alpha);

In [None]:
# Standardize Dataset
# Derived from df_raw (not df_norm), so this cell does not depend on the normalize cell.

df_stan = standardize_dataset(df_raw)

In [None]:
# Show Standardized Data
# Preview of df_stan, rendered as the cell output
# (assumes standardize_dataset returns a pandas DataFrame — confirm).

df_stan.head()

In [None]:
# Stats, Standardized Data
# mean() excludes NaN from both the total and the count. The previous
# sum() / len() dropped NaN from the total but still counted those rows,
# which pulls the average towards zero whenever values are missing.

print("Average: " + str(df_stan[chosen_key].mean()))
print("Min: " + str(df_stan[chosen_key].min()))
print("Max: " + str(df_stan[chosen_key].max()))

In [None]:
# Density histogram (25 bins) of the standardized values of the chosen column

plotter.hist(df_stan[chosen_key], bins=25, density=True,
             color=plot_hist_color, alpha=plot_hist_alpha);

In [None]:
# Box Cox on the normalized column, output left as returned

# Single-column frame holding only the chosen feature
bc_input = DataFrame(df_norm[chosen_key], columns=[chosen_key])
df_bc_raw = DataFrame(box_cox(bc_input), columns=[chosen_key])

plotter.hist(df_bc_raw[chosen_key], bins=25, density=True,
             color=plot_hist_color, alpha=plot_hist_alpha);

In [None]:
# Box Cox on the normalized column, with normalize=True passed to the helper

# Single-column frame holding only the chosen feature
bc_input = DataFrame(df_norm[chosen_key], columns=[chosen_key])
df_bc_norm = DataFrame(box_cox(bc_input, normalize=True),
                       columns=[chosen_key])

plotter.hist(df_bc_norm[chosen_key], bins=25, density=True,
             color=plot_hist_color, alpha=plot_hist_alpha);

In [None]:
# Box Cox on the normalized column, with standardize=True passed to the helper

# Single-column frame holding only the chosen feature
bc_input = DataFrame(df_norm[chosen_key], columns=[chosen_key])
df_bc_stan = DataFrame(box_cox(bc_input, standardize=True),
                       columns=[chosen_key])

plotter.hist(df_bc_stan[chosen_key], bins=25, density=True,
             color=plot_hist_color, alpha=plot_hist_alpha);

In [None]:
# Side-by-side view of the chosen column: raw, normalized and the three Box Cox outputs

df_temp = DataFrame(dict(
    raw=df_raw[chosen_key],
    norm=df_norm[chosen_key],
    bc_raw=df_bc_raw[chosen_key],
    bc_norm=df_bc_norm[chosen_key],
    bc_stan=df_bc_stan[chosen_key],
))

# First five rows, rendered as the cell output
df_temp.head(5)

In [None]:
# Yeo Johnson on the normalized column, output left as returned

# Single-column frame holding only the chosen feature
yj_input = DataFrame(df_norm[chosen_key], columns=[chosen_key])
df_yj_raw = DataFrame(yeo_johnson(yj_input), columns=[chosen_key])

plotter.hist(df_yj_raw[chosen_key], bins=25, density=True,
             color=plot_hist_color, alpha=plot_hist_alpha);

In [None]:
# Yeo Johnson on the normalized column, with normalize=True passed to the helper

# Single-column frame holding only the chosen feature
yj_input = DataFrame(df_norm[chosen_key], columns=[chosen_key])
df_yj_norm = DataFrame(yeo_johnson(yj_input, normalize=True),
                       columns=[chosen_key])

plotter.hist(df_yj_norm[chosen_key], bins=25, density=True,
             color=plot_hist_color, alpha=plot_hist_alpha);

In [None]:
# Yeo Johnson on the normalized column, with standardize=True passed to the helper

# Single-column frame holding only the chosen feature
yj_input = DataFrame(df_norm[chosen_key], columns=[chosen_key])
df_yj_stan = DataFrame(yeo_johnson(yj_input, standardize=True),
                       columns=[chosen_key])

plotter.hist(df_yj_stan[chosen_key], bins=25, density=True,
             color=plot_hist_color, alpha=plot_hist_alpha);

In [None]:
# Side-by-side view of the chosen column across every variant built so far
# (rebinds df_temp, replacing the Box Cox-only comparison frame)

df_temp = DataFrame(dict(
    raw=df_raw[chosen_key],
    norm=df_norm[chosen_key],
    bc_raw=df_bc_raw[chosen_key],
    bc_norm=df_bc_norm[chosen_key],
    bc_stan=df_bc_stan[chosen_key],
    yj_raw=df_yj_raw[chosen_key],
    yj_norm=df_yj_norm[chosen_key],
    yj_stan=df_yj_stan[chosen_key],
))

# First five rows, rendered as the cell output
df_temp.head(5)

In [None]:
# Pearson correlation between each variant of the chosen feature and the output column.
# Every line is now printed as "<label>: <coefficient>"; six of the labels previously
# had no separator, so the number ran straight into the label text.

# (label, feature values, output values), in print order. The transformed variants
# are paired with the normalized output column, exactly as before.
correlation_cases = [
    ("Correlation (RAW)", df_raw[chosen_key], df_raw[output_key]),
    ("Correlation (Norm)", df_norm[chosen_key], df_norm[output_key]),
    ("Correlation (Stan)", df_stan[chosen_key], df_stan[output_key]),
    ("Box Cox Transformation (RAW)", df_bc_raw[chosen_key], df_norm[output_key]),
    ("Box Cox Transformation (Norm)", df_bc_norm[chosen_key], df_norm[output_key]),
    ("Box Cox Transformation (Stan)", df_bc_stan[chosen_key], df_norm[output_key]),
    ("Yeo Johnson Transformation (RAW)", df_yj_raw[chosen_key], df_norm[output_key]),
    ("Yeo Johnson Transformation (Norm)", df_yj_norm[chosen_key], df_norm[output_key]),
    ("Yeo Johnson Transformation (Stan)", df_yj_stan[chosen_key], df_norm[output_key]),
]

for label, feature_values, output_values in correlation_cases:
    # Index 0 of the pearsonr result is the correlation coefficient; the p-value is not shown
    print(label + ": " + str(pearsonr(feature_values, output_values)[0]))