In [None]:
import numpy
from pandas import DataFrame
from matplotlib import pyplot as plotter

from transformations import normalize_dataset
from data_generator import generate_skewed_data_right, generate_skewed_data_left, generate_extreme_skewed_data_right

In [None]:
# Column names of the three synthetic features held in the data frame.
feature_a, feature_b, feature_c = 'feature_a', 'feature_b', 'feature_c'

# Labels appended (after an underscore) to a feature name to build the
# name of the corresponding transformed column.
sqrt, log, reverse = "SQRT", "LOG", "1/N"

In [None]:
# Variables

# Appearance shared by every histogram in this notebook.
plot_hist_color, plot_hist_alpha = 'cyan', 0.63

# Passed to every histogram call as the number of bins.
plot_bar_count = 1000

# Passed as `size` to each data generator.
size = 100000

In [None]:
# Build one column per synthetic generator. The generators are invoked in the
# order left-skewed, right-skewed, extreme right-skewed.
skewed_columns = {
    feature_a: generate_skewed_data_left(size=size),
    feature_b: generate_skewed_data_right(size=size),
    feature_c: generate_extreme_skewed_data_right(size=size),
}

# normalize_dataset is a project helper (transformations module).
# NOTE(review): all_pos=True presumably rescales every column to
# non-negative values - confirm against the helper's implementation.
df_data = normalize_dataset(df=DataFrame(skewed_columns), all_pos=True)

In [None]:
# Preview the first five rows of the normalised frame.
df_data.head(n=5)

In [None]:
# Plot Feature A
#   Density histogram of the untransformed column, as returned by
#   normalize_dataset. The column is addressed through the shared
#   `feature_a` constant (same as the transformed-plot cells) instead of a
#   duplicated string literal, so renaming the feature only needs one edit.

plotter.hist(df_data[feature_a],
             bins=plot_bar_count,
             density=True,
             color=plot_hist_color,
             alpha=plot_hist_alpha);

In [None]:
# Plot Feature B
#   Density histogram of the untransformed column, as returned by
#   normalize_dataset. The column is addressed through the shared
#   `feature_b` constant (same as the transformed-plot cells) instead of a
#   duplicated string literal, so renaming the feature only needs one edit.

plotter.hist(df_data[feature_b],
             bins=plot_bar_count,
             density=True,
             color=plot_hist_color,
             alpha=plot_hist_alpha);

In [None]:
# Plot Feature C
#   Density histogram of the untransformed column, as returned by
#   normalize_dataset. The column is addressed through the shared
#   `feature_c` constant (same as the transformed-plot cells) instead of a
#   duplicated string literal, so renaming the feature only needs one edit.

plotter.hist(df_data[feature_c],
             bins=plot_bar_count,
             density=True,
             color=plot_hist_color,
             alpha=plot_hist_alpha);

In [None]:
# Apply Transformations on all Features
#   * Sqrt Transformation
#   * Logarithmic Transformation (base 2)
#   * 1/N Transformation
#
# The columns to transform are collected up front, skipping any column that
# already carries one of the transformation suffixes. This keeps the cell
# safe to re-run (no "feature_a_SQRT_SQRT" style columns on a second
# execution) and avoids adding columns to the frame while iterating over it.
#
# NOTE(review): a value of exactly 0 maps to -inf under log2 and to inf under
# 1/N. Whether normalize_dataset can emit zeros is not visible from this
# notebook - confirm, since a histogram without an explicit range cannot be
# drawn over non-finite data.

transform_suffixes = tuple("_" + label for label in (sqrt, log, reverse))
base_columns = [column for column in df_data.columns
                if not str(column).endswith(transform_suffixes)]

for column in base_columns:
    df_data[column + "_" + sqrt] = numpy.sqrt(df_data[column])
    df_data[column + "_" + log] = numpy.log2(df_data[column])
    df_data[column + "_" + reverse] = numpy.power(df_data[column], -1.0)


In [None]:
# Show Plots
#   * Density histogram of the square-root transformed Feature A column

plotter.hist(df_data["_".join((feature_a, sqrt))],
             bins=plot_bar_count,
             alpha=plot_hist_alpha,
             color=plot_hist_color,
             density=True);

In [None]:
# Show Plots
#   * Density histogram of the log2 transformed Feature A column,
#     with the x-axis limited to [-1, 0]

plotter.hist(df_data["_".join((feature_a, log))],
             bins=plot_bar_count,
             range=(-1.0, 0.0),
             alpha=plot_hist_alpha,
             color=plot_hist_color,
             density=True);

In [None]:
# Show Plots
#   * Density histogram of the 1/N transformed Feature A column,
#     with the x-axis limited to [1, 3]

plotter.hist(df_data["_".join((feature_a, reverse))],
             bins=plot_bar_count,
             range=(1.0, 3.0),
             alpha=plot_hist_alpha,
             color=plot_hist_color,
             density=True);

In [None]:
# Inspect the first 1000 rows, including the transformed columns added above.
df_data.head(n=1000)

In [None]:
# Show Plots
#   * Density histogram of the square-root transformed Feature B column

plotter.hist(df_data["_".join((feature_b, sqrt))],
             bins=plot_bar_count,
             alpha=plot_hist_alpha,
             color=plot_hist_color,
             density=True);

In [None]:
# Show Plots
#   * Density histogram of the log2 transformed Feature B column,
#     with the x-axis limited to [-5, 0]

plotter.hist(df_data["_".join((feature_b, log))],
             bins=plot_bar_count,
             range=(-5.0, 0.0),
             alpha=plot_hist_alpha,
             color=plot_hist_color,
             density=True);

In [None]:
# Show Plots
#   * Density histogram of the 1/N transformed Feature B column,
#     with the x-axis limited to [0, 20]

plotter.hist(df_data["_".join((feature_b, reverse))],
             bins=plot_bar_count,
             range=(0.0, 20.0),
             alpha=plot_hist_alpha,
             color=plot_hist_color,
             density=True);

In [None]:
# Show Plots
#   * Density histogram of the square-root transformed Feature C column

plotter.hist(df_data["_".join((feature_c, sqrt))],
             bins=plot_bar_count,
             alpha=plot_hist_alpha,
             color=plot_hist_color,
             density=True);

In [None]:
# Show Plots
#   * Density histogram of the log2 transformed Feature C column
#
# NOTE(review): no explicit range is given here, so the bin edges are derived
# from the data; this presumably relies on the column containing only finite
# values (no zeros before the log) - confirm.

plotter.hist(df_data["_".join((feature_c, log))],
             bins=plot_bar_count,
             alpha=plot_hist_alpha,
             color=plot_hist_color,
             density=True);

In [None]:
# Show Plots
#   * Density histogram of the 1/N transformed Feature C column,
#     with the x-axis limited to [0, 1000]

plotter.hist(df_data["_".join((feature_c, reverse))],
             bins=plot_bar_count,
             range=(0.0, 1000.0),
             alpha=plot_hist_alpha,
             color=plot_hist_color,
             density=True);