In [19]:
import numpy as np
import pandas as pd
from wrangle import wrangle_telco
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler


In [2]:
# Load the telco frame through the project-local wrangle helper (defined in wrangle.py, not shown here).
tc_df = wrangle_telco()
# Feature matrix: the two columns that every scaler cell below operates on.
X = tc_df[['monthly_charges', 'tenure']]
# Target selected with double brackets, so it stays a one-column DataFrame rather than a Series.
y = tc_df[['total_charges']]

In [12]:
def split_my_data(X, y, train_pct, random_state=294):
    """Split features and target into train and test partitions.

    Thin wrapper around ``sklearn.model_selection.train_test_split``.

    Parameters
    ----------
    X : features to split (a DataFrame in this notebook).
    y : target to split, aligned row-for-row with ``X``.
    train_pct : value forwarded as ``train_size`` (the notebook passes 0.8).
    random_state : seed forwarded to ``train_test_split``. Defaults to 294,
        the value that used to be hard-coded, so existing three-argument
        calls produce the same partition as before.

    Returns
    -------
    Whatever ``train_test_split`` returns for two inputs; the notebook
    unpacks it as ``X_train, X_test, y_train, y_test``.
    """
    return train_test_split(X, y, train_size=train_pct, random_state=random_state)

In [16]:
# train_size=0.8: 80% of the rows go to the training partition; split_my_data fixes the seed.
X_train, X_test, y_train, y_test = split_my_data(X, y, 0.8)
# Show the training features, still in their original units.
X_train

Unnamed: 0_level_0,monthly_charges,tenure
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3460-TJBWI,24.20,24
8295-KMENE,76.45,59
8039-EQPIM,60.25,69
3777-XROBG,19.55,58
4884-TVUQF,101.30,57
...,...,...
0847-HGRML,20.00,62
8532-UEFWH,25.75,52
5879-HMFFH,88.05,72
4853-RULSV,104.00,70


In [20]:
def standard_scaler(train=None, test=None):
    """Fit a ``StandardScaler`` on the training frame and scale both frames.

    Parameters
    ----------
    train : DataFrame, optional
        Frame the scaler is fitted on. When omitted, the notebook-level
        ``X_train`` is used, so a bare ``standard_scaler()`` call still works.
    test : DataFrame, optional
        Frame transformed with the scaler fitted on ``train``. When omitted,
        the notebook-level ``X_test`` is used.

    Returns
    -------
    tuple
        ``(std_scaler, train_scaled_std, test_scaled_std)``: the fitted
        scaler and two DataFrames carrying the input column labels and index
        labels. The index *name* is not carried over (labels are passed as a
        plain array), matching the frames this function has always returned.
    """
    # Fall back to the notebook globals only when the caller passes nothing.
    if train is None:
        train = X_train
    if test is None:
        test = X_test

    # No defensive copies of the inputs: the scaler is built with copy=True.
    std_scaler = StandardScaler(copy=True, with_mean=True, with_std=True).fit(train)

    def _to_frame(frame):
        # transform() hands back a bare array; reattach the column/row labels.
        return pd.DataFrame(
            std_scaler.transform(frame),
            columns=frame.columns.to_numpy(),
            index=frame.index.to_numpy(),
        )

    # Train is transformed before test, as before.
    train_scaled_std = _to_frame(train)
    test_scaled_std = _to_frame(test)
    return std_scaler, train_scaled_std, test_scaled_std

In [23]:
# Called without arguments, so it works on the notebook-level X_train / X_test.
std_scaler, train_scaled_std, test_scaled_std = standard_scaler()

In [24]:
# Display the standardized training features.
train_scaled_std

Unnamed: 0,monthly_charges,tenure
3460-TJBWI,-1.046584,-1.809541
8295-KMENE,0.468623,0.118199
8039-EQPIM,-0.001163,0.668982
3777-XROBG,-1.181430,0.063121
4884-TVUQF,1.189253,0.008042
...,...,...
0847-HGRML,-1.168380,0.283434
8532-UEFWH,-1.001635,-0.267349
5879-HMFFH,0.805014,0.834217
4853-RULSV,1.267551,0.724060


In [29]:
def scale_inverse(scaler=None, train_scaled=None, test_scaled=None):
    """Map scaled train/test frames back to their original units.

    Parameters
    ----------
    scaler : fitted scaler exposing ``inverse_transform``, optional
        Defaults to the notebook-level ``std_scaler``.
    train_scaled : DataFrame, optional
        Scaled training frame. Defaults to the notebook-level
        ``train_scaled_std``.
    test_scaled : DataFrame, optional
        Scaled test frame. Defaults to the notebook-level ``test_scaled_std``.

    Passing all three lets the same function undo any scaler that provides
    ``inverse_transform``; a bare ``scale_inverse()`` keeps its old behaviour.

    Returns
    -------
    tuple
        ``(train_unscaled, test_unscaled)``: DataFrames with the column and
        index labels of the corresponding scaled inputs (index name not
        carried over).
    """
    # Fall back to the standard-scaler globals only when nothing is passed.
    if scaler is None:
        scaler = std_scaler
    if train_scaled is None:
        train_scaled = train_scaled_std
    if test_scaled is None:
        test_scaled = test_scaled_std

    def _restore(frame):
        # inverse_transform() returns a bare array; reattach the labels.
        return pd.DataFrame(
            scaler.inverse_transform(frame),
            columns=frame.columns.to_numpy(),
            index=frame.index.to_numpy(),
        )

    train_unscaled = _restore(train_scaled)
    test_unscaled = _restore(test_scaled)
    return train_unscaled, test_unscaled

In [30]:
# Called without arguments, so it inverts the standard-scaler results produced in the cells above.
train_unscaled, test_unscaled = scale_inverse()

In [31]:
# Round-trip check by eye: values should match X_train from the split cell (tenure comes back as float).
train_unscaled

Unnamed: 0,monthly_charges,tenure
3460-TJBWI,24.20,24.0
8295-KMENE,76.45,59.0
8039-EQPIM,60.25,69.0
3777-XROBG,19.55,58.0
4884-TVUQF,101.30,57.0
...,...,...
0847-HGRML,20.00,62.0
8532-UEFWH,25.75,52.0
5879-HMFFH,88.05,72.0
4853-RULSV,104.00,70.0


In [34]:
def uniform_scaler(train, test):
    """Fit a uniform-output ``QuantileTransformer`` on ``train`` and apply it to both frames.

    The transformer is configured with 100 quantiles and ``random_state=123``.

    Parameters
    ----------
    train : DataFrame the transformer is fitted on.
    test : DataFrame transformed with the fitted transformer.

    Returns
    -------
    tuple
        ``(unf_scaler, train_scaled_unf, test_scaled_unf)``: the fitted
        transformer and two DataFrames carrying the input column labels and
        index labels (index name not carried over).
    """
    unf_scaler = QuantileTransformer(
        n_quantiles=100,
        output_distribution='uniform',
        random_state=123,
        copy=True,
    ).fit(train)

    def _relabel(source):
        # transform() yields a bare array; rebuild a frame with the source's labels.
        scaled = pd.DataFrame(unf_scaler.transform(source), columns=source.columns.values)
        return scaled.set_index([source.index.values])

    train_scaled_unf = _relabel(train)
    test_scaled_unf = _relabel(test)
    return unf_scaler, train_scaled_unf, test_scaled_unf

In [36]:
# Fit the quantile transformer on the training split only, then apply it to both splits.
unf_scaler, train_scaled_unf, test_scaled_unf = uniform_scaler(X_train, X_test)
# Display the uniformly-scaled training features.
train_scaled_unf

Unnamed: 0,monthly_charges,tenure
3460-TJBWI,0.256466,0.090909
8295-KMENE,0.602984,0.388889
8039-EQPIM,0.479185,0.641414
3777-XROBG,0.050505,0.373737
4884-TVUQF,0.838384,0.353535
...,...,...
0847-HGRML,0.144216,0.444444
8532-UEFWH,0.358955,0.287879
5879-HMFFH,0.725651,1.000000
4853-RULSV,0.848688,0.686869


In [37]:
def gaussian_scaler(train, test):
    """Fit a Yeo-Johnson ``PowerTransformer`` on ``train`` and apply it to both frames.

    ``standardize=False`` is passed, so the transformed values are left
    unstandardized.

    Parameters
    ----------
    train : DataFrame the transformer is fitted on.
    test : DataFrame transformed with the fitted transformer.

    Returns
    -------
    tuple
        ``(gs_scaler, train_scaled_gs, test_scaled_gs)``: the fitted
        transformer and two DataFrames carrying the input column labels and
        index labels (index name not carried over).
    """
    gs_scaler = PowerTransformer(
        method='yeo-johnson', standardize=False, copy=True
    ).fit(train)

    # Unpacking the generator processes train first, then test.
    train_scaled_gs, test_scaled_gs = (
        pd.DataFrame(
            gs_scaler.transform(part), columns=part.columns.values
        ).set_index([part.index.values])
        for part in (train, test)
    )
    return gs_scaler, train_scaled_gs, test_scaled_gs

In [39]:
# Fit the power transformer on the training split only, then apply it to both splits.
gs_scaler, train_scaled_gs, test_scaled_gs = gaussian_scaler(X_train, X_test)
# Display the transformed training features (not standardized: gaussian_scaler passes standardize=False).
train_scaled_gs

Unnamed: 0,monthly_charges,tenure
3460-TJBWI,6.054234,601.783587
8295-KMENE,10.412348,4275.197191
8039-EQPIM,9.352131,6037.452765
3777-XROBG,5.433135,4117.299434
4884-TVUQF,11.790033,3962.682642
...,...,...
0847-HGRML,5.496969,4768.707414
8532-UEFWH,6.244684,3238.338929
5879-HMFFH,11.086234,6632.227431
4853-RULSV,11.926133,6232.277842


In [41]:
def min_max_scaler(train, test):
    """Fit a ``MinMaxScaler`` with ``feature_range=(0, 1)`` on ``train`` and apply it to both frames.

    Parameters
    ----------
    train : DataFrame the scaler is fitted on.
    test : DataFrame transformed with the fitted scaler. Because the scaler
        is fitted on ``train`` only, test values outside the training range
        can land outside the 0-1 interval.

    Returns
    -------
    tuple
        ``(mm_scaler, train_scaled_mm, test_scaled_mm)``: the fitted scaler
        and two DataFrames carrying the input column labels and index labels
        (index name not carried over).
    """
    mm_scaler = MinMaxScaler(copy=True, feature_range=(0, 1)).fit(train)

    # Training split: scale, then reattach column and row labels.
    train_values = mm_scaler.transform(train)
    train_scaled_mm = pd.DataFrame(train_values, columns=train.columns.values)
    train_scaled_mm = train_scaled_mm.set_index([train.index.values])

    # Test split: same steps, reusing the scaler fitted on train.
    test_values = mm_scaler.transform(test)
    test_scaled_mm = pd.DataFrame(test_values, columns=test.columns.values)
    test_scaled_mm = test_scaled_mm.set_index([test.index.values])

    return mm_scaler, train_scaled_mm, test_scaled_mm

In [43]:
# Fit the min-max scaler on the training split only, then apply it to both splits.
mm_scaler, train_scaled_mm, test_scaled_mm = min_max_scaler(X_train, X_test)
# Display the min-max-scaled training features.
train_scaled_mm

Unnamed: 0,monthly_charges,tenure
3460-TJBWI,0.057855,0.333333
8295-KMENE,0.579052,0.819444
8039-EQPIM,0.417456,0.958333
3777-XROBG,0.011471,0.805556
4884-TVUQF,0.826933,0.791667
...,...,...
0847-HGRML,0.015960,0.861111
8532-UEFWH,0.073317,0.722222
5879-HMFFH,0.694763,1.000000
4853-RULSV,0.853865,0.972222


In [45]:
def iqr_robust_scaler(train, test):
    """Fit a ``RobustScaler`` on ``train`` and apply it to both frames.

    The scaler is configured with ``quantile_range=(25.0, 75.0)`` and with
    both centering and scaling enabled.

    Parameters
    ----------
    train : DataFrame the scaler is fitted on.
    test : DataFrame transformed with the fitted scaler.

    Returns
    -------
    tuple
        ``(iqr_scaler, train_scaled_iqr, test_scaled_iqr)``: the fitted
        scaler and two DataFrames carrying the input column labels and index
        labels (index name not carried over).
    """
    settings = dict(
        quantile_range=(25.0, 75.0),
        copy=True,
        with_centering=True,
        with_scaling=True,
    )
    iqr_scaler = RobustScaler(**settings).fit(train)

    scaled_frames = []
    for subset in (train, test):  # train first, then test
        rescaled = iqr_scaler.transform(subset)
        frame = pd.DataFrame(rescaled, columns=subset.columns.values)
        scaled_frames.append(frame.set_index([subset.index.values]))

    train_scaled_iqr, test_scaled_iqr = scaled_frames
    return iqr_scaler, train_scaled_iqr, test_scaled_iqr

In [47]:
# Fit the robust scaler on the training split only, then apply it to both splits.
iqr_scaler, train_scaled_iqr, test_scaled_iqr = iqr_robust_scaler(X_train, X_test)
# Display the robust-scaled training features.
train_scaled_iqr

Unnamed: 0,monthly_charges,tenure
3460-TJBWI,-0.600604,-1.739130
8295-KMENE,0.188373,-0.217391
8039-EQPIM,-0.056248,0.217391
3777-XROBG,-0.670819,-0.260870
4884-TVUQF,0.563609,-0.304348
...,...,...
0847-HGRML,-0.664024,-0.086957
8532-UEFWH,-0.577199,-0.521739
5879-HMFFH,0.363533,0.347826
4853-RULSV,0.604379,0.260870
