In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import KBinsDiscretizer
pd.set_option('display.max_columns', None)

>>> import numpy as np
>>> from sklearn.preprocessing import QuantileTransformer
>>> rng = np.random.RandomState(0)
>>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0)
>>> qt = QuantileTransformer(n_quantiles=10, random_state=0)
>>> qt.fit_transform(X)

>>> import numpy as np
>>> from sklearn.preprocessing import SplineTransformer
>>> X = np.arange(6).reshape(6, 1)
>>> spline = SplineTransformer(degree=2, n_knots=3)
>>> spline.fit_transform(X)

>>> from sklearn.preprocessing import KernelCenterer
>>> from sklearn.metrics.pairwise import pairwise_kernels
>>> X = [[ 1., -2.,  2.],
...      [ -2.,  1.,  3.],
...      [ 4.,  1., -2.]]
>>> K = pairwise_kernels(X, metric='linear')
>>> K
array([[  9.,   2.,  -2.],
       [  2.,  14., -13.],
       [ -2., -13.,  21.]])
>>> transformer = KernelCenterer().fit(K)
>>> transformer
KernelCenterer()
>>> transformer.transform(K)
array([[  5.,   0.,  -5.],
       [  0.,  14., -14.],
       [ -5., -14.,  19.]])

>>> from sklearn.preprocessing import MaxAbsScaler
>>> X = [[ 1., -1.,  2.],
...      [ 2.,  0.,  0.],
...      [ 0.,  1., -1.]]
>>> transformer = MaxAbsScaler().fit(X)
>>> transformer
MaxAbsScaler()
>>> transformer.transform(X)
array([[ 0.5, -1. ,  1. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. , -0.5]])

>>> from sklearn.preprocessing import Normalizer
>>> X = [[4, 1, 2, 2],
...      [1, 3, 9, 3],
...      [5, 7, 5, 1]]
>>> transformer = Normalizer().fit(X)  # fit does nothing.
>>> transformer
Normalizer()
>>> transformer.transform(X)
array([[0.8, 0.2, 0.4, 0.4],
       [0.1, 0.3, 0.9, 0.3],
       [0.5, 0.7, 0.5, 0.1]])

In [2]:
diamonds_scaling_win = pd.read_csv('../../../001data/003preprocessing/002feature_engineering/diamonds_fe.csv')
diamonds_scaling_win.columns

Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x',
       'y', 'z', 'city', 'xyz/depth', 'xyz/carat', 'depth/table', 'depth/z',
       'depth/carat', 'table/z', 'table/carat', 'carat/z', 'x/z', 'y/z', 'x/y',
       'mass', 'volume', 'density', 'proportion', 'carat/depth', 'carat/table',
       'volume/table', 'volume/depth'],
      dtype='object')

In [3]:
diamonds_test_scaling_win = pd.read_csv('../../../001data/003preprocessing/002feature_engineering/diamonds_test_fe.csv')
cols_diamonds = diamonds_test_scaling_win.columns

In [4]:
#diamonds_scaling_win['scaling'] = 1
diamonds_scaling_win = diamonds_scaling_win[cols_diamonds]
diamonds_scaling_win.info()

diamonds_scaling_win

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 30 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    40455 non-null  int64  
 1   carat         40455 non-null  float64
 2   cut           40455 non-null  object 
 3   color         40455 non-null  object 
 4   clarity       40455 non-null  object 
 5   depth         40455 non-null  float64
 6   table         40455 non-null  float64
 7   x             40455 non-null  float64
 8   y             40455 non-null  float64
 9   z             40455 non-null  float64
 10  city          40455 non-null  object 
 11  xyz/depth     40455 non-null  float64
 12  xyz/carat     40455 non-null  float64
 13  depth/table   40455 non-null  float64
 14  depth/z       40455 non-null  float64
 15  depth/carat   40455 non-null  float64
 16  table/z       40455 non-null  float64
 17  table/carat   40455 non-null  float64
 18  carat/z       40455 non-nu

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,city,xyz/depth,xyz/carat,depth/table,depth/z,depth/carat,table/z,table/carat,carat/z,x/z,y/z,x/y,mass,volume,density,proportion,carat/depth,carat/table,volume/table,volume/depth
0,0,1.21,Premium,J,VS2,62.4,58.0,6.83,6.79,4.25,Dubai,3.158601,162.889855,1.075862,14.682353,51.570248,13.647059,47.933884,0.284706,1.607059,1.597647,1.005891,0.242,0.068946,3.51,5.455965,0.019391,0.020862,0.001189,0.001105
1,1,0.32,Very Good,H,VS2,63.0,57.0,4.35,4.38,2.75,Kimberly,0.831679,163.736719,1.105263,22.909091,196.875000,20.727273,178.125000,0.116364,1.581818,1.592727,0.993151,0.064,0.018234,3.51,3.464182,0.005079,0.005614,0.000320,0.000289
2,2,0.71,Fair,G,VS1,65.5,55.0,5.62,5.53,3.65,Las Vegas,1.731861,159.770268,1.190909,17.945205,92.253521,15.068493,77.464789,0.194521,1.539726,1.515068,1.016275,0.142,0.040456,3.51,4.257342,0.010840,0.012909,0.000736,0.000618
3,3,0.41,Good,D,SI1,63.8,56.0,4.68,4.72,3.00,Kimberly,1.038696,161.631220,1.139286,21.266667,155.609756,18.666667,136.585366,0.136667,1.560000,1.573333,0.991525,0.082,0.023362,3.51,3.681600,0.006426,0.007321,0.000417,0.000366
4,4,1.02,Ideal,G,SI1,60.5,59.0,6.55,6.51,3.95,Dubai,2.783967,165.127426,1.025424,15.316456,59.313725,14.936709,57.843137,0.258228,1.658228,1.648101,1.006144,0.204,0.058120,3.51,5.397532,0.016860,0.017288,0.000985,0.000961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,1.34,Ideal,G,VS1,62.7,57.0,7.10,7.04,4.43,Antwerp,3.531565,165.245612,1.100000,14.153499,46.791045,12.866817,42.537313,0.302483,1.602709,1.589165,1.008523,0.268,0.076353,3.51,5.641535,0.021372,0.023509,0.001340,0.001218
40451,40451,2.02,Good,F,SI2,57.1,60.0,8.31,8.25,4.73,Madrid,5.679106,160.533156,0.951667,12.071882,28.267327,12.684989,29.702970,0.427061,1.756871,1.744186,1.007273,0.404,0.115100,3.51,7.247093,0.035377,0.033667,0.001918,0.002016
40452,40452,1.01,Ideal,H,SI1,62.7,56.0,6.37,6.42,4.01,Kimberly,2.615479,162.366885,1.119643,15.635910,62.079208,13.965087,55.445545,0.251870,1.588529,1.600998,0.992212,0.202,0.057550,3.51,5.099177,0.016108,0.018036,0.001028,0.000918
40453,40453,0.33,Ideal,J,VS1,61.9,54.3,4.45,4.47,2.76,Kimberly,0.886923,166.365273,1.139963,22.427536,187.575758,19.673913,164.545455,0.119565,1.612319,1.619565,0.995526,0.066,0.018803,3.51,3.603533,0.005331,0.006077,0.000346,0.000304


In [5]:
#diamonds_test_scaling_win['scaling'] = 2


diamonds_test_scaling_win = diamonds_test_scaling_win[cols_diamonds]
diamonds_test_scaling_win

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,city,xyz/depth,xyz/carat,depth/table,depth/z,depth/carat,table/z,table/carat,carat/z,x/z,y/z,x/y,mass,volume,density,proportion,carat/depth,carat/table,volume/table,volume/depth
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam,2.006489,159.249197,1.045000,17.084469,79.367089,16.348774,75.949367,0.215259,1.585831,1.604905,0.988115,0.158,0.045014,3.51,4.670272,0.012600,0.013167,0.000750,0.000718
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat,3.215235,163.441135,1.070175,14.593301,50.833333,13.636364,47.500000,0.287081,1.629187,1.648325,0.988389,0.240,0.068376,3.51,5.612548,0.019672,0.021053,0.001200,0.001121
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly,3.969111,157.247587,1.019672,13.610503,39.617834,13.347921,38.853503,0.343545,1.614880,1.601751,1.008197,0.314,0.089459,3.51,5.910460,0.025241,0.025738,0.001467,0.001438
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly,2.282032,161.770700,1.181481,16.358974,70.888889,13.846154,60.000000,0.230769,1.561538,1.571795,0.993475,0.180,0.051282,3.51,4.786115,0.014107,0.016667,0.000950,0.000804
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam,1.303615,163.994710,1.084483,19.717868,125.800000,18.181818,116.000000,0.156740,1.583072,1.595611,0.992141,0.100,0.028490,3.51,4.028918,0.007949,0.008621,0.000491,0.000453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam,1.517360,164.780000,1.105357,18.757576,108.596491,16.969697,98.245614,0.172727,1.621212,1.612121,1.005639,0.114,0.032479,3.51,4.312424,0.009208,0.010179,0.000580,0.000525
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City,1.872623,164.052321,1.130909,17.471910,87.605634,15.449438,77.464789,0.199438,1.603933,1.609551,0.996510,0.142,0.040456,3.51,4.595267,0.011415,0.012909,0.000736,0.000650
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv,1.881473,165.569607,1.120000,17.450425,88.000000,15.580737,78.571429,0.198300,1.628895,1.617564,1.007005,0.140,0.039886,3.51,4.650496,0.011364,0.012727,0.000725,0.000648
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat,2.021682,169.821321,1.031579,17.043478,84.000000,16.521739,81.428571,0.202899,1.695652,1.707246,0.993209,0.140,0.039886,3.51,4.993696,0.011905,0.012281,0.000700,0.000678


In [6]:
scaler = RobustScaler()
#scaler = MinMaxScaler()
#scaler = StandardScaler()
#scaler = PowerTransformer()
#scaler = Normalizer()
#scaler = QuantileTransformer(output_distribution = 'normal', random_state=42)

In [7]:
# concat
diamonds_scaling_win = pd.concat([diamonds_scaling_win, diamonds_test_scaling_win])
diamonds_scaling_win.columns

Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x',
       'y', 'z', 'city', 'xyz/depth', 'xyz/carat', 'depth/table', 'depth/z',
       'depth/carat', 'table/z', 'table/carat', 'carat/z', 'x/z', 'y/z', 'x/y',
       'mass', 'volume', 'density', 'proportion', 'carat/depth', 'carat/table',
       'volume/table', 'volume/depth'],
      dtype='object')

In [8]:
diamonds_scaling_win = diamonds_scaling_win[['depth', 'table', 'x',
       'y', 'z', 'xyz/depth', 'xyz/carat', 'depth/table', 'depth/z',
       'depth/carat', 'table/z', 'table/carat', 'carat/z', 'x/z', 'y/z', 'x/y',
       'mass', 'volume', 'density', 'proportion', 'carat/depth', 'carat/table',
       'volume/table', 'volume/depth']]

In [9]:
scaled_data = scaler.fit_transform(diamonds_scaling_win)
scaled_data
scaled_df = pd.DataFrame(scaled_data)
scaled_df.set_axis(['depth', 'table', 'x',
       'y', 'z', 'xyz/depth', 'xyz/carat', 'depth/table', 'depth/z',
       'depth/carat', 'table/z', 'table/carat', 'carat/z', 'x/z', 'y/z', 'x/y',
       'mass', 'volume', 'density', 'proportion', 'carat/depth', 'carat/table',
       'volume/table', 'volume/depth'], axis='columns', inplace=True) #, copy = False

  scaled_df.set_axis(['depth', 'table', 'x',


In [10]:
# select data of train, delete test
#scaled_df = scaled_df.loc[scaled_df['scaling']<0]
scaled_df = scaled_df[:40455]
scaled_df

Unnamed: 0,depth,table,x,y,z,xyz/depth,xyz/carat,depth/table,depth/z,depth/carat,table/z,table/carat,carat/z,x/z,y/z,x/y,mass,volume,density,proportion,carat/depth,carat/table,volume/table,volume/depth
0,0.400000,0.333333,0.617486,0.593407,0.637168,0.747865,-0.155324,-0.098806,-0.483519,-0.372428,-0.517892,-0.374274,0.676283,-0.223409,-0.518416,0.708589,0.796875,0.796875,0.0,0.573133,0.751349,0.754596,0.754596,0.751349
1,0.800000,0.000000,-0.737705,-0.730769,-0.690265,-0.588179,0.068200,0.341743,0.911083,1.123460,0.780761,1.069225,-0.667744,-0.864579,-0.639276,-0.181157,-0.593750,-0.593750,0.0,-0.741569,-0.593698,-0.580112,-0.580112,-0.593698
2,2.466667,-0.666667,-0.043716,-0.098901,0.106195,-0.071323,-0.978718,1.625073,0.069602,0.046399,-0.257172,-0.046849,-0.043747,-1.933815,-2.547045,1.433765,0.015625,0.015625,0.0,-0.218033,-0.052327,0.058447,0.058447,-0.052327
3,1.333333,-0.333333,-0.557377,-0.543956,-0.469027,-0.469316,-0.487532,0.851542,0.632658,0.698641,0.402805,0.608653,-0.505647,-1.418810,-1.115708,-0.294660,-0.453125,-0.453125,0.0,-0.598059,-0.467106,-0.430659,-0.430659,-0.467106
4,-0.866667,0.666667,0.464481,0.439560,0.371681,0.532762,0.435268,-0.854581,-0.376026,-0.292710,-0.281344,-0.264404,0.464885,1.076401,0.721045,0.726284,0.500000,0.500000,0.0,0.534564,0.513430,0.441759,0.441759,0.513430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,0.600000,0.000000,0.765027,0.730769,0.796460,0.962009,0.466463,0.262879,-0.573171,-0.421629,-0.661004,-0.434108,0.818214,-0.333909,-0.726792,0.892379,1.000000,1.000000,0.0,0.695621,0.937490,0.986270,0.986270,0.937490
40451,-3.133333,1.000000,1.426230,1.395604,1.061947,2.195058,-0.777359,-1.959766,-0.926048,-0.612328,-0.694355,-0.576409,1.812834,3.582162,3.081468,0.805083,2.062500,2.062500,0.0,1.755390,2.253709,1.875423,1.875423,2.253709
40452,0.600000,-0.333333,0.366120,0.390110,0.424779,0.436022,-0.293358,0.557210,-0.321872,-0.264240,-0.459559,-0.290988,0.414127,-0.694117,-0.436109,-0.246723,0.484375,0.484375,0.0,0.337631,0.442845,0.507196,0.507196,0.442845
40453,0.066667,-0.900000,-0.683060,-0.681319,-0.681416,-0.556459,0.761989,0.861693,0.829450,1.027725,0.587554,0.918661,-0.642183,-0.089792,0.020027,-0.015291,-0.578125,-0.578125,0.0,-0.649588,-0.570032,-0.539557,-0.539557,-0.570032


In [11]:
diamonds_test_scaling_win = diamonds_test_scaling_win[['depth', 'table', 'x',
       'y', 'z', 'xyz/depth', 'xyz/carat', 'depth/table', 'depth/z',
       'depth/carat', 'table/z', 'table/carat', 'carat/z', 'x/z', 'y/z', 'x/y',
       'mass', 'volume', 'density', 'proportion', 'carat/depth', 'carat/table',
       'volume/table', 'volume/depth']]

In [12]:
scaled_data_test = scaler.fit_transform(diamonds_test_scaling_win)
scaled_data_test

array([[ 0.53333333,  1.        ,  0.06077348, ...,  0.08164823,
         0.08164823,  0.11020282],
       [-0.6       ,  0.        ,  0.60773481, ...,  0.77751322,
         0.77751322,  0.7783538 ],
       [ 0.2       ,  1.33333333,  0.92265193, ...,  1.19092851,
         1.19092851,  1.30447281],
       ...,
       [-0.2       , -0.66666667,  0.02209945, ...,  0.04287569,
         0.04287569, -0.00656922],
       [-2.06666667,  0.        ,  0.07734807, ...,  0.00346985,
         0.00346985,  0.04455218],
       [ 0.33333333, -0.66666667, -0.55801105, ..., -0.43843855,
        -0.43843855, -0.47452667]])

In [13]:
scaled_df_test = pd.DataFrame(scaled_data_test)
scaled_df_test.set_axis(['depth', 'table', 'x',
       'y', 'z', 'xyz/depth', 'xyz/carat', 'depth/table', 'depth/z',
       'depth/carat', 'table/z', 'table/carat', 'carat/z', 'x/z', 'y/z', 'x/y',
       'mass', 'volume', 'density', 'proportion', 'carat/depth', 'carat/table',
       'volume/table', 'volume/depth'], axis='columns', inplace=True)

  scaled_df_test.set_axis(['depth', 'table', 'x',


In [14]:
scaled_df_test

Unnamed: 0,depth,table,x,y,z,xyz/depth,xyz/carat,depth/table,depth/z,depth/carat,table/z,table/carat,carat/z,x/z,y/z,x/y,mass,volume,density,proportion,carat/depth,carat/table,volume/table,volume/depth
0,0.533333,1.000000,0.060773,0.094444,0.125000,0.081512,-1.116553,-0.555291,-0.072266,-0.084177,-0.015228,-0.064655,0.122389,-0.769944,-0.333893,-0.535731,0.140625,0.140625,0.0,0.047837,0.110203,0.081648,0.081648,0.110203
1,-0.600000,0.000000,0.607735,0.650000,0.580357,0.780535,-0.006204,-0.182063,-0.498331,-0.380789,-0.520035,-0.385061,0.698465,0.338002,0.731040,-0.516637,0.781250,0.781250,0.0,0.674289,0.778354,0.777513,0.777513,0.778354
2,0.200000,1.333333,0.922652,0.888889,0.928571,1.216504,-1.646734,-0.930780,-0.666419,-0.497376,-0.573717,-0.482441,1.151350,-0.027611,-0.411250,0.866163,1.359375,1.359375,0.0,0.872350,1.304473,1.190929,1.190929,1.304473
3,1.266667,-1.000000,0.209945,0.227778,0.330357,0.240859,-0.448664,1.468062,-0.196348,-0.172309,-0.480990,-0.244282,0.246795,-1.390739,-1.145940,-0.161596,0.312500,0.312500,0.0,0.124852,0.252563,0.390492,0.390492,0.252563
4,0.666667,0.333333,-0.364641,-0.350000,-0.303571,-0.324963,0.140425,0.030045,0.378125,0.398500,0.325920,0.386410,-0.346983,-0.840449,-0.561821,-0.254672,-0.312500,-0.312500,0.0,-0.378555,-0.329146,-0.319493,-0.319493,-0.329146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,0.000000,-0.333333,-0.198895,-0.222222,-0.205357,-0.201353,0.348431,0.339510,0.213886,0.219667,0.100332,0.186454,-0.218750,0.134215,-0.156900,0.687612,-0.203125,-0.203125,0.0,-0.190072,-0.210179,-0.182024,-0.182024,-0.210179
13481,0.200000,-0.666667,0.000000,0.005556,0.026786,0.004097,0.155685,0.718320,-0.006002,0.001464,-0.182603,-0.047587,-0.004506,-0.307362,-0.219947,0.050273,0.015625,0.015625,0.0,-0.002029,-0.001737,0.058920,0.058920,-0.001737
13482,-0.200000,-0.666667,0.022099,-0.005556,0.000000,0.009214,0.557580,0.556592,-0.009677,0.005564,-0.158167,-0.035124,-0.013633,0.330555,-0.023417,0.782985,0.000000,0.000000,0.0,0.034688,-0.006569,0.042876,0.042876,-0.006569
13483,-2.066667,0.000000,0.077348,0.094444,-0.071429,0.090298,1.683762,-0.754260,-0.079277,-0.036017,0.016963,-0.002946,0.023249,2.036523,2.176131,-0.180157,0.000000,0.000000,0.0,0.262858,0.044552,0.003470,0.003470,0.044552


In [15]:
stop

NameError: name 'stop' is not defined

In [16]:
scaled_df.to_csv('../../../001data/003preprocessing/004scaling/diamonds_scaling_num_without_carat.csv')

In [17]:
scaled_df_test.to_csv('../../../001data/003preprocessing/004scaling/diamonds_test_scaling_num_without_carat.csv')