In [149]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [150]:
# load data
data = pd.read_csv('data_section2/iris_cleaned.csv')
data

Unnamed: 0,measurement.number,sepal.length,sepal.width,petal.length,petal.width,variety
0,1,5.1,3.5,1.4,0.2,Setosa
1,2,4.9,3.0,1.4,0.2,Setosa
2,3,4.7,3.2,1.3,0.2,Setosa
3,4,4.6,3.1,1.5,0.2,Setosa
4,5,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...,...
144,146,6.7,3.0,5.2,2.3,Virginica
145,147,6.3,2.5,5.0,1.9,Virginica
146,148,6.5,3.0,5.2,2.0,Virginica
147,149,6.2,3.4,5.4,2.3,Virginica


In [151]:
# standardize the data
scaler = StandardScaler()
data_standardized = scaler.fit_transform(data[['sepal.length', 'sepal.width', 'petal.length', 'petal.width']])
# transform the np array back to a dataframe
data_standardized = pd.DataFrame(data_standardized, columns=['sepal.length', 'sepal.width', 'petal.length', 'petal.width'])
data_standardized

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,-0.898033,1.017229,-1.352906,-1.308624
1,-1.139562,-0.134497,-1.352906,-1.308624
2,-1.381091,0.326193,-1.410370,-1.308624
3,-1.501855,0.095848,-1.295442,-1.308624
4,-1.018798,1.247574,-1.352906,-1.308624
...,...,...,...,...
144,1.034197,-0.134497,0.830718,1.454321
145,0.551140,-1.286222,0.715791,0.928046
146,0.792668,-0.134497,0.830718,1.059614
147,0.430375,0.786884,0.945646,1.454321


In [152]:
# verify the mean and standard deviation
print(data_standardized.mean())
print(data_standardized.std())

# mean should be ~0 and standard deviation should be ~1

sepal.length    1.049124e-15
sepal.width    -3.338120e-16
petal.length   -4.291869e-16
petal.width    -4.768743e-16
dtype: float64
sepal.length    1.003373
sepal.width     1.003373
petal.length    1.003373
petal.width     1.003373
dtype: float64


In [153]:
# normalization 
min_max_scaler = MinMaxScaler()
data_normalized = min_max_scaler.fit_transform(data[['sepal.length', 'sepal.width', 'petal.length', 'petal.width']])
# transform the np array back to a dataframe
data_normalized = pd.DataFrame(data_normalized, columns=['sepal.length', 'sepal.width', 'petal.length', 'petal.width'])


In [154]:
# minimum (0) and maximum values (1)
print(data_normalized.min())
print(data_normalized.max())

sepal.length    0.0
sepal.width     0.0
petal.length    0.0
petal.width     0.0
dtype: float64
sepal.length    1.0
sepal.width     1.0
petal.length    1.0
petal.width     1.0
dtype: float64


In [155]:
# new feature petal_ratio
data_standardized['petal_ratio'] = data_standardized['petal.length'] / data_standardized['petal.width']
data_standardized['sepal_ratio'] = data_standardized['sepal.length'] / data_standardized['sepal.width']
data_standardized

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,petal_ratio,sepal_ratio
0,-0.898033,1.017229,-1.352906,-1.308624,1.033839,-0.882823
1,-1.139562,-0.134497,-1.352906,-1.308624,1.033839,8.472782
2,-1.381091,0.326193,-1.410370,-1.308624,1.077750,-4.233964
3,-1.501855,0.095848,-1.295442,-1.308624,0.989927,-15.669088
4,-1.018798,1.247574,-1.352906,-1.308624,1.033839,-0.816623
...,...,...,...,...,...,...
144,1.034197,-0.134497,0.830718,1.454321,0.571207,-7.689381
145,0.551140,-1.286222,0.715791,0.928046,0.771288,-0.428495
146,0.792668,-0.134497,0.830718,1.059614,0.783982,-5.893585
147,0.430375,0.786884,0.945646,1.454321,0.650232,0.546936


In [156]:
# is petal_ratio standardized?
print(data_standardized['petal_ratio'].mean())
print(data_standardized['petal_ratio'].std())

2.3895225441863976
8.185198999462717


In [157]:
# petal_ratio is not standardized
# standardize petal_ratio
data_standardized['petal_ratio'] = scaler.fit_transform(data_standardized[['petal_ratio']])
data_standardized

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,petal_ratio,sepal_ratio
0,-0.898033,1.017229,-1.352906,-1.308624,-0.166185,-0.882823
1,-1.139562,-0.134497,-1.352906,-1.308624,-0.166185,8.472782
2,-1.381091,0.326193,-1.410370,-1.308624,-0.160802,-4.233964
3,-1.501855,0.095848,-1.295442,-1.308624,-0.171568,-15.669088
4,-1.018798,1.247574,-1.352906,-1.308624,-0.166185,-0.816623
...,...,...,...,...,...,...
144,1.034197,-0.134497,0.830718,1.454321,-0.222896,-7.689381
145,0.551140,-1.286222,0.715791,0.928046,-0.198369,-0.428495
146,0.792668,-0.134497,0.830718,1.059614,-0.196813,-5.893585
147,0.430375,0.786884,0.945646,1.454321,-0.213209,0.546936


In [158]:
# verify the mean and standard deviation of petal_ratio
print(data_standardized['petal_ratio'].mean())
print(data_standardized['petal_ratio'].std())

1.1921857982552016e-17
1.0033726908565714


In [159]:
# compute mean ratios for different varieties
mean_ratios = data.groupby('variety').mean()
mean_ratios['petal_ratio'] = mean_ratios['petal.length'] / mean_ratios['petal.width']
mean_ratios

Unnamed: 0_level_0,measurement.number,sepal.length,sepal.width,petal.length,petal.width,petal_ratio
variety,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Setosa,25.5,5.006,3.424,1.518,0.246,6.170732
Versicolor,75.5,5.936,2.77,4.26,1.326,3.21267
Virginica,125.142857,6.604082,2.979592,5.520408,2.028571,2.721328
