## Standardization

Prepare the dataset by performing the preprocessing techniques, to have the standard scale to data

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import seaborn as sns
import numpy as np

In [2]:
sns.set()

In [3]:
seeds = pd.read_csv('Seeds_data.csv')

### Exploratory Data Analysis

In [4]:
seeds.sample(5)

Unnamed: 0,Area,Perimeter,Compactness,length,Width,Assymetry_coeff,len_ker_grove,Type
102,19.46,16.5,0.8985,6.113,3.892,4.308,6.009,2
38,14.8,14.52,0.8823,5.656,3.288,3.112,5.309,1
117,19.15,16.45,0.889,6.245,3.815,3.084,6.185,2
133,16.16,15.33,0.8644,5.845,3.395,4.266,5.795,2
5,14.38,14.21,0.8951,5.386,3.312,2.462,4.956,1


In [5]:
seeds.shape

(210, 8)

In [6]:
seeds.dtypes

Area               float64
Perimeter          float64
Compactness        float64
length             float64
Width              float64
Assymetry_coeff    float64
len_ker_grove      float64
Type                 int64
dtype: object

In [7]:
seeds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area             210 non-null    float64
 1   Perimeter        210 non-null    float64
 2   Compactness      210 non-null    float64
 3   length           210 non-null    float64
 4   Width            210 non-null    float64
 5   Assymetry_coeff  210 non-null    float64
 6   len_ker_grove    210 non-null    float64
 7   Type             210 non-null    int64  
dtypes: float64(7), int64(1)
memory usage: 13.2 KB


Animals , Gender   ,Homly    and Types     are nominal features. We will apply onehot encoding to these features.

In [8]:
seeds.isnull().sum().sum()

0

In [9]:
seeds.duplicated().sum()

0

### Data Preprocessing

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

#Create a columntransformer with StandardScaler to scale mean to zero
preprocessor = ColumnTransformer(
    transformers=[
        ("scaling", StandardScaler(), slice(0,8)),
    ]
)


### Create Preprocessing Pipeline

In [11]:
#Create a preprocessing pipeline
pipe = Pipeline(
    steps=[("preprocessor", preprocessor)]
)

In [12]:
# Apply the tranformation
seeds_transformed = pipe.fit_transform(seeds) 

In [13]:
seeds_transformed.shape

(210, 8)

In [14]:
#View transformed data
seeds_transformed

array([[ 1.42097769e-01,  2.15462437e-01,  6.06017918e-05, ...,
        -9.86151745e-01, -3.83577423e-01, -1.22474487e+00],
       [ 1.11880257e-02,  8.22375713e-03,  4.28515270e-01, ...,
        -1.78816620e+00, -9.22013487e-01, -1.22474487e+00],
       [-1.92066576e-01, -3.60200562e-01,  1.44238325e+00, ...,
        -6.67479334e-01, -1.18919199e+00, -1.22474487e+00],
       ...,
       [-5.67570840e-01, -6.90247348e-01,  7.33948301e-01, ...,
         3.07658816e+00, -7.18060432e-01,  1.22474487e+00],
       [-1.03608992e+00, -1.03564515e+00, -8.01701104e-01, ...,
        -6.81351965e-02, -7.42534799e-01,  1.22474487e+00],
       [-8.77620233e-01, -9.35863561e-01, -1.10234659e-01, ...,
         1.29122264e+00, -7.03783718e-01,  1.22474487e+00]])

In [15]:
#convert array to dataframe 
seeds_transformed_df= pd.DataFrame(seeds_transformed)

In [16]:
#add column names
seeds_transformed_df.columns= seeds.columns

In [17]:
#View standardized dataframe
seeds_transformed_df

Unnamed: 0,Area,Perimeter,Compactness,length,Width,Assymetry_coeff,len_ker_grove,Type
0,0.142098,0.215462,0.000061,0.304218,0.140671,-0.986152,-0.383577,-1.224745
1,0.011188,0.008224,0.428515,-0.168625,0.196494,-1.788166,-0.922013,-1.224745
2,-0.192067,-0.360201,1.442383,-0.763637,0.207127,-0.667479,-1.189192,-1.224745
3,-0.347091,-0.475333,1.039381,-0.688978,0.318773,-0.960818,-1.229983,-1.224745
4,0.445257,0.330595,1.374509,0.066666,0.805230,-1.563495,-0.475356,-1.224745
...,...,...,...,...,...,...,...,...
205,-0.915515,-1.043321,0.309736,-1.112048,-0.739205,-0.046135,-1.097413,1.224745
206,-1.246235,-1.288937,-0.844122,-1.105261,-1.233636,0.416540,-0.826156,1.224745
207,-0.567571,-0.690247,0.733948,-0.888070,-0.071988,3.076588,-0.718060,1.224745
208,-1.036090,-1.035645,-0.801701,-1.026077,-1.124649,-0.068135,-0.742535,1.224745
