# Instructor Do: Preparing Data for Neural Networks

In [1]:
# Initial imports
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler


## Loading the data

For this demo, we will use the [iris dataset](https://archive.ics.uci.edu/ml/datasets/iris). This dataset contains information about three different types of iris flowers (Setosa, Versicolour, and Virginica).

The rows are the samples, and the columns hold each flower's features: Sepal Length, Sepal Width, Petal Length, and Petal Width. A final `class` column holds the species label.

In [2]:
# Upload data to Colab
# google.colab only exists inside a Colab runtime, so the import is guarded:
# on any other kernel the upload prompt is skipped and iris.csv is expected to
# already be in the working directory when it is read in the following cell.
try:
    from google.colab import files
except ImportError:
    files = None

# In Colab this opens the upload widget and saves the chosen file to the
# working directory; outside Colab there is nothing to upload.
csv_file = files.upload() if files is not None else None

Saving iris.csv to iris.csv


In [3]:
# Read the iris CSV from the working directory into a DataFrame and preview
# its first rows
iris_csv_path = 'iris.csv'
data = pd.read_csv(iris_csv_path)
data.head()


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## One Hot Encoding

In [4]:
# Create the OneHotEncoder instance
# No arguments are passed, so the library defaults apply; the repr printed
# after fitting shows them (e.g. handle_unknown='error', sparse=True).
enc = OneHotEncoder()



In [5]:
# Fit the OneHotEncoder
# The encoder is fitted on a 2-D, single-column array; selecting the label
# column with a list of names yields that (n_samples, 1) shape directly.
class_values = data[["class"]].values
enc.fit(class_values)



OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [6]:
# Fetch the categories identified by the OneHotEncoder
# The result is a list holding one array of labels per fitted column; only the
# `class` column was fitted, so the list has a single entry.
enc.categories_



[array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)]

In [7]:
# Encode iris categories
# The encoder was created with sparse output, so the transformed matrix is
# converted to a dense array in a second step.
class_encoded_sparse = enc.transform(class_values)
class_encoded = class_encoded_sparse.toarray()



In [8]:
# Create a DataFrame with the encoded class data
# Column labels are taken from the fitted encoder instead of a hand-typed
# list, so they always match the order and number of the encoded columns.
# Everything up to the first "-" is dropped ("Iris-setosa" -> "setosa") to
# keep the short species names.
class_labels = [str(label).split("-", 1)[-1] for label in enc.categories_[0]]
class_encoded_df = pd.DataFrame(class_encoded, columns=class_labels)
class_encoded_df.head()



Unnamed: 0,setosa,versicolor,virginica
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0


In [9]:
# Some samples of the encoded data
# Show two consecutive rows starting at each of these positions.
for first_row in (1, 80, 100):
    display(class_encoded_df.iloc[first_row:first_row + 2])



Unnamed: 0,setosa,versicolor,virginica
1,1.0,0.0,0.0
2,1.0,0.0,0.0


Unnamed: 0,setosa,versicolor,virginica
80,0.0,1.0,0.0
81,0.0,1.0,0.0


Unnamed: 0,setosa,versicolor,virginica
100,0.0,0.0,1.0
101,0.0,0.0,1.0


## Scaling Data

In [10]:
# Select the columns with numerical values to be scaled
# The leading columns are the numeric measurements; the trailing `class`
# column is left out.
n_feature_columns = 4
data_to_scale = data.iloc[:, :n_feature_columns]
data_to_scale.head()



Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [11]:
# Create the StandardScaler instance
# No arguments are passed, so the library defaults apply; the repr printed
# after fitting shows with_mean=True and with_std=True.
scaler = StandardScaler()



In [12]:
# Fit the StandardScaler
# Fitting uses only the selected numeric feature columns (data_to_scale); the
# data itself is not modified until transform is called.
scaler.fit(data_to_scale)



StandardScaler(copy=True, with_mean=True, with_std=True)

In [13]:
# Scale the data
# Applies the already-fitted scaler to the same feature columns; the result is
# array-like without column labels, which are re-attached when it is wrapped in
# a DataFrame.
scaled_data = scaler.transform(data_to_scale)



In [14]:
# Create a DataFrame with the scaled data
# Reuse the labels of the first four columns of the original frame so the
# scaled values keep their feature names.
feature_columns = data.columns[:4]
features_scaled_data = pd.DataFrame(scaled_data, columns=feature_columns)
features_scaled_data.head()


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-0.900681,1.032057,-1.341272,-1.312977
1,-1.143017,-0.124958,-1.341272,-1.312977
2,-1.385353,0.337848,-1.398138,-1.312977
3,-1.506521,0.106445,-1.284407,-1.312977
4,-1.021849,1.26346,-1.341272,-1.312977
