# FEATURE SCALING EXERCISE

# Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.preprocessing import MinMaxScaler,StandardScaler

# Problem: The Boston House Pricing

## Dataset Description

Scikit-learn toy datasets: the scikit-learn package for Python ships with a few small standard datasets that do not require downloading any file from an external website.
https://scikit-learn.org/stable/datasets/toy_dataset.html

The Boston House Pricing data was taken from a research paper, which can be found at:
[Source: Original research paper](https://deepblue.lib.umich.edu/bitstream/handle/2027.42/22636/0000186.pdf?sequence=1&isAllowed=y)

## Loading Data

In [None]:
# Load the Boston housing dataset from sklearn.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell (and the matching import) needs scikit-learn < 1.2 to run.
boston_dataset = load_boston()

## Exploring Data

In [None]:
# View the loaded object by evaluating it as the last expression of the cell.
boston_dataset
# It displays the loaded dataset in its raw form.

In [None]:
# Let us see the type of the loaded object.
type(boston_dataset)
# It shows a special object type, Bunch.

In [None]:
# The dir function lists all the attributes and methods of this Bunch object.
dir(boston_dataset)

In [None]:
# Let us explore these attributes one by one.
# Print the description of the dataset using the DESCR attribute.
print(boston_dataset.DESCR)

In [None]:
# Print the 'data' attribute.
print(boston_dataset.data)
# These are the 13 features stored in an ndarray (try writing a command to check/verify this datatype).

In [None]:
# Print the 'feature_names' attribute (the labels of the feature columns).
print(boston_dataset.feature_names)

In [None]:
# Print the 'filename' attribute.
print(boston_dataset.filename)
# This is the CSV file the dataset was loaded from (toy datasets ship with
# scikit-learn, so nothing is downloaded from an external website).

In [None]:
# Print the 'target' attribute.
print(boston_dataset.target)
# The target is another ndarray; use the type function to verify this.

In [None]:
#TASK FOR YOU
#Print shape of the two ndarrays of features (boston_dataset.data) and target (boston_dataset.target).


## Creating Dataframe

In [None]:
# Build a dataframe from the two ndarrays held by boston_dataset:
# 'data' supplies the feature columns (labelled with feature_names) and
# 'target' is appended as the PRICE column.
dataset = pd.DataFrame(
    boston_dataset.data, columns=boston_dataset.feature_names
).assign(PRICE=boston_dataset.target)

In [None]:
# Let's see the dataframe we have created.
dataset

In [None]:
#TASK FOR YOU
#Run info and describe methods on this dataframe to explore the statistics info


## Scaling Data - Normalization

In [None]:
# Normalize the PTRATIO column with the min-max formula discussed in class:
#   x_scaled = (x - min) / (max - min)
ptratio = dataset['PTRATIO']
ptratio_min, ptratio_max = ptratio.min(), ptratio.max()
dataset['PTRATIO'] = (ptratio - ptratio_min) / (ptratio_max - ptratio_min)
# Display the dataset to check the result.
dataset

In [None]:
# Check the min and max value of the normalized PTRATIO column.
ptratio_scaled = dataset['PTRATIO']
print('Minimum value is', ptratio_scaled.min())
print('Maximum value is', ptratio_scaled.max())

In [None]:
# An alternative way is to use the scalers from the sklearn package.
# Here MinMaxScaler is used to normalize every column of the dataset.

# It is a 2-step process: first define an object of type MinMaxScaler,
# then call its fit_transform method to perform the normalization.
# fit_transform takes an array-like of shape (n_samples, n_features) and
# returns an ndarray of the same shape.

# Define the min-max scaler object.
minmax_scaler = MinMaxScaler()

# To normalize a single column (e.g. LSTAT) instead, select it with double
# brackets so it stays 2-D, then flatten the result before assigning it back:
# dataset['LSTAT'] = minmax_scaler.fit_transform(dataset[['LSTAT']]).ravel()

# Normalize all the columns in one go (this includes the PRICE target column).
# fit_transform returns a bare ndarray, so the column labels and the index are
# taken from the existing dataframe rather than from a hard-coded list that
# could drift out of sync with the data.
column_headers = list(dataset.columns)
dataset = pd.DataFrame(
    minmax_scaler.fit_transform(dataset),
    columns=column_headers,
    index=dataset.index,
)
dataset

In [None]:
# Check the min and max value of the normalized LSTAT column.
lstat_scaled = dataset['LSTAT']
print('Minimum value is', lstat_scaled.min())
print('Maximum value is', lstat_scaled.max())

## Scaling Data - Standardization

In [None]:
# Run the describe method and observe the mean and standard deviation of the various features.
dataset.describe()
# See row 2 ('mean') and row 3 ('std') for the mean and standard deviation respectively.

In [None]:
# Standardize the ZN column with the formula x_scaled = (x - mean) / std.
# mean() and std() can be called on a single feature or on all features.
# Note: pandas' std() computes the sample standard deviation (ddof=1) by default.
zn = dataset['ZN']
dataset['ZN'] = (zn - zn.mean()) / zn.std()
# Check the dataset description now.
dataset.describe()

In [None]:
# Let us view the dataset too.
dataset

In [None]:
# Check the min and max value of the standardized ZN column.
zn_scaled = dataset['ZN']
print('Minimum value is', zn_scaled.min())
print('Maximum value is', zn_scaled.max())

In [None]:
# An alternative way is to use the scalers from the sklearn package.
# Let us use it to standardize the INDUS column.

# It is a 2-step process: first define an object of type StandardScaler,
# then call its fit_transform method to perform the standardization.
# fit_transform takes an array-like of shape (n_samples, n_features) and
# returns an ndarray of the same shape.

# Define the standard scaler object.
stan_scaler = StandardScaler()

# Standardize the data. The double brackets keep the selection 2-D, as
# fit_transform requires; the returned (n_samples, 1) ndarray is flattened and
# assigned directly, so the values are placed by position instead of relying
# on index alignment with a temporary dataframe.
dataset['INDUS'] = stan_scaler.fit_transform(dataset[['INDUS']]).ravel()

# This method can also be run on the whole dataset at once to standardize all
# the columns in one go. Try it out if you want using the following command
# (columns and index are passed so the column labels are not lost):
# dataset = pd.DataFrame(stan_scaler.fit_transform(dataset), columns=dataset.columns, index=dataset.index)

In [None]:
# Let us check the dataset description now.
dataset.describe()

In [None]:
# Check the min and max value of the standardized INDUS column.
indus_scaled = dataset['INDUS']
print('Minimum value is', indus_scaled.min())
print('Maximum value is', indus_scaled.max())
# Observe that the range is pretty much similar to that of ZN, which we standardized earlier.