In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats
import sklearn.preprocessing
import sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
import wrangle





## 1. Apply the scalers we talked about in this lesson to your data and visualize the results for the unscaled and scaled distributions.

In [2]:
# get the data: acquire the raw Zillow frame through the project-local wrangle module
# (how the data is fetched is defined in that module, not in this notebook)
df = wrangle.get_zillow_data()

In [3]:
# acquired data: preview the first rows of the raw frame
df.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips,propertylandusetypeid
0,0.0,0.0,,27516.0,,,6037.0,261.0
1,0.0,0.0,,10.0,,,6037.0,261.0
2,0.0,0.0,,10.0,,,6037.0,261.0
3,0.0,0.0,,2108.0,,174.21,6037.0,261.0
4,4.0,2.0,3633.0,296425.0,2005.0,6941.39,6037.0,261.0


In [4]:
# Check data to ensure it is all continuous data
# NOTE(review): a float64 dtype alone does not make a column continuous —
# fips and propertylandusetypeid look like identifier codes; confirm before scaling them.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152864 entries, 0 to 2152863
Data columns (total 8 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   bedroomcnt                    float64
 1   bathroomcnt                   float64
 2   calculatedfinishedsquarefeet  float64
 3   taxvaluedollarcnt             float64
 4   yearbuilt                     float64
 5   taxamount                     float64
 6   fips                          float64
 7   propertylandusetypeid         float64
dtypes: float64(8)
memory usage: 131.4 MB


In [5]:
# Clean the raw frame with the project-local prep helper.
# NOTE(review): this rebinds `df`, so the raw frame previewed above is no longer reachable,
# and re-running this cell on its own feeds already-prepped data back into prep_zillow.
df = wrangle.prep_zillow(df)

In [None]:
# Split the prepped data into train / validate / test partitions
# (split proportions and any seeding are decided inside wrangle.split_zillow — not visible here)
train, validate, test = wrangle.split_zillow(df)

In [None]:
# report the shape of each split, one line per split
split_shapes = [
    f"train: {train.shape}",
    f"validate: {validate.shape}",
    f"test: {test.shape}",
]
print("\n".join(split_shapes))

In [None]:
# visualize before scaling: one histogram per column of train.
# Each figure gets a title and axis labels so the plots can be told apart
# when the notebook is skimmed.
for col in train.columns:
    fig, ax = plt.subplots()
    ax.hist(train[col], bins=100)
    ax.set(title=f'{col} before scaling', xlabel=col, ylabel='count')

In [None]:
# build the min-max scaler and learn its parameters from the training split only
scaler = sklearn.preprocessing.MinMaxScaler().fit(train)

# apply the train-fitted scaler to every split
train_scaled, validate_scaled, test_scaled = (
    scaler.transform(split) for split in (train, validate, test)
)

In [None]:
# Wrap the scaled array in a DataFrame, restoring train's column names and row index.
# scaler.transform returns a bare array, so without these both would fall back to 0..n
# and the scaled columns could no longer be matched to train by label.
train_scaled = pd.DataFrame(train_scaled, columns=train.columns, index=train.index)

In [None]:
# visualize the scaled data.
# train_scaled was produced by transforming train, so its columns are in train's order;
# pairing them positionally gives every figure a meaningful title whatever labels
# train_scaled itself carries.
for name, col in zip(train.columns, train_scaled.columns):
    fig, ax = plt.subplots()
    ax.hist(train_scaled[col], bins=100)
    ax.set(title=f'{name} after MinMaxScaler', xlabel=name, ylabel='count')


## 2. Apply the .inverse_transform method to your scaled data. Is the resulting dataset the exact same as the original data?

In [None]:
# undo the min-max scaling on each scaled split with the same fitted scaler
train_inverse_scaled, validate_inverse_scaled, test_inverse_scaled = map(
    scaler.inverse_transform,
    (train_scaled, validate_scaled, test_scaled),
)

In [None]:
# Wrap the inverse-transformed array in a DataFrame with train's column names and row
# index, so it can be compared against train label-for-label instead of by position.
train_inverse_scaled = pd.DataFrame(train_inverse_scaled, columns=train.columns, index=train.index)

In [None]:
# visualize the inverse-transformed data.
# bins=100 matches the pre-scaling histograms so the two sets of plots can be compared
# directly; columns are paired positionally with train's names for the titles.
for name, col in zip(train.columns, train_inverse_scaled.columns):
    fig, ax = plt.subplots()
    ax.hist(train_inverse_scaled[col], bins=100)
    ax.set(title=f'{name} after inverse_transform', xlabel=name, ylabel='count')

## 3. Read the documentation for sklearn's QuantileTransformer. Use normal for the output_distribution and apply this scaler to your data. Visualize the result of your data scaling.

In [None]:
# quantile transformer mapping every column onto a normal distribution.
# random_state is pinned because QuantileTransformer estimates its quantiles from a random
# subsample when the data has more rows than its `subsample` setting, so an unseeded fit
# can differ from run to run.
qt = sklearn.preprocessing.QuantileTransformer(output_distribution='normal', random_state=42)

# learn the quantiles from train only, then reuse them for validate and test
train_quantile_scaled = qt.fit_transform(train)
validate_quantile_scaled = qt.transform(validate)
test_quantile_scaled = qt.transform(test)



In [None]:
# Wrap the transformed array in a DataFrame, restoring train's column names and row index
# (the transformer returns a bare array, which would otherwise be labelled 0..n).
train_quantile_scaled = pd.DataFrame(train_quantile_scaled, columns=train.columns, index=train.index)

In [None]:
# visualize the quantile-transformed (normal output) data.
# Columns are paired positionally with train's names so each figure has a readable title.
for name, col in zip(train.columns, train_quantile_scaled.columns):
    fig, ax = plt.subplots()
    ax.hist(train_quantile_scaled[col])
    ax.set(title=f'{name} after QuantileTransformer (normal)', xlabel=name, ylabel='count')

## 4. Use the QuantileTransformer, but omit the output_distribution argument. Visualize your results. What do you notice?

In [None]:
# quantile transformer with output_distribution left at its default (uniform).
# random_state is pinned because the quantiles are estimated from a random subsample when
# the data has more rows than the transformer's `subsample` setting.
# NOTE: this rebinds qt and the *_quantile_scaled names from the normal-output section.
qt = sklearn.preprocessing.QuantileTransformer(random_state=42)

# learn the quantiles from train only, then reuse them for validate and test
train_quantile_scaled = qt.fit_transform(train)
validate_quantile_scaled = qt.transform(validate)
test_quantile_scaled = qt.transform(test)


In [None]:
# Wrap the transformed array in a DataFrame, restoring train's column names and row index
# (the transformer returns a bare array, which would otherwise be labelled 0..n).
train_quantile_scaled = pd.DataFrame(train_quantile_scaled, columns=train.columns, index=train.index)

In [None]:
# visualize the quantile-transformed (default output distribution) data.
# Columns are paired positionally with train's names so each figure has a readable title.
for name, col in zip(train.columns, train_quantile_scaled.columns):
    fig, ax = plt.subplots()
    ax.hist(train_quantile_scaled[col])
    ax.set(title=f'{name} after QuantileTransformer (default)', xlabel=name, ylabel='count')

## 5. Based on the work you've done, choose a scaling method for your dataset. Write a function within your prepare.py that accepts as input the train, validate, and test data splits, and returns the scaled versions of each. Be sure to only learn the parameters for scaling from your training data!

In [None]:
# preview the training split to see which columns are available for scaling
train.head()

In [None]:
def scale_data(train,
               validate,
               test,
               to_scale):
    """Min-max scale selected columns of the three splits, fitting on train only.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        The data splits. None of them is modified; scaled copies are returned.
    to_scale : list of str
        Labels of the columns to scale. All other columns are carried over unchanged.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_scaled, validate_scaled, test_scaled)``; each is a copy of the
        corresponding input with the ``to_scale`` columns replaced by scaled values.
    """
    #duplicate data for scaling so the caller's frames stay untouched
    train_scaled = train.copy()
    # copy validate here, not test: starting from a copy of test would give the validate
    # output test's rows and index (or fail outright when the two splits differ in length)
    validate_scaled = validate.copy()
    test_scaled = test.copy()

    #Make Scaler MinMax
    scaler = MinMaxScaler()

    #Fit Scaler on the training split only, so nothing is learned from validate/test
    scaler.fit(train[to_scale])

    #Use Scaler: the same train-derived parameters are applied to every split
    train_scaled[to_scale] = scaler.transform(train[to_scale])
    validate_scaled[to_scale] = scaler.transform(validate[to_scale])
    test_scaled[to_scale] = scaler.transform(test[to_scale])

    return train_scaled, validate_scaled, test_scaled

In [None]:
#check data: the unscaled training split, for comparison with the scaled frame shown next
train.head()

In [None]:
# NOTE(review): scale_data is defined above but no call to it is visible in this notebook,
# so this appears to show the train_scaled frame built in the MinMaxScaler section rather
# than scale_data's output — confirm.
train_scaled.head()

# Instructor functions for scaling 
Below is the `visualize_scaler` function from the instructor. A cleaner approach to visualizations will be used for the final project.

In [None]:
# columns handed to the scaler visualizations below
# (presumably the column names produced by wrangle.prep_zillow — confirm against train.columns)
to_scale = ['bedrooms','bathrooms','sqft','year_built','sale_tax']

In [None]:
def visualize_scaler(scaler, df, columns_to_scale, bins=10):
    """Plot each column's histogram before and after scaling, side by side.

    Parameters
    ----------
    scaler : object with a ``fit_transform`` method
        Fitted on ``df[columns_to_scale]`` by this call.
    df : pandas.DataFrame
        Source data. It is not modified; a copy receives the scaled values.
    columns_to_scale : list of str
        Column labels to scale and plot, one subplot row per column.
    bins : int, default 10
        Bin count passed to both histograms in every row.

    Raises
    ------
    ValueError
        If ``columns_to_scale`` is empty.
    """
    if len(columns_to_scale) == 0:
        raise ValueError('columns_to_scale must name at least one column')

    #create subplot structure; squeeze=False keeps axs two-dimensional, so a single
    #column still yields one (before, after) row instead of a flat pair of axes
    fig, axs = plt.subplots(len(columns_to_scale), 2, figsize=(12,12), squeeze=False)

    #copy the df for scaling
    df_scaled = df.copy()

    #fit and transform the df
    df_scaled[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

    #plot the pre-scaled data next to the post-scaled data in one row of a subplot
    for (ax1, ax2), col in zip(axs, columns_to_scale):
        ax1.hist(df[col], bins=bins)
        ax1.set(title=f'{col} before scaling', xlabel=col, ylabel='count')
        ax2.hist(df_scaled[col], bins=bins)
        ax2.set(title=f'{col} after scaling with {scaler.__class__.__name__}', xlabel=col, ylabel='count')
    fig.tight_layout()

In [None]:
# MinMaxScaler: before/after histograms for every column in to_scale
visualize_scaler(MinMaxScaler(), train, to_scale, bins=50)

In [None]:
# StandardScaler: before/after histograms for every column in to_scale
visualize_scaler(StandardScaler(), train, to_scale, bins=50)


In [None]:
# RobustScaler: before/after histograms for every column in to_scale
visualize_scaler(RobustScaler(), train, to_scale, bins=50)


In [None]:
# call function using QuantileTransformer with a normal output distribution.
# random_state is pinned because the transformer estimates quantiles from a random
# subsample on large inputs, so an unseeded run is not reproducible.
visualize_scaler(scaler=QuantileTransformer(output_distribution='normal', random_state=42),
                 df=train,
                 columns_to_scale=to_scale,
                 bins=50)

In [None]:
# call function using QuantileTransformer with output_distribution left at its default.
# random_state is pinned because the transformer estimates quantiles from a random
# subsample on large inputs, so an unseeded run is not reproducible.
visualize_scaler(scaler=QuantileTransformer(random_state=42),
                 df=train,
                 columns_to_scale=to_scale,
                 bins=50)
