# EXPLORATORY NOTEBOOK

In [1]:
#python libraries
import pandas as pd
import numpy as np
import os

#import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns 
#import project modules (env credentials, wrangle and explore helpers)
from env import hostname, user, password
import wrangle as w
import explore as e

#Import scikit-learn 
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler,QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score



#turn off warnings
import warnings
warnings.filterwarnings("ignore")

## ACQUIRE DATA

In [None]:
# Acquire Zillow_2017 data and query necessary columns (done by the wrangle helper)
df = w.get_zillow_data()
# Preview the first rows of the raw frame
df.head()

In [None]:
# Column dtypes and non-null counts of the raw frame
df.info()

In [None]:
# (rows, columns) before any preparation
df.shape

In [None]:
# Number of missing values per column
df.isnull().sum()

## PREPARE DATA

In [None]:
# Prepare the data by renaming columns to enhance readability and dropping null values & duplicates.
# NOTE(review): the behaviour described above lives in wrangle.prep_zillow — confirm there.
df = w.prep_zillow(df)
# Preview the first rows of the prepared frame
df.head()

In [None]:
# Column dtypes and non-null counts after preparation
df.info()


In [None]:
# Missing values per column after preparation
df.isnull().sum()

In [None]:
# Same summary as above (repeated check)
df.info()

In [None]:
# (rows, columns) after preparation
df.shape

In [None]:
# Preview the first rows again
df.head()

In [None]:
# Frequency of each bedroom count
df.bedrooms.value_counts()

In [None]:
# master clean renames fips to county, removes outliers for the bedrooms, bathrooms, and sqft columns
# & changes data types to integers
# NOTE(review): the behaviour described above lives in wrangle.master_clean_zillow — confirm there.
df = w.master_clean_zillow(df)
# Preview the first rows of the cleaned frame
df.head()

In [None]:
# Column dtypes and non-null counts after cleaning
df.info()

In [None]:
# (rows, columns) after cleaning
df.shape

In [None]:
# Frequency of each bedroom count after cleaning
df.bedrooms.value_counts()

In [None]:
# Frequency of each bathroom count after cleaning
df.bathrooms.value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for lot size
df.lot_size.describe()

In [None]:
# Lot size against tax value, coloured by county
sns.scatterplot(x= 'lot_size', y="tax_value", data=df, hue = 'county')

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for square footage
df.sqft.describe()

#### <span style='color:green'>Outliers are sale_tax $40+ and any sqft above 8K</span>

In [None]:
# Square footage against tax value, coloured by county
sns.scatterplot(x= 'sqft', y="tax_value", data=df, hue = 'county')

In [None]:
# Clean data divided by original data = share of the original data retained after the
# preparation and cleaning process.
# NOTE(review): both counts are hard-coded; presumably copied from earlier df.shape outputs — confirm.
50282/52441

## SPLIT DATA

In [None]:
# Split the data into train, validate, and test subsets
train, validate, test = w.split_zillow(df)

In [None]:
# Column dtypes of the train subset
train.dtypes

In [None]:
# Preview the first rows of the train subset
train.head()

In [None]:
# Datasets are split into 50%, 30%, and 20%. The data was cleaned beforehand, so there are
# no missing values/nulls and outliers have been removed.
# NOTE(review): presumably prints the size of each subset — confirm in wrangle.print_train.
w.print_train(train, validate, test)

## Exploratory Analysis

In [None]:
# Pairplot of the train features, coloured by county, to judge which features
# best represent the data and the questions asked.
sns.pairplot(train, hue='county')
# plt.title() would label only the last subplot of the grid; suptitle labels the
# whole figure. y > 1 lifts the title above the top row of axes.
plt.suptitle('Pairplot of Zillow features', y=1.02)
plt.show()

In [None]:
# Visualize the pairwise correlations of the train columns with a heat map
plt.figure(figsize=(25,20))
# Correlate numeric columns only: DataFrame.corr() raises on non-numeric columns
# (such as a text county label) in pandas >= 2.0, and silently dropped them before.
sns.heatmap(train.select_dtypes('number').corr(), cmap='Blues', center=0, annot=True)

plt.show()

In [None]:
# Pairwise correlation table for the numeric train columns.
# Non-numeric columns are excluded explicitly because DataFrame.corr() raises on
# them in pandas >= 2.0.
train_correlation = train.select_dtypes('number').corr()
train_correlation

In [None]:
#This is a scatter plot that shows how the counties appear on a map using longitude and latitude lines
def california_county(train):
    """Scatter the observations by longitude/latitude, coloured by county.

    Args:
        train: DataFrame with ``longitude``, ``latitude`` and ``county`` columns.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(7, 5))
    # Draw on the axes created above rather than relying on the implicit current axes.
    sns.scatterplot(data=train, x='longitude', y='latitude',
                    hue='county', zorder=1, ax=ax)
    # show() belongs inside the function so it runs after the plot is drawn,
    # not once when the function is defined.
    plt.show()

In [None]:
# Plot the train observations by longitude/latitude, coloured by county
california_county(train)

In [None]:
# Columns passed as `features_to_scale` to the scaler visualisations
to_scale = ['bedrooms', 'bathrooms', 'sqft', 'lot_size', 'longitude','latitude']

In [None]:
def visualize_scaler(scaler, df, features_to_scale, bins=10):
    """Plot each feature's histogram before and after scaling, side by side.

    Args:
        scaler: Object exposing ``fit_transform`` (e.g. a scikit-learn scaler
            instance). It is fitted on ``df[features_to_scale]``.
        df: DataFrame containing the columns in ``features_to_scale``. It is
            not modified; scaling is applied to a copy.
        features_to_scale: Column names to scale; one subplot row per name.
        bins: Number of histogram bins.

    Returns:
        None. The figure is left on the current pyplot state for display.
    """
    # squeeze=False keeps `axs` two-dimensional even for a single feature, so
    # every row always unpacks into (before, after) axes.
    fig, axs = plt.subplots(len(features_to_scale), 2, figsize=(12, 12),
                            squeeze=False)

    # Scale a copy so the caller's frame keeps its original values.
    df_scaled = df.copy()
    df_scaled[features_to_scale] = scaler.fit_transform(df[features_to_scale])

    # Scalers have no `.feature` attribute; the class name identifies the scaler.
    scaler_name = type(scaler).__name__

    # One row per feature: raw values on the left, scaled values on the right.
    for (ax1, ax2), feature in zip(axs, features_to_scale):
        ax1.hist(df[feature], bins=bins)
        ax1.set(title=f'{feature} before scaling', xlabel=feature, ylabel='count')
        ax2.hist(df_scaled[feature], bins=bins)
        ax2.set(title=f'{feature} after scaling with {scaler_name}', xlabel=feature, ylabel='count')
    plt.tight_layout()

In [None]:
# Before/after histograms (50 bins) of the `to_scale` columns using MinMaxScaler
visualize_scaler(scaler=MinMaxScaler(), 
                 df=train, 
                 features_to_scale=to_scale, 
                 bins=50)

In [None]:
# Before/after histograms (50 bins) of the `to_scale` columns using StandardScaler
visualize_scaler(scaler=StandardScaler(), 
                 df=train, 
                 features_to_scale=to_scale, 
                 bins=50)

In [None]:
# Before/after histograms (50 bins) of the `to_scale` columns using RobustScaler
visualize_scaler(scaler=RobustScaler(), 
                 df=train, 
                 features_to_scale=to_scale, 
                 bins=50)

In [None]:
# Before/after histograms (50 bins) of the `to_scale` columns using QuantileTransformer
visualize_scaler(scaler=QuantileTransformer(), 
                 df=train,
                 features_to_scale=to_scale, 
                 bins=50)

In [None]:
# train_county = train.county
# validate_county = validate.county
# test_county = test.county


In [None]:
# train, validate, test = w.scaled_data(train, validate, test)

In [None]:
# Split each subset into X and y parts via the wrangle helper.
# NOTE(review): presumably X holds the features and y the tax_value target — confirm in wrangle.x_y_split.
X_train, y_train, X_validate,y_validate,X_test,y_test = w.x_y_split(train, validate, test)

In [None]:
# Preview the first rows of the train X frame
X_train.head()

In [None]:
def plot_variable_pairs():
    """Draw a regression plot of ``tax_value`` against each feature, per county.

    Reads the notebook-level ``train`` DataFrame and plots a random sample of
    at most 1000 rows. One seaborn ``lmplot`` figure is produced per feature,
    with points coloured by ``county`` and the fitted lines drawn in red.

    Returns:
        None.
    """
    # Plot a sample to keep the regression fits fast; cap n at the frame size
    # so a small frame does not make `sample` raise.
    train_sample = train.sample(n=min(1000, len(train)))
    features = ['bedrooms', 'bathrooms', 'sqft', 'lot_size', 'longitude', 'latitude']
    for feature in features:
        # `lmplot` has no `featue` keyword; the county grouping comes from `hue`.
        sns.lmplot(x=feature, y="tax_value", data=train_sample, hue='county',
                   line_kws={'color': 'red'})


plot_variable_pairs()

In [None]:
def plot_categorical_and_continuous_vars():
    """Plot ``tax_value`` against each integer column in a 2x2 grid of chart types.

    Reads the notebook-level ``train`` DataFrame and uses a random sample of at
    most 1000 rows. For every integer-typed column other than ``tax_value`` it
    draws a box plot, bar plot, violin plot and scatter plot, each split by
    ``county``.

    Returns:
        None. Each figure is rendered with ``plt.show()``.
    """
    # Plot a sample to keep the charts fast; cap n at the frame size so a small
    # frame does not make `sample` raise.
    train_sample = train.sample(n=min(1000, len(train)))

    # Use the column labels explicitly and drop the target so it is never
    # plotted against itself (a no-op when tax_value is not integer-typed).
    features = train.select_dtypes('int').columns.drop('tax_value', errors='ignore')

    # The figure size is a global seaborn/matplotlib setting; set it once.
    sns.set(rc={'figure.figsize': (20, 10)})

    for feature in features:
        fig, axes = plt.subplots(2, 2)

        # Each axes-level seaborn plot is routed to its own cell of the grid via `ax=`.
        sns.boxplot(x=feature, y="tax_value", data=train_sample, hue='county', ax=axes[0, 0])
        axes[0, 0].set_title('boxplot')

        sns.barplot(x=feature, y="tax_value", data=train_sample, hue='county', ax=axes[0, 1])
        axes[0, 1].set_title('barplot')

        sns.violinplot(x=feature, y="tax_value", data=train_sample, hue='county', ax=axes[1, 0])
        axes[1, 0].set_title('violinplot')

        sns.scatterplot(x=feature, y="tax_value", data=train_sample, hue='county', ax=axes[1, 1])
        axes[1, 1].set_title('scatterplot')

        plt.tight_layout()
        plt.show()


plot_categorical_and_continuous_vars()

# ax = axs[0,0]
# , ax = axs[0,1]
# , ax = axs[1,0]
# , ax = axes[1,1]

In [None]:
# Re-run the categorical/continuous plots (the function is also invoked in the cell that defines it)
plot_categorical_and_continuous_vars()