In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np
import acquire

# Exploring
import scipy.stats as stats

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
zillow = acquire.get_zillow_data()

In [3]:
# Get a peak of the dataframe
zillow.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77574 entries, 0 to 77573
Data columns (total 68 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            77574 non-null  int64  
 1   parcelid                      77574 non-null  int64  
 2   airconditioningtypeid         25006 non-null  float64
 3   architecturalstyletypeid      206 non-null    float64
 4   basementsqft                  50 non-null     float64
 5   bathroomcnt                   77574 non-null  float64
 6   bedroomcnt                    77574 non-null  float64
 7   buildingclasstypeid           15 non-null     float64
 8   buildingqualitytypeid         49808 non-null  float64
 9   calculatedbathnbr             76959 non-null  float64
 10  decktypeid                    614 non-null    float64
 11  finishedfloor1squarefeet      6035 non-null   float64
 12  calculatedfinishedsquarefeet  77374 non-null  float64
 13  f

In [4]:
# Create a function that will remove rows and columns that have missing values past a certain threshold.
def handle_missing_values(df, p_row = 0.54, p_col = 0.54):
    ''' function which takes in a dataframe, required notnull proportions of non-null rows and columns.
    drop the columns and rows columns based on theshold:'''
    
    #drop columns with nulls
    threshold = int(p_col * len(df.index)) # Require that many non-NA values.
    df.dropna(axis = 1, thresh = threshold, inplace = True)
    
    #drop rows with nulls
    threshold = int(p_row * len(df.columns)) # Require that many non-NA values.
    df.dropna(axis = 0, thresh = threshold, inplace = True)
    
    
    return df

In [5]:
b = handle_missing_values(zillow)

In [6]:
b.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77574 entries, 0 to 77573
Data columns (total 34 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            77574 non-null  int64  
 1   parcelid                      77574 non-null  int64  
 2   bathroomcnt                   77574 non-null  float64
 3   bedroomcnt                    77574 non-null  float64
 4   buildingqualitytypeid         49808 non-null  float64
 5   calculatedbathnbr             76959 non-null  float64
 6   calculatedfinishedsquarefeet  77374 non-null  float64
 7   finishedsquarefeet12          73919 non-null  float64
 8   fips                          77574 non-null  float64
 9   fullbathcnt                   76959 non-null  float64
 10  heatingorsystemtypeid         49569 non-null  float64
 11  latitude                      77574 non-null  float64
 12  longitude                     77574 non-null  float64
 13  l

In [7]:
# Take a look at the amount of missing values.
b.isnull().sum()

id                                  0
parcelid                            0
bathroomcnt                         0
bedroomcnt                          0
buildingqualitytypeid           27766
calculatedbathnbr                 615
calculatedfinishedsquarefeet      200
finishedsquarefeet12             3655
fips                                0
fullbathcnt                       615
heatingorsystemtypeid           28005
latitude                            0
longitude                           0
lotsizesquarefeet                8257
propertycountylandusecode           0
propertylandusetypeid               0
propertyzoningdesc              27100
rawcensustractandblock              0
regionidcity                     1472
regionidcounty                      0
regionidzip                        50
roomcnt                             0
unitcnt                         26872
yearbuilt                         269
structuretaxvaluedollarcnt        115
taxvaluedollarcnt                   1
assessmentye

In [8]:
# Create a list of columns to drop.
columns_to_drop = ['calculatedbathnbr','calculatedfinishedsquarefeet','finishedsquarefeet12','fullbathcnt','propertycountylandusecode','propertylandusetypeid','rawcensustractandblock','regionidcity','regionidcounty','regionidzip','structuretaxvaluedollarcnt','censustractandblock','propertylandusedesc']

In [9]:
def drop_columns(df, drop_col):
    df = df.drop(columns=drop_col)
    return df

In [10]:
b = drop_columns(b, columns_to_drop)

In [11]:
b.isna().sum()

id                           0
parcelid                     0
bathroomcnt                  0
bedroomcnt                   0
buildingqualitytypeid    27766
fips                         0
heatingorsystemtypeid    28005
latitude                     0
longitude                    0
lotsizesquarefeet         8257
propertyzoningdesc       27100
roomcnt                      0
unitcnt                  26872
yearbuilt                  269
taxvaluedollarcnt            1
assessmentyear               0
landtaxvaluedollarcnt        2
taxamount                    5
logerror                     0
transactiondate              0
heatingorsystemdesc      28005
dtype: int64

In [12]:
for col in b.columns:
    if b[col].isna().sum() > 0:
        b[col] = b[col].fillna(value = b[col].mean())
        print(b[col])

0        6.53383
1        6.53383
2        6.53383
3        8.00000
4        8.00000
          ...   
77569    8.00000
77570    6.00000
77571    6.53383
77572    4.00000
77573    6.00000
Name: buildingqualitytypeid, Length: 77574, dtype: float64
0        3.921645
1        3.921645
2        3.921645
3        2.000000
4        2.000000
           ...   
77569    2.000000
77570    2.000000
77571    3.921645
77572    2.000000
77573    2.000000
Name: heatingorsystemtypeid, Length: 77574, dtype: float64
0          4506.0
1         12647.0
2          8432.0
3         13038.0
4        278581.0
           ...   
77569     59487.0
77570     47405.0
77571     12105.0
77572      5074.0
77573      6347.0
Name: lotsizesquarefeet, Length: 77574, dtype: float64


TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [None]:
b

This is a linear regression problem. I must create a model that will properly predict the logerror the zestimate will return.



# Create a function that will take in a list of column names as an argument.
-> The function will iterate through the list and return a list of every possible triplet combination.
-> The function will then iterate through each combination and use inertia to determine the optimal k value for centroids.  
-> Use the k value to create a column that stores the predicted values of that cluster and append it to the X_train_scaled dataframe.
-> Now run the recursive function elimination with all the new features added and go with the top 10.



In [13]:
import math

In [None]:
def create_max_combo_triplet(features):
    triplets = list(combinations(features,3))
    return triplets


def find_k(X_train, cluster_vars, k_range):
    sse = []
    for k in k_range:
        kmeans = KMeans(n_clusters=k)

        # X[0] is our X_train dataframe..the first dataframe in the list of dataframes stored in X. 
        kmeans.fit(X_train[cluster_vars])

        # inertia: Sum of squared distances of samples to their closest cluster center.
        sse.append(kmeans.inertia_) 

    # compute the difference from one k to the next
    delta = [round(sse[i] - sse[i+1],0) for i in range(len(sse)-1)]

    # compute the percent difference from one k to the next
    pct_delta = [round(((sse[i] - sse[i+1])/sse[i])*100, 1) for i in range(len(sse)-1)]

    # create a dataframe with all of our metrics to compare them across values of k: SSE, delta, pct_delta
    k_comparisons_df = pd.DataFrame(dict(k=k_range[0:-1], 
                             sse=sse[0:-1], 
                             delta=delta, 
                             pct_delta=pct_delta))

    # plot k with inertia
    plt.plot(k_comparisons_df.k, k_comparisons_df.sse, 'bx-')
    plt.xlabel('k')
    plt.ylabel('SSE')
    plt.title('The Elbow Method to find the optimal k\nFor which k values do we see large decreases in SSE?')
    plt.show()

    # plot k with pct_delta
    plt.plot(k_comparisons_df.k, k_comparisons_df.pct_delta, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Percent Change')
    plt.title('For which k values are we seeing increased changes (%) in SSE?')
    plt.show()

    # plot k with delta
    plt.plot(k_comparisons_df.k, k_comparisons_df.delta, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Absolute Change in SSE')
    plt.title('For which k values are we seeing increased changes (absolute) in SSE?')
    plt.show()

    return k_comparisons_df


def create_clusters(X_train, k, cluster_vars):
    # create kmean object
    kmeans = KMeans(n_clusters=k, random_state = 13)

    # fit to train and assign cluster ids to observations
    kmeans.fit(X_train[cluster_vars])

    return kmeans


# get the centroids for each distinct cluster...

def get_centroids(kmeans, cluster_vars, cluster_name):
    # get the centroids for each distinct cluster...

    centroid_col_names = ['centroid_' + i for i in cluster_vars]

    centroid_df = pd.DataFrame(kmeans.cluster_centers_, 
                               columns=centroid_col_names).reset_index().rename(columns={'index': cluster_name})

    return centroid_df


# label cluster for each observation in X_train (X[0] in our X list of dataframes), 
# X_validate (X[1]), & X_test (X[2])

def assign_clusters(kmeans, cluster_vars, cluster_name, centroid_df):
    for i in range(len(X)):
        clusters = pd.DataFrame(kmeans.predict(X[i][cluster_vars]), 
                            columns=[cluster_name], index=X[i].index)

        clusters_centroids = clusters.merge(centroid_df, on=cluster_name, copy=False).set_index(clusters.index.values)

        X[i] = pd.concat([X[i], clusters_centroids], axis=1)
    return X





In [None]:
x

In [18]:
list_of_first_ten_features

NameError: name 'list_of_first_ten_features' is not defined