In [1]:
#Import libraries
# Standard Packages
import pandas as pd
import numpy as np

# Viz Packages
import seaborn as sns
import matplotlib.pyplot as plt

# Scipy Stats
import scipy.stats as stats 

# Statsmodel Api
import statsmodels.api as sm
from statsmodels.formula.api import ols

# SKLearn Modules
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

# Suppress future and deprecation warnings
import warnings
warnings.filterwarnings("ignore", category= FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [2]:
#Import housing data
kc_house_df = pd.read_csv('/Users/Aidan/Documents/Flatiron/Phase_2/King-County-House-Sales-/data/kc_house_data.csv')

kc_house_df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,...,sewer_system,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,yr_renovated,address,lat,long
0,7399300360,5/24/2022,675000.0,4,1.0,1180,7140,1.0,NO,NO,...,PUBLIC,1180,0,0,40,1969,0,"2102 Southeast 21st Court, Renton, Washington ...",47.461975,-122.19052
1,8910500230,12/13/2021,920000.0,5,2.5,2770,6703,1.0,NO,NO,...,PUBLIC,1570,1570,0,240,1950,0,"11231 Greenwood Avenue North, Seattle, Washing...",47.711525,-122.35591
2,1180000275,9/29/2021,311000.0,6,2.0,2880,6156,1.0,NO,NO,...,PUBLIC,1580,1580,0,0,1956,0,"8504 South 113th Street, Seattle, Washington 9...",47.502045,-122.2252
3,1604601802,12/14/2021,775000.0,3,3.0,2160,1400,2.0,NO,NO,...,PUBLIC,1090,1070,200,270,2010,0,"4079 Letitia Avenue South, Seattle, Washington...",47.56611,-122.2902
4,8562780790,8/24/2021,592500.0,2,2.0,1120,758,2.0,NO,NO,...,PUBLIC,1120,550,550,30,2012,0,"2193 Northwest Talus Drive, Issaquah, Washingt...",47.53247,-122.07188


In [3]:
#Look at columns
kc_house_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30155 entries, 0 to 30154
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             30155 non-null  int64  
 1   date           30155 non-null  object 
 2   price          30155 non-null  float64
 3   bedrooms       30155 non-null  int64  
 4   bathrooms      30155 non-null  float64
 5   sqft_living    30155 non-null  int64  
 6   sqft_lot       30155 non-null  int64  
 7   floors         30155 non-null  float64
 8   waterfront     30155 non-null  object 
 9   greenbelt      30155 non-null  object 
 10  nuisance       30155 non-null  object 
 11  view           30155 non-null  object 
 12  condition      30155 non-null  object 
 13  grade          30155 non-null  object 
 14  heat_source    30123 non-null  object 
 15  sewer_system   30141 non-null  object 
 16  sqft_above     30155 non-null  int64  
 17  sqft_basement  30155 non-null  int64  
 18  sqft_g

In [4]:
#Looking at bedrooms
kc_house_df.bedrooms.value_counts()

3     12754
4      9597
2      3936
5      2798
6       498
1       391
7        80
0        44
8        38
9        14
10        3
13        1
11        1
Name: bedrooms, dtype: int64

In [5]:
#Looking at rows where bedroom is zero. Looks like 0 is a placeholder for missing values.
kc_house_df[kc_house_df.bedrooms == 0].head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,...,sewer_system,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,yr_renovated,address,lat,long
573,3920030050,5/19/2022,930000.0,0,0.0,1617,2156,3.0,NO,NO,...,PUBLIC,2156,0,0,0,2009,0,"6019 Roosevelt Way Northeast, Seattle, Washing...",47.6727,-122.31781
1289,2768301406,3/2/2022,1090000.0,0,0.0,1500,1262,3.0,NO,NO,...,PUBLIC,1500,0,0,0,2021,0,"A, Leadwood, Missouri 63653, United States",37.85979,-90.58113
1310,3462800015,11/10/2021,360000.0,0,0.0,910,19000,1.0,NO,NO,...,PRIVATE,910,0,0,250,1946,0,"26125 79th Avenue South, Kent, Washington 9803...",47.366765,-122.23543
1952,2020069042,9/27/2021,399990.0,0,0.0,1677,43264,1.0,NO,NO,...,PUBLIC,1677,0,0,0,1969,0,"43407 212th Avenue Southeast, Enumclaw, Washin...",47.21166,-122.0592
2044,6896300047,9/17/2021,509000.0,0,1.0,400,2385,1.0,NO,NO,...,PUBLIC,400,0,0,0,1946,0,"8416 B Island Dr S, Seattle, Washington 98118,...",47.52803,-122.26129


Overview
A one-paragraph overview of the project, including the business problem, data, methods, results and recommendations.



Business Problem
Summary of the business problem you are trying to solve, and the data questions that you plan to answer to solve them.

Questions to consider:

Who are your stakeholders?
What are your stakeholders' pain points related to this project?
Why are your predictions important from a business perspective?


Data Understanding
Describe the data being used for this project.

- King County, WA housing data
    - min 4+ bedrooms (2 single parents and their kids)
    - near green space (greenbelt feature only has 773 yes)
    - nuisance =  No has ~ 25,000
    - 
- School district data?
- Crime statistics by zip code?

Questions to consider:

Where did the data come from, and how do they relate to the data analysis questions?
What do the data represent? Who is in the sample and what variables are included?
What is the target variable?
What are the properties of the variables you intend to use?

In [6]:
#Import data
schools_df = pd.read_csv('/Users/Aidan/Documents/Flatiron/Phase_2/King-County-House-Sales-/data/Report_Card_Assessment_Data_2021-22_School_Year.csv')

schools_df.head(100)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,SchoolYear,OrganizationLevel,County,ESDName,ESDOrganizationId,DistrictCode,DistrictName,DistrictOrganizationId,SchoolCode,SchoolName,...,Count of students expected to test including previously passed,CountMetStandard,PercentMetStandard,PercentLevel1,PercentLevel2,PercentLevel3,PercentLevel4,PercentMetTestedOnly,PercentNoScore,DataAsOf
0,2021-22,District,Grays Harbor,Capital Region ESD 113,100004.0,14005.0,Aberdeen School District,100010.0,,District Total,...,176.0,54.0,30.7%,0.460227,0.221591,0.198864,0.107955,0.310345,0.011364,9/7/2022
1,2021-22,District,Grays Harbor,Capital Region ESD 113,100004.0,14005.0,Aberdeen School District,100010.0,,District Total,...,107.0,35.0,32.7%,0.420561,0.224299,0.196262,0.130841,0.336538,0.028037,9/7/2022
2,2021-22,District,Grays Harbor,Capital Region ESD 113,100004.0,14005.0,Aberdeen School District,100010.0,,District Total,...,111.0,39.0,35.1%,0.441441,0.198198,0.198198,0.153153,0.354545,0.009009,9/7/2022
3,2021-22,District,Grays Harbor,Capital Region ESD 113,100004.0,14005.0,Aberdeen School District,100010.0,,District Total,...,,,Suppressed: N<10,,,,,,,9/7/2022
4,2021-22,District,Grays Harbor,Capital Region ESD 113,100004.0,14005.0,Aberdeen School District,100010.0,,District Total,...,,,Suppressed: N<10,,,,,,,9/7/2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2021-22,District,Lewis,Capital Region ESD 113,100004.0,21226.0,Adna School District,100011.0,,District Total,...,,,Suppressed: N<10,,,,,,,9/7/2022
96,2021-22,District,Lewis,Capital Region ESD 113,100004.0,21226.0,Adna School District,100011.0,,District Total,...,,,Suppressed: N<10,,,,,,,9/7/2022
97,2021-22,District,Lewis,Capital Region ESD 113,100004.0,21226.0,Adna School District,100011.0,,District Total,...,,,Suppressed: N<10,,,,,,,9/7/2022
98,2021-22,District,Lewis,Capital Region ESD 113,100004.0,21226.0,Adna School District,100011.0,,District Total,...,,,Suppressed: N<10,,,,,,,9/7/2022


In [7]:
king_school_df = schools_df[schools_df.County == "King"]

king_school_df.head()

Unnamed: 0,SchoolYear,OrganizationLevel,County,ESDName,ESDOrganizationId,DistrictCode,DistrictName,DistrictOrganizationId,SchoolCode,SchoolName,...,Count of students expected to test including previously passed,CountMetStandard,PercentMetStandard,PercentLevel1,PercentLevel2,PercentLevel3,PercentLevel4,PercentMetTestedOnly,PercentNoScore,DataAsOf
311,2021-22,District,King,Puget Sound Educational Service District 121,100006.0,17408.0,Auburn School District,100016.0,,District Total,...,1234.0,582.0,47.2%,0.306321,0.188817,0.26094,0.210697,0.487846,0.033225,9/7/2022
312,2021-22,District,King,Puget Sound Educational Service District 121,100006.0,17408.0,Auburn School District,100016.0,,District Total,...,299.0,46.0,15.4%,0.632107,0.180602,0.140468,0.013378,0.15917,0.033445,9/7/2022
313,2021-22,District,King,Puget Sound Educational Service District 121,100006.0,17408.0,Auburn School District,100016.0,,District Total,...,935.0,536.0,57.3%,0.202139,0.191444,0.299465,0.273797,0.59292,0.033155,9/7/2022
314,2021-22,District,King,Puget Sound Educational Service District 121,100006.0,17408.0,Auburn School District,100016.0,,District Total,...,,,Suppressed: N<10,,,,,,,9/7/2022
315,2021-22,District,King,Puget Sound Educational Service District 121,100006.0,17408.0,Auburn School District,100016.0,,District Total,...,407.0,271.0,66.6%,0.144963,0.144963,0.307125,0.358722,0.696658,0.044226,9/7/2022


In [8]:
ex_df = king_school_df[king_school_df['DistrictName'] == 'Auburn School District']

In [9]:
ex_df.drop(columns = ['SchoolYear','OrganizationLevel','DataAsOf', 'County','ESDOrganizationId','SchoolCode',\
                     'DistrictOrganizationId', 'TestAdministration','CurrentSchoolType', 'SchoolOrganizationId']\
           , inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [13]:
ex_df[(ex_df.StudentGroup == "All Students") & (ex_df.GradeLevel == "All Grades")]

Unnamed: 0,ESDName,DistrictCode,DistrictName,SchoolName,StudentGroupType,StudentGroup,GradeLevel,TestSubject,Suppression,Count of Students Expected to Test,Count of students expected to test including previously passed,CountMetStandard,PercentMetStandard,PercentLevel1,PercentLevel2,PercentLevel3,PercentLevel4,PercentMetTestedOnly,PercentNoScore
18263,Puget Sound Educational Service District 121,17408.0,Auburn School District,Terminal Park Elementary School,All,All Students,All Grades,Science,,99.0,99.0,51.0,51.5%,0.303030,0.181818,0.191919,0.323232,0.515152,0.000000
18456,Puget Sound Educational Service District 121,17408.0,Auburn School District,Hazelwood Elementary School,All,All Students,All Grades,Science,,67.0,67.0,40.0,59.7%,0.208955,0.179104,0.477612,0.119403,0.606061,0.014925
18503,Puget Sound Educational Service District 121,17408.0,Auburn School District,Ilalko Elementary School,All,All Students,All Grades,ELA,,233.0,233.0,107.0,45.9%,0.330472,0.201717,0.236052,0.223176,0.463203,0.008584
18521,Puget Sound Educational Service District 121,17408.0,Auburn School District,Ilalko Elementary School,All,All Students,All Grades,Math,,233.0,233.0,104.0,44.6%,0.283262,0.261803,0.236052,0.210300,0.450216,0.008584
18529,Puget Sound Educational Service District 121,17408.0,Auburn School District,Auburn Riverside High School,All,All Students,All Grades,ELA,N<10,,,,Suppressed: N<10,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
735906,Puget Sound Educational Service District 121,17408.0,Auburn School District,District Total,All,All Students,All Grades,ELA,,8799.0,8799.0,3839.0,43.6%,0.297079,0.219116,0.262984,0.173315,0.458060,0.047505
735909,Puget Sound Educational Service District 121,17408.0,Auburn School District,District Total,All,All Students,All Grades,Math,,8838.0,8838.0,2673.0,30.2%,0.399977,0.243607,0.169156,0.133288,0.319699,0.053971
735911,Puget Sound Educational Service District 121,17408.0,Auburn School District,District Total,All,All Students,All Grades,Science,,3810.0,3810.0,1339.0,35.1%,0.338320,0.205774,0.259318,0.092126,0.392438,0.104462
745266,Puget Sound Educational Service District 121,17408.0,Auburn School District,Bowman Creek Elementary,All,All Students,All Grades,Math,N<10,,,,Suppressed: N<10,,,,,,


In [None]:
king_school_df.DistrictName.value_counts()

In [None]:
len(king_school_df)

In [None]:
king_school_df.columns

In [None]:
king_school_df.StudentGroup.value_counts()

In [None]:
subset_king_school_df = king_school_df[['DistrictName','SchoolName', 'StudentGroup','PercentMetStandard']]

In [None]:
subset_king_school_df[king_school_df['StudentGroup'] == "All Students"]

## Data Preparation and Cleaning

### Housing Data Set

In [None]:
#A reminder of what our data looks like
kc_house_df.head()

Let's create the subset of interest to our stakeholders:
- min 4+ bedrooms (2 single mothers and their kids)
- nuisance == No

In [None]:
#Let's create a subset that fit the above criteria
kc_family_house_df = kc_house_df[(kc_house_df['bedrooms'] >= 4) & (kc_house_df['nuisance'] == "NO")]

#Reset the index
kc_family_house_df.reset_index(drop=True,inplace= True)

#Preview first 5 rows of subset
kc_family_house_df.head()

Let's split the address into two new columns- city and zip code.

In [None]:
# Example of an address
kc_family_house_df['address'][0]

In [None]:
#Separate the Zip Code from the address
kc_family_house_df["Zip Code"] = kc_family_house_df['address'].apply(lambda x: x.split()[-3][:-1])

In [None]:
#Preview our new column
kc_family_house_df["Zip Code"].head()

In [None]:
kc_family_house_df['address'][2]

In [None]:
kc_family_house_df.head()

## SCHOOL DISTRICT ZIP CODES

Information taken from https://statisticalatlas.com/county/Washington/King-County/Overview

In [None]:
King_County_WA_zipcodes = [98001, 98002, 98003, 98004, 98005, 98006, 98007, 98008, 98010, \
    98011, 98014, 98019, 98021, 98022, 98023, 98024, 98027, 98028, \
    98029, 98030, 98031, 98032, 98033, 98034, 98038, 98039, 98040, \
    98042, 98043, 98045, 98047, 98050, 98051, 98052, 98053, 98055, \
    98056, 98057, 98058, 98059, 98065, 98068, 98070, 98072, 98074, \
    98075, 98077, 98092, 98101, 98102, 98103, 98104, 98105, 98106, \
    98107, 98108, 98109, 98112, 98115, 98116, 98117, 98118, 98119, \
    98121, 98122, 98125, 98126, 98133, 98134, 98136, 98144, 98146, \
    98148, 98154, 98155, 98158, 98164, 98166, 98168, 98174, 98177, \
    98178, 98188, 98195, 98198, 98199, 98224, 98288, 98323, 98354, 98391]

In [None]:
Seattle_School_District_zipcodes = [98101, 98102, 98103, 98104, 98105, 98106, 98107, 98108,\
                                    98109, 98112, 98115, 98116, 98117, 98118, 98119, 98121,\
                                    98122, 98125, 98126, 98133, 98134, 98136, 98144, 98146,\
                                    98154, 98164, 98168, 98174, 98177, 98178, 98195, 98199]  

Lake_Washington_School District_zipcodes = [98004, 98005, 98007, 98008, 98011, 98033, 98034,\
                                            98039, 98052, 98053, 98072, 98074, 98075, 98077]

Kent_School_District_zipcodes = [98001, 98002, 98010, 98030, 98031, 98032, 98038, 98042, 98055,\
                                 98058, 98092, 98188, 98198]

Federal_Way_School_District_zipcodes = [98001, 98003, 98023, 98032, 98198, 98354]

Highline_School_District_zipcodes = [98032, 98106, 98108, 98126, 98146, 98148, 98158, 98166, 98168,\
                                     98188, 98198]

Northshore_School_District_zipcodes = [98011, 98012, 98021, 98028, 98034, 98036, 98052, 98072, 98077,\
                                       98155, 98296]

Renton School_District_zipcodes = [98006, 98031, 98032, 98055, 98056, 98057, 98058, 98059, 98168, 98178, 98188]

Bellevue_School_District_zipcodes = [98004, 98005, 98006, 98007, 98008, 98027, 98033, 98039, 98052, 98056, 98059]

Auburn_School_District_zipcodes = [98001, 98002, 98010, 98022, 98030, 98032, 98042, 98047, 98092, 98391]

Issaquah_School_District_zipcodes = [98006, 98008, 98024, 98027, 98029, 98038, 98050, 98056, 98058, 98059,\
                                     98065, 98074, 98075]

Shoreline_School_District_zipcodes = [98043, 98133, 98155, 98177]

Snoqualmie_Valley_School_District_zipcodes = [98014, 98024, 98027, 98038, 98045, 98053, 98065, 98068, 98074,\
                                              98075, 98224]

Tahoma_School_District_zipcodes = [98010, 98027, 98038, 98042, 98045, 98051, 98058, 98059, 98065]

Enumclaw_School_District_zipcodes = [98010, 98022, 98038, 98045, 98051, 98092, 98323, 98391]

Tukwila_School_District_zipcodes = [98168, 98178, 98188]

Riverview_School_District_zipcodes = [98014, 98019, 98024, 98053, 98065, 98077, 98224]

Mercer_Island_School_District_zipcodes = [98004, 98006, 98040]

Vashon_Island_School_District_zipcodes = [98070]

Skykomish_School_District_zipcodes = [98045, 98068, 98224, 98288]

Fife_School_District_zipcodes = [98001, 98003, 98354, 98371, 98372, 98421, 98422, 98424]

Not included in School Districts:<br>
- Rainier_Prep_Charter_School_District_zipcodes  
- Summit_Public_School_Atlas_zipcodes  
- Rainier_Valley_Leadership_Academy_zipcodes  
- Muckleshoot_Indian_Tribe_zipcodes  
- Impact_Puget_Sound _Elementary_zipcodes  
- Summit_Public_School_Sierra_zipcodes  
- Renton_Technical_College_zipcodes  
- Lake_Washington_Institute_of_Technology_zipcodes  