In [1]:
# Standard Packages
import pandas as pd
import numpy as np

# Viz Packages
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Scipy Stats
import scipy.stats as stats 

# Statsmodel Api
import statsmodels.api as sm
from statsmodels.formula.api import ols

# SKLearn Modules
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

In [2]:
# Opening up our Kings County Housing Prices dataset
df = pd.read_csv('/Users/aheinke/Documents/Flatiron/NYC-DS-010923/Phase_2/Phase2_Final_Proj/kc_house_data.csv')
df

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,...,sewer_system,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,yr_renovated,address,lat,long
0,7399300360,5/24/2022,675000.0,4,1.0,1180,7140,1.0,NO,NO,...,PUBLIC,1180,0,0,40,1969,0,"2102 Southeast 21st Court, Renton, Washington ...",47.461975,-122.19052
1,8910500230,12/13/2021,920000.0,5,2.5,2770,6703,1.0,NO,NO,...,PUBLIC,1570,1570,0,240,1950,0,"11231 Greenwood Avenue North, Seattle, Washing...",47.711525,-122.35591
2,1180000275,9/29/2021,311000.0,6,2.0,2880,6156,1.0,NO,NO,...,PUBLIC,1580,1580,0,0,1956,0,"8504 South 113th Street, Seattle, Washington 9...",47.502045,-122.22520
3,1604601802,12/14/2021,775000.0,3,3.0,2160,1400,2.0,NO,NO,...,PUBLIC,1090,1070,200,270,2010,0,"4079 Letitia Avenue South, Seattle, Washington...",47.566110,-122.29020
4,8562780790,8/24/2021,592500.0,2,2.0,1120,758,2.0,NO,NO,...,PUBLIC,1120,550,550,30,2012,0,"2193 Northwest Talus Drive, Issaquah, Washingt...",47.532470,-122.07188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30150,7834800180,11/30/2021,1555000.0,5,2.0,1910,4000,1.5,NO,NO,...,PUBLIC,1600,1130,0,210,1921,0,"4673 Eastern Avenue North, Seattle, Washington...",47.664740,-122.32940
30151,194000695,6/16/2021,1313000.0,3,2.0,2020,5800,2.0,NO,NO,...,PUBLIC,2020,0,0,520,2011,0,"4131 44th Avenue Southwest, Seattle, Washingto...",47.565610,-122.38851
30152,7960100080,5/27/2022,800000.0,3,2.0,1620,3600,1.0,NO,NO,...,PUBLIC,940,920,240,110,1995,0,"910 Martin Luther King Jr Way, Seattle, Washin...",47.610395,-122.29585
30153,2781280080,2/24/2022,775000.0,3,2.5,2570,2889,2.0,NO,NO,...,PUBLIC,1830,740,480,100,2006,0,"17127 114th Avenue Southeast, Renton, Washingt...",47.449490,-122.18908


In [3]:
# Taking a look at 
df.describe()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,yr_renovated,lat,long
count,30155.0,30155.0,30155.0,30155.0,30155.0,30155.0,30155.0,30155.0,30155.0,30155.0,30155.0,30155.0,30155.0,30155.0,30155.0
mean,4538104000.0,1108536.0,3.41353,2.334737,2112.424739,16723.6,1.543492,1809.826098,476.039396,330.211142,217.412038,1975.163953,90.922301,47.328076,-121.317397
std,2882587000.0,896385.7,0.981612,0.889556,974.044318,60382.6,0.567717,878.306131,579.631302,285.770536,245.302792,32.067362,416.473038,1.434005,5.725475
min,1000055.0,27360.0,0.0,0.0,3.0,402.0,1.0,2.0,0.0,0.0,0.0,1900.0,0.0,21.27424,-157.79148
25%,2064175000.0,648000.0,3.0,2.0,1420.0,4850.0,1.0,1180.0,0.0,0.0,40.0,1953.0,0.0,47.40532,-122.326045
50%,3874011000.0,860000.0,3.0,2.5,1920.0,7480.0,1.5,1560.0,0.0,400.0,150.0,1977.0,0.0,47.55138,-122.225585
75%,7287100000.0,1300000.0,4.0,3.0,2619.5,10579.0,2.0,2270.0,940.0,510.0,320.0,2003.0,0.0,47.669913,-122.116205
max,9904000000.0,30750000.0,13.0,10.5,15360.0,3253932.0,4.0,12660.0,8020.0,3580.0,4370.0,2022.0,2022.0,64.82407,-70.07434


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30155 entries, 0 to 30154
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             30155 non-null  int64  
 1   date           30155 non-null  object 
 2   price          30155 non-null  float64
 3   bedrooms       30155 non-null  int64  
 4   bathrooms      30155 non-null  float64
 5   sqft_living    30155 non-null  int64  
 6   sqft_lot       30155 non-null  int64  
 7   floors         30155 non-null  float64
 8   waterfront     30155 non-null  object 
 9   greenbelt      30155 non-null  object 
 10  nuisance       30155 non-null  object 
 11  view           30155 non-null  object 
 12  condition      30155 non-null  object 
 13  grade          30155 non-null  object 
 14  heat_source    30123 non-null  object 
 15  sewer_system   30141 non-null  object 
 16  sqft_above     30155 non-null  int64  
 17  sqft_basement  30155 non-null  int64  
 18  sqft_g

In [5]:
df['address']

0        2102 Southeast 21st Court, Renton, Washington ...
1        11231 Greenwood Avenue North, Seattle, Washing...
2        8504 South 113th Street, Seattle, Washington 9...
3        4079 Letitia Avenue South, Seattle, Washington...
4        2193 Northwest Talus Drive, Issaquah, Washingt...
                               ...                        
30150    4673 Eastern Avenue North, Seattle, Washington...
30151    4131 44th Avenue Southwest, Seattle, Washingto...
30152    910 Martin Luther King Jr Way, Seattle, Washin...
30153    17127 114th Avenue Southeast, Renton, Washingt...
30154    18615 7th Avenue South, Burien, Washington 981...
Name: address, Length: 30155, dtype: object

In [6]:
df['address'][0]

'2102 Southeast 21st Court, Renton, Washington 98055, United States'

In [7]:
df['nuisance']

0         NO
1        YES
2         NO
3         NO
4        YES
        ... 
30150     NO
30151     NO
30152    YES
30153     NO
30154     NO
Name: nuisance, Length: 30155, dtype: object

In [8]:
df['grade']

0        7 Average
1        7 Average
2        7 Average
3         9 Better
4        7 Average
           ...    
30150       8 Good
30151    7 Average
30152    7 Average
30153       8 Good
30154    7 Average
Name: grade, Length: 30155, dtype: object

In [9]:
df['grade'].value_counts()

7 Average        11697
8 Good            9410
9 Better          3806
6 Low Average     2858
10 Very Good      1371
11 Excellent       406
5 Fair             393
12 Luxury          122
4 Low               51
13 Mansion          24
3 Poor              13
2 Substandard        2
1 Cabin              2
Name: grade, dtype: int64

In [10]:
df['price'].value_counts()

650000.0     346
750000.0     307
800000.0     299
850000.0     296
600000.0     295
            ... 
838355.0       1
903888.0       1
576200.0       1
1081500.0      1
720902.0       1
Name: price, Length: 5202, dtype: int64

In [11]:
schools_df = pd.read_csv('/Users/aheinke/Documents/Flatiron/NYC-DS-010923/Phase_2/Phase2_Final_Proj/Report_Card_Assessment_Data_2021-22_School_year.csv')
schools_df

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,SchoolYear,OrganizationLevel,County,ESDName,ESDOrganizationId,DistrictCode,DistrictName,DistrictOrganizationId,SchoolCode,SchoolName,...,Count of students expected to test including previously passed,CountMetStandard,PercentMetStandard,PercentLevel1,PercentLevel2,PercentLevel3,PercentLevel4,PercentMetTestedOnly,PercentNoScore,DataAsOf
0,2021-22,District,Grays Harbor,Capital Region ESD 113,100004.0,14005.0,Aberdeen School District,100010.0,,District Total,...,176.0,54.0,30.7%,0.460227,0.221591,0.198864,0.107955,0.310345,0.011364,9/7/2022
1,2021-22,District,Grays Harbor,Capital Region ESD 113,100004.0,14005.0,Aberdeen School District,100010.0,,District Total,...,107.0,35.0,32.7%,0.420561,0.224299,0.196262,0.130841,0.336538,0.028037,9/7/2022
2,2021-22,District,Grays Harbor,Capital Region ESD 113,100004.0,14005.0,Aberdeen School District,100010.0,,District Total,...,111.0,39.0,35.1%,0.441441,0.198198,0.198198,0.153153,0.354545,0.009009,9/7/2022
3,2021-22,District,Grays Harbor,Capital Region ESD 113,100004.0,14005.0,Aberdeen School District,100010.0,,District Total,...,,,Suppressed: N<10,,,,,,,9/7/2022
4,2021-22,District,Grays Harbor,Capital Region ESD 113,100004.0,14005.0,Aberdeen School District,100010.0,,District Total,...,,,Suppressed: N<10,,,,,,,9/7/2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
760171,2021-22,State,Multiple,State Total,,,State Total,,,State Total,...,25809.0,8952.0,34.7%,0.343795,0.234414,0.227944,0.118912,0.374953,0.074935,9/7/2022
760172,2021-22,State,Multiple,State Total,,,State Total,,,State Total,...,280558.0,161649.0,57.6%,0.173644,0.196114,0.305588,0.270571,0.609101,0.054083,9/7/2022
760173,2021-22,State,Multiple,State Total,,,State Total,,,State Total,...,289153.0,114128.0,39.5%,0.318903,0.228908,0.190969,0.203724,0.418771,0.057496,9/7/2022
760174,2021-22,State,Multiple,State Total,,,State Total,,,State Total,...,537988.0,203577.0,37.8%,0.328708,0.236754,0.189959,0.188442,0.400907,0.056137,9/7/2022


In [12]:
schools_df.describe()

Unnamed: 0,ESDOrganizationId,DistrictCode,DistrictOrganizationId,SchoolCode,SchoolOrganizationId,Count of Students Expected to Test,Count of students expected to test including previously passed,CountMetStandard,PercentLevel1,PercentLevel2,PercentLevel3,PercentLevel4,PercentMetTestedOnly,PercentNoScore
count,754163.0,749376.0,749376.0,587239.0,587239.0,255154.0,255154.0,255154.0,385285.0,385285.0,385285.0,385285.0,385285.0,385285.0
mean,100055.384047,22339.034248,100219.933789,3599.351349,102542.400409,399.483426,399.511232,178.480706,0.281602,0.231202,0.247096,0.181734,0.456136,0.058366
std,537.279292,10374.784782,603.587476,1097.526505,1672.9957,5756.888239,5757.043371,2680.040846,0.167085,0.092492,0.10945,0.146054,0.201013,0.110698
min,100001.0,1109.0,100001.0,1502.0,100326.0,10.0,10.0,2.0,0.0,0.0,0.0,0.0,0.011278,0.0
25%,100003.0,17001.0,100084.0,2756.0,101338.0,29.0,29.0,10.0,0.151786,0.173913,0.166667,0.073171,0.3,0.0
50%,100006.0,21401.0,100159.0,3471.0,102241.0,60.0,60.0,24.0,0.262517,0.230769,0.23913,0.146853,0.444444,0.022504
75%,100007.0,31201.0,100231.0,4392.0,103074.0,146.0,146.0,62.0,0.393939,0.283391,0.316176,0.254658,0.6,0.060293
max,105886.0,39801.0,106675.0,5953.0,106790.0,564796.0,564799.0,285824.0,0.938931,0.875,0.866667,0.902655,1.0,0.968481


In [13]:
schools_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 760176 entries, 0 to 760175
Data columns (total 29 columns):
 #   Column                                                          Non-Null Count   Dtype  
---  ------                                                          --------------   -----  
 0   SchoolYear                                                      760176 non-null  object 
 1   OrganizationLevel                                               760176 non-null  object 
 2   County                                                          760176 non-null  object 
 3   ESDName                                                         755209 non-null  object 
 4   ESDOrganizationId                                               754163 non-null  float64
 5   DistrictCode                                                    749376 non-null  float64
 6   DistrictName                                                    760176 non-null  object 
 7   DistrictOrganizationId                

In [14]:
king_school_df = schools_df[schools_df.County == "King"].copy()

In [15]:
king_school_df['DistrictName'].value_counts()

Seattle School District No. 1              27136
Lake Washington School District            14470
Kent School District                       12693
Federal Way School District                12013
Highline School District                    9768
Northshore School District                  9572
Renton School District                      9106
Bellevue School District                    8219
Auburn School District                      8216
Issaquah School District                    7862
Shoreline School District                   5097
Snoqualmie Valley School District           4148
Tahoma School District                      3478
Enumclaw School District                    3264
Tukwila School District                     2342
Riverview School District                   2162
Mercer Island School District               1849
Vashon Island School District               1528
Rainier Prep Charter School District         698
Summit Public School: Atlas                  592
Skykomish School Dis

In [17]:
#Let's create a subset for our housing data to fit the criteria for our stakeholders
kc_family_house_df = df[(df['bedrooms'] >= 4) & (df['nuisance'] == "NO")]
#Reset the index
kc_family_house_df.reset_index(drop=True,inplace= True)
#Preview first 5 rows of subset
kc_family_house_df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,...,sewer_system,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,yr_renovated,address,lat,long
0,7399300360,5/24/2022,675000.0,4,1.0,1180,7140,1.0,NO,NO,...,PUBLIC,1180,0,0,40,1969,0,"2102 Southeast 21st Court, Renton, Washington ...",47.461975,-122.19052
1,1180000275,9/29/2021,311000.0,6,2.0,2880,6156,1.0,NO,NO,...,PUBLIC,1580,1580,0,0,1956,0,"8504 South 113th Street, Seattle, Washington 9...",47.502045,-122.2252
2,2944500680,3/17/2022,780000.0,4,2.5,2340,8125,2.0,NO,NO,...,PUBLIC,2340,0,440,70,1989,0,"2721 Southwest 343rd Place, Federal Way, Washi...",47.29377,-122.36932
3,2619950340,6/21/2021,975000.0,4,2.5,2980,5859,2.0,NO,NO,...,PUBLIC,2980,0,540,170,2011,0,"27950 Northeast 147th Circle, Duvall, Washingt...",47.73317,-121.965305
4,2873000690,6/11/2021,680000.0,4,3.0,2130,7649,1.0,NO,NO,...,PUBLIC,1130,1100,440,280,1975,0,"20432 130th Place Southeast, Kent, Washington ...",47.418155,-122.16696


In [18]:
kc_family_house_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11054 entries, 0 to 11053
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             11054 non-null  int64  
 1   date           11054 non-null  object 
 2   price          11054 non-null  float64
 3   bedrooms       11054 non-null  int64  
 4   bathrooms      11054 non-null  float64
 5   sqft_living    11054 non-null  int64  
 6   sqft_lot       11054 non-null  int64  
 7   floors         11054 non-null  float64
 8   waterfront     11054 non-null  object 
 9   greenbelt      11054 non-null  object 
 10  nuisance       11054 non-null  object 
 11  view           11054 non-null  object 
 12  condition      11054 non-null  object 
 13  grade          11054 non-null  object 
 14  heat_source    11053 non-null  object 
 15  sewer_system   11047 non-null  object 
 16  sqft_above     11054 non-null  int64  
 17  sqft_basement  11054 non-null  int64  
 18  sqft_g

In [19]:
kc_family_house_df.describe()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,yr_renovated,lat,long
count,11054.0,11054.0,11054.0,11054.0,11054.0,11054.0,11054.0,11054.0,11054.0,11054.0,11054.0,11054.0,11054.0,11054.0,11054.0
mean,4588934000.0,1371442.0,4.31328,2.827755,2757.591279,17325.16,1.661706,2349.729962,567.531663,445.106477,260.396689,1982.609915,85.799168,47.481833,-121.930794
std,2896914000.0,1057857.0,0.608759,0.837062,962.015735,50164.48,0.492704,949.663211,669.141408,284.598893,267.207757,30.015171,405.388921,0.697157,2.931898
min,1000094.0,37440.0,4.0,0.0,3.0,674.0,1.0,2.0,0.0,0.0,0.0,1900.0,0.0,25.914681,-122.52004
25%,2024064000.0,760000.0,4.0,2.5,2100.0,5706.0,1.0,1600.0,0.0,310.0,70.0,1963.0,0.0,47.389735,-122.29683
50%,3913700000.0,1100000.0,4.0,2.5,2600.0,7899.0,2.0,2250.0,0.0,468.0,194.5,1988.0,0.0,47.546028,-122.19058
75%,7334625000.0,1669750.0,5.0,3.0,3210.0,11331.0,2.0,2930.0,1110.0,620.0,360.0,2006.0,0.0,47.661055,-122.088734
max,9904000000.0,23500000.0,13.0,10.5,15360.0,1722798.0,4.0,12660.0,8020.0,3390.0,4370.0,2022.0,2022.0,48.19496,-73.776134


In [21]:
kc_family_house_df.duplicated().value_counts()

False    11054
dtype: int64

In [28]:
# Dropping any rows with null values, since there is not that many, so it shouldn't really affect the 
# success of our dataset.
kc_family_house_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kc_family_house_df.dropna(inplace=True)


In [30]:
# As well, dropping the 'nuisance' column, since this dataset was filtered to only inlcude rows that
# returned a value of "NO" in our 'nuisance' column.
kc_family_house_df.drop(columns = ['nuisance'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [31]:
kc_family_house_df

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,...,sewer_system,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,yr_renovated,address,lat,long
0,7399300360,5/24/2022,675000.0,4,1.0,1180,7140,1.0,NO,NO,...,PUBLIC,1180,0,0,40,1969,0,"2102 Southeast 21st Court, Renton, Washington ...",47.461975,-122.190520
1,1180000275,9/29/2021,311000.0,6,2.0,2880,6156,1.0,NO,NO,...,PUBLIC,1580,1580,0,0,1956,0,"8504 South 113th Street, Seattle, Washington 9...",47.502045,-122.225200
2,2944500680,3/17/2022,780000.0,4,2.5,2340,8125,2.0,NO,NO,...,PUBLIC,2340,0,440,70,1989,0,"2721 Southwest 343rd Place, Federal Way, Washi...",47.293770,-122.369320
3,2619950340,6/21/2021,975000.0,4,2.5,2980,5859,2.0,NO,NO,...,PUBLIC,2980,0,540,170,2011,0,"27950 Northeast 147th Circle, Duvall, Washingt...",47.733170,-121.965305
4,2873000690,6/11/2021,680000.0,4,3.0,2130,7649,1.0,NO,NO,...,PUBLIC,1130,1100,440,280,1975,0,"20432 130th Place Southeast, Kent, Washington ...",47.418155,-122.166960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11049,6743700362,6/17/2021,2080000.0,4,2.5,3280,7200,2.0,NO,NO,...,PUBLIC,3280,0,570,140,2010,0,"12716 Northeast 106th Lane, Kirkland, Washingt...",47.695630,-122.171095
11050,8081900170,3/29/2022,2650000.0,4,3.5,3270,9200,2.0,NO,NO,...,PUBLIC,2410,1060,0,40,1925,0,"3303 Northwest 72nd Street, Seattle, Washingto...",47.680710,-122.400025
11051,2539500005,12/14/2021,880000.0,4,2.5,2260,5661,1.0,NO,NO,...,PUBLIC,1310,1300,460,230,2000,0,"2830 South Bradford Place, Seattle, Washington...",47.568800,-122.295505
11052,6142050730,6/3/2022,1325000.0,5,3.5,3100,6428,2.0,NO,NO,...,PUBLIC,3100,0,710,110,2018,0,"28910 Northeast 156th Street, Duvall, Washingt...",47.740145,-121.952390
