In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPRegressor

from scipy.stats.mstats import winsorize

In [18]:
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.lof import LOF
import pandas as pd
import numpy as np
import seaborn as snd
import sklearn

## 1.0 Initial Inspection

In [7]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [9]:
df.isnull().sum()

date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
street           0
city             0
statezip         0
country          0
dtype: int64

In [10]:
print("Categorical Columns:", list(df.select_dtypes(include = 'object').columns))
print("Numerical Columns:", list(df.select_dtypes(exclude = 'object').columns))

Categorical Columns: ['date', 'street', 'city', 'statezip', 'country']
Numerical Columns: ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated']


In [11]:
temp = df.groupby(['street']).mean()['price']
temp = pd.concat([df.groupby(['street']).count().rename(columns = {'price': 'count'})['count'], temp], axis = 1)
temp = temp.sort_values(by=['count'],ascending = False)
temp[temp['count']>1]

Unnamed: 0_level_0,count,price
street,Unnamed: 1_level_1,Unnamed: 2_level_1
2520 Mulberry Walk NE,4,4.227450e+05
2500 Mulberry Walk NE,3,4.042713e+05
11716 16th Ave NE,2,3.260000e+05
7490 85th Ave SE,2,7.612500e+05
5010 Greenwood Ave N,2,5.505000e+05
...,...,...
13516 164th Ave SE,2,5.613300e+05
8216 Linden Ave N,2,6.095000e+05
1149-1199 91st Ave NE,2,2.510750e+06
1273 NE Hickory Ln,2,4.570025e+05


## 1.1 Determine which features are continuous vs categorical. Drop rows without a valid sales price.

In [12]:
df[df['price'] == 0]['price'].count()

49

In [13]:
df = df[df['price']!=0]
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [14]:
df['bedrooms'] = df['bedrooms'].astype(int)
df['bathrooms'] = df['bathrooms'].astype(int)
df['floors'] = df['floors'].astype(int)

df['month'] = pd.to_datetime(df['date']).dt.month #.astype(str)
#df['yr_built'] = df['yr_built'].astype(str)
#df['yr_renovated'] = df['yr_renovated'].astype(str)
df.drop(["country"], axis=1, inplace=True)

In [15]:
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,month
0,2014-05-02 00:00:00,313000.0,3,1,1340,7912,1,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,5
1,2014-05-02 00:00:00,2384000.0,5,2,3650,9050,2,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,5
2,2014-05-02 00:00:00,342000.0,3,2,1930,11947,1,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,5
3,2014-05-02 00:00:00,420000.0,3,2,2000,8030,1,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,5
4,2014-05-02 00:00:00,550000.0,4,2,1940,10500,1,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,5


## 1.2 Visualize the univariate distribution of each continuous feature, and the distribution of the target. Do you notice anything? Is there something that might require special treatment?

In [19]:
numeric_columns = list(df.select_dtypes(exclude = 'object').columns)
fig = make_subplots(rows=2, cols=3, shared_yaxes=True)
index = 0
for row in range(1,3):
    for col in range(1,4):
        fig.add_trace(go.Box(x=df[numeric_columns[index]], name = numeric_columns[index]),
                      row = row,
                      col = col,
)
        index = index + 1
fig.update_layout(height=600, width=1300,
                  title_text="Box-Plots of Numerical Variables")

fig.show()

NameError: name 'make_subplots' is not defined