In [1]:
# Numpy
import numpy as np
# Pandas
import pandas as pd
# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
from pandas.plotting import scatter_matrix
# Statistics
from scipy import stats
from scipy.stats import pearsonr
# Split
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
# Preprocessing
from sklearn.preprocessing import StandardScaler, QuantileTransformer, quantile_transform
from sklearn.decomposition import PCA
# Pipeline
from sklearn.pipeline import make_pipeline
# Feature selection
from sklearn.feature_selection import f_regression, SelectKBest
# Models
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
import xgboost as xgb
# Metrics
from sklearn.metrics import (make_scorer, 
            mean_squared_error, mean_absolute_error, r2_score)
# Correlation & hierarchy clustering
from scipy.cluster import hierarchy
from scipy.spatial import distance

In [2]:
# Default figure size (width, height in inches) for every plot in this notebook
plt.rc('figure', figsize=(10, 4))

## Getting the Data

In [3]:
# Read the housing dataset from the CSV file located next to the notebook
df = pd.read_csv('house.csv')
# Report the dimensions of the loaded frame: row count, then column count
print(f'No of rows: {len(df)}\nNo of columns: {len(df.columns)}')
# Preview the top 5 records as the cell's rich output
df.head(5)

No of rows: 21613
No of columns: 22


Unnamed: 0,id,date,price,price_bin,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,0,3,1.0,1180,5650,1.0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,0,3,2.25,2570,7242,2.0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,0,2,1.0,770,10000,1.0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,0,4,3.0,1960,5000,1.0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,0,3,2.0,1680,8080,1.0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [4]:
# Column-by-column overview: names, non-null counts and dtypes
df.info()
# Total number of missing (NaN/None) cells across the whole frame
print(f'Missing cells: {df.isna().to_numpy().sum()}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   price_bin      21613 non-null  int64  
 4   bedrooms       21613 non-null  int64  
 5   bathrooms      21613 non-null  float64
 6   sqft_living    21613 non-null  int64  
 7   sqft_lot       21613 non-null  int64  
 8   floors         21613 non-null  float64
 9   waterfront     21613 non-null  int64  
 10  view           21613 non-null  int64  
 11  condition      21613 non-null  int64  
 12  grade          21613 non-null  int64  
 13  sqft_above     21613 non-null  int64  
 14  sqft_basement  21613 non-null  int64  
 15  yr_built       21613 non-null  int64  
 16  yr_renovated   21613 non-null  int64  
 17  zipcode        21613 non-null  int64  
 18  lat   

No missing data

In [5]:
# Describe & inspect dataset.
# Jupyter only renders a bare expression when it is the LAST statement of a
# cell, so the summary statistics and the column index are passed to
# display() explicitly; as bare expressions mid-cell they were silently
# discarded and never shown.
display(df.describe())
# Let's inspect the columns
display(df.columns)
# Print the number of unique values in each column
print(df.nunique())

id               21436
date               372
price             3625
price_bin            2
bedrooms            13
bathrooms           30
sqft_living       1038
sqft_lot          9782
floors               6
waterfront           2
view                 5
condition            5
grade               12
sqft_above         946
sqft_basement      306
yr_built           116
yr_renovated        70
zipcode             70
lat               5034
long               752
sqft_living15      777
sqft_lot15        8689
dtype: int64
