In [2]:
# Basic imports
import pandas as pd
import numpy as np

from datetime import datetime
from pydantic import BaseModel

# for Q-Q plots
import scipy.stats as stats
import re

In [3]:
# Loading of the dataset via pandas
kc_data = pd.read_csv("data/King_County_House_prices_dataset.csv")
kc_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,0.0,...,7,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,7,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,0.0,0.0,...,6,770,0.0,1933,,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,...,7,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,...,8,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503


In [4]:
kc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   date           21597 non-null  object 
 2   price          21597 non-null  float64
 3   bedrooms       21597 non-null  int64  
 4   bathrooms      21597 non-null  float64
 5   sqft_living    21597 non-null  int64  
 6   sqft_lot       21597 non-null  int64  
 7   floors         21597 non-null  float64
 8   waterfront     19221 non-null  float64
 9   view           21534 non-null  float64
 10  condition      21597 non-null  int64  
 11  grade          21597 non-null  int64  
 12  sqft_above     21597 non-null  int64  
 13  sqft_basement  21597 non-null  object 
 14  yr_built       21597 non-null  int64  
 15  yr_renovated   17755 non-null  float64
 16  zipcode        21597 non-null  int64  
 17  lat            21597 non-null  float64
 18  long  

In [5]:
class DataValidation(BaseModel):
    id: int 
    date: object 
    price: float
    bedrooms: int  
    bathrooms: float
    sqft_living: int  
    sqft_lot: int  
    floors: float
    waterfront: float
    view: float
    condition: int  
    grade: int  
    sqft_above: int  
    sqft_basement: object 
    yr_built: int  
    yr_renovated: float
    zipcode: int  
    lat: float
    long: float
    sqft_living15: int 
    sqft_lot15: int

To be able to apply the pydantic model to our `pandas dataframe` and validate the types we need to define a little helper class because the `DataValidation` class only accepts dictionaries. We have to transform the `pandas dataframe` into a dictionary.

In [6]:
class DataframeValidation(BaseModel):
        df_as_dict: list[DataValidation]
        
data_dict=kc_data.to_dict(orient='records')

DataframeValidation(df_as_dict=data_dict)

DataframeValidation(df_as_dict=[DataValidation(id=7129300520, date='10/13/2014', price=221900.0, bedrooms=3, bathrooms=1.0, sqft_living=1180, sqft_lot=5650, floors=1.0, waterfront=nan, view=0.0, condition=3, grade=7, sqft_above=1180, sqft_basement='0.0', yr_built=1955, yr_renovated=0.0, zipcode=98178, lat=47.5112, long=-122.257, sqft_living15=1340, sqft_lot15=5650), DataValidation(id=6414100192, date='12/9/2014', price=538000.0, bedrooms=3, bathrooms=2.25, sqft_living=2570, sqft_lot=7242, floors=2.0, waterfront=0.0, view=0.0, condition=3, grade=7, sqft_above=2170, sqft_basement='400.0', yr_built=1951, yr_renovated=1991.0, zipcode=98125, lat=47.721, long=-122.319, sqft_living15=1690, sqft_lot15=7639), DataValidation(id=5631500400, date='2/25/2015', price=180000.0, bedrooms=2, bathrooms=1.0, sqft_living=770, sqft_lot=10000, floors=1.0, waterfront=0.0, view=0.0, condition=3, grade=6, sqft_above=770, sqft_basement='0.0', yr_built=1933, yr_renovated=nan, zipcode=98028, lat=47.7379, long=-12

Transforming it into a validation function.

In [7]:
def data_validation(df: pd.DataFrame, data_schema) -> pd.DataFrame:
    class DataframeValidation(BaseModel):
        df_as_dict: list[data_schema]
    df_as_dict = df.to_dict(orient='records')
    DataframeValidation(df_as_dict=df_as_dict)
    return df  

All Transformers in scikit-learn are classes that are build very similarly. If we want to make sure that our custom transformers work seamlessly with the rest of scikit-learn, we simply need to defined a class that inherits from the `BaseEstimator` and `TransformerMixin` classes from the `sklearn.base` module. All our class needs to have is a `.fit()` method (returning self) and a `.transform()` method (additionally we can also add a `.fit_transform()` method).

In [8]:
kc_data_sklearn = kc_data.copy()

In [13]:
from sklearn.base import BaseEstimator, TransformerMixin

class BathBedRatioOutlierTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
    
        X["bath_bed_ratio"] = X["bathrooms"] / X["bedrooms"]
        for idx, ratio in enumerate(X["bath_bed_ratio"]):
            if ratio >= 2:
                X.drop(idx, inplace=True)
            elif ratio <= 0.10:
                X.drop(idx, inplace=True)
        return X

In [14]:
Bath_bed_ratio_outlier_transformer = BathBedRatioOutlierTransformer()

kc_data_sklearn = Bath_bed_ratio_outlier_transformer.fit_transform(kc_data_sklearn)

Dealing with 'sqft_basement' column

In [15]:
class SqftBasementTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):

        X["sqft_basement"] = X["sqft_basement"].replace("?", np.nan)
        X["sqft_basement"] = X["sqft_basement"].astype(float)
        X["sqft_basement"] = X["sqft_living"] - X["sqft_above"]
        return X

In [16]:
sqft_basement_transformer = SqftBasementTransformer()
kc_data_sklearn = sqft_basement_transformer.fit_transform(kc_data_sklearn)

Dealing with missing values in 'view' and 'waterfront' column

In [17]:
class FillMissingsViewWFTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X["view"] = X["view"].fillna(0)
        X["waterfront"] = X["waterfront"].fillna(0)
        return X

In [18]:
fill_missings_view_wf_transformer = FillMissingsViewWFTransformer()
kc_data_sklearn = fill_missings_view_wf_transformer.fit_transform(kc_data_sklearn)

Dealing with missing values in "yr_renovated"

In [19]:
class CalculateLastChangeTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
            last_known_change = []
            for idx, yr_re in X.yr_renovated.items():
                if str(yr_re) == 'nan' or yr_re == 0.0:
                    last_known_change.append(X.yr_built[idx])
                else:
                    last_known_change.append(int(yr_re))

            X['last_known_change'] = last_known_change
            X.drop("yr_renovated", axis=1, inplace=True)
            X.drop("yr_built", axis=1, inplace=True)
            return X

In [20]:
calculate_last_change_transformer = CalculateLastChangeTransformer()
kc_data_sklearn = calculate_last_change_transformer.fit_transform(kc_data_sklearn)

Dealing with date 

In [21]:
class DateTimeChangeTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X['date'] = pd.to_datetime(X['date'])
        return X

In [22]:
datetime_change_transformer = DateTimeChangeTransformer()
kc_data_sklearn = datetime_change_transformer.fit_transform(kc_data_sklearn)

Data cleaning pipeline

In [23]:
from sklearn.pipeline import Pipeline

data_cleaning_pipeline = Pipeline([
    ('Bath_bed_ratio_outlier_transformer', BathBedRatioOutlierTransformer()),
    ('sqft_basement_transformer', SqftBasementTransformer()),
    ('fill_missings_view_wf_transformer', FillMissingsViewWFTransformer()),
    ('calculate_last_change_transformer', CalculateLastChangeTransformer()),
    ('datetime_change_transformer', DateTimeChangeTransformer())
     ]
)

Feature Engineering

Creating price per sqft 

In [28]:
class PricePerSqftTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X['sqft_price'] = (X.price/(X.sqft_living + X.sqft_lot)).round(2)
        return X

In [29]:
price_per_sqft_transformer = PricePerSqftTransformer()
kc_data_sklearn = price_per_sqft_transformer.fit_transform(kc_data_sklearn)

Calculating the distance between the houses and center of city

In [30]:
class HouseCenterDistanceTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):

        X['delta_lat'] = np.absolute(47.62774- X['lat'])
        X['delta_long'] = np.absolute(-122.24194-X['long'])
        X['center_distance']= ((X['delta_long']* np.cos(np.radians(47.6219)))**2 
                                    + X['delta_lat']**2)**(1/2)*2*np.pi*6378/360
        return X

In [31]:
house_center_distance_transformer = HouseCenterDistanceTransformer()
kc_data_sklearn = house_center_distance_transformer.fit_transform(kc_data_sklearn)

Distance between house & waterfront

In [35]:
class HouseWaterfrontDistanceTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        def house_beach_promenade_distance(long, lat, ref_long, ref_lat):
            delta_long = long - ref_long
            delta_lat = lat - ref_lat
            delta_long_corr = delta_long * np.cos(np.radians(ref_lat))
            return ((delta_long_corr)**2 +(delta_lat)**2)**(1/2)*2*np.pi*6378/360
        
        water_list= X.query('waterfront == 1')
        water_distance = []
        for idx, lat in X.lat.iteritems():
            ref_list = []
            for x,y in zip(list(water_list.long), list(water_list.lat)):
                ref_list.append(house_beach_promenade_distance(X.long[idx], X.lat[idx],x,y).min())
            water_distance.append(min(ref_list))
        
        X['water_distance'] = water_distance
        return X

In [39]:
# house_waterfront_distance_transformer = HouseWaterfrontDistanceTransformer()
# kc_data_sklearn = house_waterfront_distance_transformer.fit_transform(kc_data_sklearn)

Making the Pipeline for Feature Engineering

In [40]:

Feature_Engineering_pipeline = Pipeline([
    ('price_per_sqft_transformer', PricePerSqftTransformer()),
    ('house_center_distance_transformer', HouseCenterDistanceTransformer()),
    ('house_waterfront_distance_transformer', HouseWaterfrontDistanceTransformer())
     ]
)