In [13]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
import category_encoders as ce
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import r2_score
from joblib import parallel_backend
from ray.util.joblib import register_ray
register_ray()
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('HousePrices.csv')
df = df.drop('Amenities',axis=1)

In [3]:
X = df.drop('Price',axis=1)
y = df['Price']

In [6]:
class AreaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        X['Area'] = pd.to_numeric(X['Area'].str.replace('[^.0-9]', ''))
        return X
class BHKTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        X['BHK'] = pd.to_numeric(X['BHK'].str.replace('[^.0-9]', ''))
        return X
class LocationTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        Location_stats = X['Location'].value_counts(ascending=False)
        Location_stats_less_than_10 = Location_stats[Location_stats<=10]
        X.Location = X.Location.apply(lambda x : 'other' if x in Location_stats_less_than_10 else x)
        dummies = pd.get_dummies(X.Location)
        X = pd.concat([X.drop('Location',axis='columns'),dummies.drop('other',axis='columns')],axis='columns')
        return X
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
X_CustomPipeline = Pipeline(steps=[
    ('AreaTransformer',AreaTransformer()),
    ('BHKTransformer',BHKTransformer())
])
X_CustomPipelineLocation = Pipeline(steps=[
    ('LocationTransformer',LocationTransformer()
     )])
X_NumericPipeline = Pipeline(steps=[
    ('Simple Imputer',SimpleImputer(strategy='median')),
])

In [7]:
def convert_price(value):
    if 'Cr' in value:
        new_value1 = re.sub(re.compile('[^.0-9]'), '', value)
        return float(new_value1)*100
    elif 'Lac' in value:
        new_value2 = re.sub(re.compile('[^.0-9]'), '', value)
        return float(new_value2)
    else:
        return None
class PriceTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        X = X.apply(convert_price)
        X = np.array(X)
        X = X.reshape(-1,1)
        return X
class PriceLogTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        X = X.ravel()
        return np.log(X)
y_CustomPipeline = Pipeline(steps=[
    ('Price Transformer',PriceTransformer())
])
y_NumericPipeline = Pipeline(steps=[
    ('Simple Imputer',SimpleImputer(strategy='median')),
    ('Log Transformer',PriceLogTransformer())
])

In [11]:
X = X_CustomPipelineLocation.transform(X)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

X_train = X_CustomPipeline.transform(X_train)
X_train_final = X_NumericPipeline.fit_transform(X_train[['Area','BHK']])
X_train['Area'] = X_train_final[:,0]
X_train['BHK'] = X_train_final[:,1]

X_test = X_CustomPipeline.transform(X_test)
X_test_final = X_NumericPipeline.fit_transform(X_test[['Area','BHK']])
X_test['Area'] = X_test_final[:,0]
X_test['BHK'] = X_test_final[:,1]

y_train = y_CustomPipeline.fit_transform(y_train)
y_train = y_NumericPipeline.fit_transform(y_train)

y_test = y_CustomPipeline.fit_transform(y_test)
y_test = y_NumericPipeline.fit_transform(y_test)

In [12]:
def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'normalize': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse','squared_error'],
                'splitter': ['best','random']
            }
        },
        'gradient_booster': {
            'model': GradientBoostingRegressor(),
            'params': {
                'loss': ['squared_error', 'absolute_error'],
                'learning_rate': [0.1,1,1.5,2],
                'n_estimators': [10,50,100,150,200],
                'criterion': ['friedman_mse', 'squared_error']
            }
        },
        'ada_booster': {
            'model': AdaBoostRegressor(),
            'params': {
                'n_estimators': [10,50,100,150,200],
                'learning_rate': [0.1,1,1.5,2],
                'loss': ['linear','square','exponential']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        with parallel_backend('ray'): # We are using multithreading to speed up the process of training our models.
            gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

best_model = find_best_model_using_gridsearchcv(X_train,y_train)
print(best_model.head(5))

[2m[36m(PoolActor pid=11680)[0m If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:[32m [repeated 5x across cluster][0m
[2m[36m(PoolActor pid=11680)[0m [32m [repeated 30x across cluster][0m
[2m[36m(PoolActor pid=11680)[0m from sklearn.pipeline import make_pipeline[32m [repeated 5x across cluster][0m
[2m[36m(PoolActor pid=11680)[0m model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())[32m [repeated 5x across cluster][0m
[2m[36m(PoolActor pid=11680)[0m If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:[32m [repeated 5x across cluster][0m
[2m[36m(PoolActor pid=11680)[0m kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}[32m [repeated 5x across cluster][0m
[2m[36m(PoolActor pid=11680)[0m model.fit(X, y, **kwargs)[32m [repeated 5x across cluster][0m
[2m[36m(PoolAc

               model  best_score  \
0  linear_regression    0.700849   
1              lasso    0.306482   
2      decision_tree    0.745617   
3   gradient_booster    0.823123   
4        ada_booster    0.790997   

                                         best_params  
0                               {'normalize': False}  
1                {'alpha': 1, 'selection': 'random'}  
2         {'criterion': 'mse', 'splitter': 'random'}  
3  {'criterion': 'friedman_mse', 'learning_rate':...  
4  {'learning_rate': 0.1, 'loss': 'exponential', ...  


In [14]:
params = dict(best_model.iloc[3]['best_params'])
model = GradientBoostingRegressor(**params)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print(f'R2 Score: {r2_score(y_test,y_pred)}')

R2 Score: 0.8189436946366715
