In [14]:
# Import the dependencies

import numpy as np
import pandas as pd
import plotly.express as go
import plotly.graph_objects as go
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score

In [2]:
# Read the training and test CSV files
df = pd.read_csv("train.csv")
df_t = pd.read_csv("test.csv")

# Outlier filter: keep only the rows whose GrLivArea is below 4000
df = df.loc[df["GrLivArea"] < 4000]

# Separate the SalePrice target from the remaining feature columns
y = df["SalePrice"]
X = df.drop(columns="SalePrice")

# Hold out 10% of the rows for evaluation; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1234
)

In [3]:
# Count the missing values per column, keeping only columns that have at least one
null_counts = df.isna().sum().loc[lambda counts: counts != 0]
null_counts

LotFrontage      259
Alley           1365
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1451
Fence           1176
MiscFeature     1402
dtype: int64

In [4]:
# Column names grouped by dtype: numeric columns first, then object-dtype columns
num_col, cat_col = (
    np.array(X.select_dtypes(include=kinds).columns)
    for kinds in ([np.number], ['object'])
)

In [5]:
class PrepProcessor():
    """Impute missing values in the numeric and categorical feature columns.

    The two column groups are read from the notebook-level ``num_col`` and
    ``cat_col`` arrays. Numeric gaps are filled with the per-column mean
    learned in ``fit``; categorical gaps are filled with the constant
    string ``'Null'``.
    """

    def fit(self, X, y=None):
        """
        Create and fit the numeric and categorical imputers.

        Parameters
        ----------
        X : DataFrame holding every column listed in ``num_col`` and ``cat_col``.
        y : Unused; accepted so the object can act as a pipeline step.

        Returns
        -------
        self, with ``num_imputer`` and ``cat_imputer`` fitted.
        """
        self.num_imputer = SimpleImputer(strategy='mean')
        self.cat_imputer = SimpleImputer(strategy='constant', fill_value='Null')

        self.num_imputer.fit(X[num_col])
        self.cat_imputer.fit(X[cat_col])

        return self

    def transform(self, X, y=None):
        """
        Fill in missing values using the fitted imputers.

        Parameters
        ----------
        X : DataFrame holding every column listed in ``num_col`` and ``cat_col``.
        y : Unused.

        Returns
        -------
        A new DataFrame with the imputed values; the frame passed in is left
        unmodified.
        """
        # Work on a copy: assigning into the caller's frame would silently
        # overwrite X_train / X_test (and trips pandas' chained-assignment
        # warning when the input is itself a slice of another frame).
        X = X.copy()
        X[num_col] = self.num_imputer.transform(X[num_col])
        X[cat_col] = self.cat_imputer.transform(X[cat_col])

        return X
    
        

In [6]:
# Assemble the preprocessing pieces: imputation first, then per-dtype scaling / encoding

preprocess = PrepProcessor()
num_pipe = Pipeline(steps=[('Scaler', StandardScaler())])
cat_pipe = Pipeline(steps=[('OneHot', OneHotEncoder(handle_unknown='ignore'))])
transformer = ColumnTransformer(
    transformers=[
        ('num', num_pipe, num_col),
        ('cat', cat_pipe, cat_col),
    ]
)

In [7]:
# Full model: imputation -> column-wise scaling/encoding -> XGBoost regressor
pipe = Pipeline(
    steps=[
        ('InitialPreprocess', preprocess),
        ('Transformer', transformer),
        ('xgb', XGBRegressor()),
    ]
)

In [8]:
# Fit every pipeline step on the training split
pipe.fit(X=X_train, y=y_train)

In [9]:
# Predict on the held-out split
yhat = pipe.predict(X=X_test)

In [10]:
# Display the pipeline's predictions for X_test
yhat

array([128080.03 , 269955.34 , 175605.22 , 221350.5  , 185476.3  ,
       139550.95 , 200140.05 , 202816.48 , 186210.8  , 132104.95 ,
       140258.36 , 189512.77 ,  89792.28 , 118655.59 , 130603.94 ,
       220841.2  , 151467.94 , 178727.9  , 419306.03 , 178517.36 ,
       255371.39 , 300165.56 , 197134.98 , 195707.67 , 294550.34 ,
       130803.16 , 103413.69 , 157262.36 , 174089.53 , 184722.38 ,
       186075.55 , 137586.7  , 132231.97 , 123846.414, 148688.47 ,
       144244.97 , 110144.7  , 100266.98 , 179769.58 , 157168.19 ,
       177156.66 ,  79127.94 , 282014.5  , 146351.86 , 122211.164,
       128421.76 , 173511.75 , 165572.56 , 265309.7  , 330407.28 ,
       312368.16 , 103142.32 , 238859.64 ,  81251.69 , 187006.36 ,
       122855.055, 178650.53 ,  93385.055,  89216.414, 124280.78 ,
       234128.53 , 164702.84 , 278917.66 , 128774.5  , 151741.17 ,
       119372.4  , 124062.68 , 197334.61 , 126892.234, 308146.53 ,
        73404.16 , 172681.19 , 269033.   , 167440.38 , 122518.

In [11]:
# Display the actual SalePrice values of the held-out split for comparison
y_test

1145    149000
359     280000
1273    177000
444     210000
446     190000
         ...  
581     253293
1120    118400
90      109900
385     192000
625     160000
Name: SalePrice, Length: 146, dtype: int64

Visualize the results to assess the model

In [27]:
# Scatter the predictions (yhat) against the held-out targets (y_test)

# Title and axis labels for the figure
layout = go.Layout(
    title='Predicted vs Actual Y-values',
    xaxis_title='y_test',
    yaxis_title='yhat',
)

# One marker per row of the test split
scatter = go.Scatter(x=y_test, y=yhat, mode='markers')

fig = go.Figure(data=scatter, layout=layout)
fig.show()