In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [11]:
df = pd.read_csv("data.csv")
print("attributes ")
print(df.columns)
print("data instances")
print(df.count())
print("null va;ues")
print(df.isnull().sum())

attributes 
Index(['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'street', 'city',
       'statezip', 'country'],
      dtype='object')
data instances
date             4600
price            4597
bedrooms         4595
bathrooms        4598
sqft_living      4598
sqft_lot         4598
floors           4598
waterfront       4595
view             4598
condition        4600
sqft_above       4597
sqft_basement    4597
yr_built         4594
yr_renovated     4600
street           4597
city             4596
statezip         4597
country          4594
dtype: int64
null va;ues
date             0
price            3
bedrooms         5
bathrooms        2
sqft_living      2
sqft_lot         2
floors           2
waterfront       5
view             2
condition        0
sqft_above       3
sqft_basement    3
yr_built         6
yr_renovated     0
street           3
ci

In [5]:
def fillNaObject(cols):
    for col in cols:
        df[col] = df[col].fillna(df[col].mode()[0])

fillNaObject(['street','city','statezip','country'])

def fillNaInt(cols):
    for col in cols:
        df[col] = df[col].fillna(df[col].mode()[0])

fillNaInt(['bedrooms','bathrooms','floors','waterfront','view','yr_built'])

def fillNaFloat(cols):
    for col in cols:
        df[col] = df[col].fillna(df[col].mean())

fillNaFloat(['price','sqft_living','sqft_lot','sqft_above','sqft_basement'])

In [6]:
def convertFloattoInt(cols):
    for col in cols:
        df[col] = df[col].astype('int64')

convertFloattoInt(['price','bedrooms', 'bathrooms','floors','waterfront','view', 'yr_built', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement'])

df.drop(columns=['street','country','date'], inplace=True)

In [7]:
def convertObjtoInt(cols):
    for col in cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

convertObjtoInt(['city','statezip'])

df['price'] = df['price'].fillna(df['price'].mode()[0])

In [8]:
trainData, testData = train_test_split(df, test_size=0.2,shuffle=False)

train_x = trainData.iloc[:, 1:]
train_y = trainData.iloc[:, 0]
test_x = testData.iloc[:, 1:]
test_y = testData.iloc[:, 0]
print("missing vakues train_y",train_y.isnull().sum())
print("missing values test_y ",test_y.isnull().sum())

missing vakues train_y 0
missing values test_y  0


In [None]:
print("using GradientBoostingRegressor")
model_gbr = GradientBoostingRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
model_gbr.fit(train_x, train_y)

joblib.dump(model_gbr, 'model_gbr.joblib')

model_gbr_loaded = joblib.load('model_gbr.joblib')
model_predictions_gbr = model_gbr_loaded.predict(test_x)

testdata_predict = testData.copy(deep=True)
testdata_predict['Prediction_GBR'] = model_predictions_gbr

model_mse_gbr = mean_squared_error(test_y, model_predictions_gbr)
model_r2_gbr = r2_score(test_y, model_predictions_gbr)
print("GradientBoostingRegressor Mean Squared Error: ",round(model_mse_gbr, 3))
print("GradientBoostingRegressor R2 Score: ",round(model_r2_gbr, 3))

using GradientBoostingRegressor 
GradientBoostingRegressor Mean Squared Error:  1027065269818.388
GradientBoostingRegressor R2 Score:  0.026


In [10]:
print("using XGBoost Regressor")
model_xgb = XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
model_xgb.fit(train_x, train_y)

joblib.dump(model_xgb,'model_xgb_regressor.joblib')

model_xgb_loaded = joblib.load('model_xgb_regressor.joblib')
model_predictions_xgb = model_xgb_loaded.predict(test_x)

testdata_predict['Prediction_XGB'] = model_predictions_xgb

model_mse_xgb = mean_squared_error(test_y, model_predictions_xgb)
model_r2_xgb = r2_score(test_y, model_predictions_xgb)
print("XGBoost Regressor mean squared error:",round(model_mse_xgb, 3))
print("XGBoost Regressor R2 score:",round(model_r2_xgb, 3))

using XGBoost Regressor
XGBoost Regressor mean squared error: 1023232057150.284
XGBoost Regressor R2 score: 0.029
