In [66]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


test_data = pd.read_csv("data/test.csv")
train_data = pd.read_csv("data/train.csv")

#Removing rows without sale price
train_data.dropna(axis=0, subset=['SalePrice'], inplace = True)

#Our target
y = train_data.SalePrice

train_data.drop(['SalePrice'],axis=1, inplace =True)

# train_data.fillna()

X_train_full, X_valid_full, y_train, y_valid = train_test_split(train_data, y, 
                                                                train_size=0.9, test_size=0.1,
                                                                random_state=0)


In [67]:
X_train_full.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
930,931,20,RL,73.0,8925,Pave,,IR1,HLS,AllPub,...,0,0,,,,0,7,2009,WD,Normal
656,657,20,RL,72.0,10007,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,8,2008,WD,Normal
45,46,120,RL,61.0,7658,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2010,WD,Normal
1348,1349,20,RL,,16196,Pave,,IR3,Low,AllPub,...,0,0,,,,0,8,2007,WD,Normal
55,56,20,RL,100.0,10175,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,7,2008,WD,Normal


In [68]:
numeric_col = [col_name for col_name in X_train_full.columns 
               if X_train_full[col_name].dtype in ['int64', 'float64']] 
# print(numeric_col)

categ_col = [col_name for col_name in X_train_full.columns 
               if X_train_full[col_name].dtype == 'object'
            and X_train_full[col_name].nunique() <10]
# print(categ_col)

In [70]:
staying_col = numeric_col + categ_col

X_train = X_train_full[staying_col].copy()
X_valid = X_valid_full[staying_col].copy()
X_test = test_data[staying_col].copy()

X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_test = pd.get_dummies(X_test)

X_train, X_valid = X_train.align(X_valid, join='left', axis=1)
X_train, X_test = X_train.align(X_test, join='left', axis=1)

In [72]:
X_train.head()


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
930,931,20,73.0,8925,8,5,2007,2007,0.0,16,...,0,0,0,1,0,0,0,0,1,0
656,657,20,72.0,10007,5,7,1959,2006,54.0,806,...,0,0,0,1,0,0,0,0,1,0
45,46,120,61.0,7658,9,5,2005,2005,412.0,456,...,0,0,0,1,0,0,0,0,1,0
1348,1349,20,,16196,7,5,1998,1998,0.0,1443,...,0,0,0,1,0,0,0,0,1,0
55,56,20,100.0,10175,6,5,1964,1964,272.0,490,...,0,0,0,1,0,0,0,0,1,0


In [62]:
from xgboost import XGBRegressor

boost_model = XGBRegressor(n_estimators = 1000, learning_rate =0.05, n_jobs=4)
boost_model.fit(X_train, y_train,
             early_stopping_rounds=4, 
             eval_set=[(X_valid, y_valid)], 
             verbose=False)
# boost_model.fit(X_train, y_train)

predictions = boost_model.predict(X_valid)
score = mean_squared_error(y_valid, predictions)
print(score)

  if getattr(data, 'base', None) is not None and \


1227685596.2819319


In [73]:
# print(boost_model.get_params)
# print(boost_model.get_xgb_params())

In [63]:
#Making and saving predictions
preds_test = boost_model.predict(X_test)
output = pd.DataFrame({'Id': X_test.index+1461,
                       'SalePrice': preds_test})
output.to_csv('prices_XGB.csv', index=False)