In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
import statsmodels.api as sm

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [None]:
# Read the training and test CSV files, then preview the first training rows.
data_train, data_test = map(
    pd.read_csv,
    ("../dataset/train.csv", "../dataset/test.csv"),
)
data_train.head(5)

In [None]:
# Percentage of missing values per training column; only columns that
# actually have gaps are kept, sorted from most to least missing.
missing_pct = data_train.isnull().sum() / len(data_train) * 100
data_train_mis = missing_pct[missing_pct != 0].sort_values(ascending=False)
data_train_mis.head(20)

In [None]:
# Drop the variables with more than 50% missing values from both frames.
high_missing_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence']
for frame in (data_train, data_test):
    frame.drop(columns=high_missing_cols, inplace=True)


In [None]:
# Preview the first five training rows after the column drops.
data_train.head(5)

In [None]:
# Missing-value treatment: direct imputation with a fixed placeholder value.

# Categorical columns whose missing entries become the literal string 'None'.
none_fill_cols = [
    'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType',
]
# Numeric columns whose missing entries become 0.
zero_fill_cols = [
    'GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea',
]

# The same treatment is applied to the training and the test frame.
for frame in (data_train, data_test):
    frame[none_fill_cols] = frame[none_fill_cols].fillna('None')
    frame[zero_fill_cols] = frame[zero_fill_cols].fillna(0)


In [None]:
# Statistic-based imputation; each frame is filled from its own statistics.
for frame in (data_train, data_test):
    # LotFrontage: fill with the median LotFrontage of the row's Neighborhood.
    frame["LotFrontage"] = frame.groupby("Neighborhood")["LotFrontage"].transform(
        lambda group: group.fillna(group.median()))
    # Electrical: fill with the most common value.
    most_common_electrical = frame['Electrical'].mode()[0]
    frame['Electrical'] = frame['Electrical'].fillna(most_common_electrical)

# Re-evaluate which training columns still have missing values (in percent).
missing_pct = data_train.isnull().sum() / len(data_train) * 100
data_train_mis = missing_pct[missing_pct != 0].sort_values(ascending=False)
data_train_mis.head(20)

In [None]:
# Drop specific variables identified during the descriptive analysis.
redundant_cols = [
    'Utilities',     # all rows have the same value: little discriminative power
    'CentralAir',    # almost 100% of the cases are 'Y'
    'GarageArea',    # highly correlated with GarageCars
    '1stFlrSF',      # highly correlated with TotalBsmtSF
    'TotRmsAbvGrd',  # highly correlated with GrLivArea
]
for frame in (data_train, data_test):
    frame.drop(columns=redundant_cols, inplace=True)

data_train.head()

In [None]:
# Dense-output one-hot encoder.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the current keyword first and fall back to the
# legacy one on older installations.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

# Encoder with default settings.
# NOTE(review): neither encoder is used in the cells visible here; the
# categorical encoding below goes through pd.get_dummies instead.
encoder = preprocessing.OneHotEncoder()

In [None]:
# One-hot encode the categorical columns of the training frame.
#
# pd.get_dummies on the whole frame encodes every object/string/category
# column in one pass and returns a new frame (data_train is left untouched).
# Numeric columns keep their order and the dummy columns are appended, named
# "<column>_<level>". This replaces a per-column `dtype == object` check, which
# misses columns stored with pandas' dedicated string dtype, and a join
# repeated once per column.
data_train_c = pd.get_dummies(data_train)

print('Dims', data_train_c.shape)
# Remaining missing values are flagged with the sentinel -1.
data_train_c.fillna(-1, inplace=True)

In [None]:
# One-hot encode the categorical columns of the test frame and align the
# result with the encoded training frame.
#
# Encoding the two frames independently produces different dummy columns
# whenever a category level appears in only one of them, so a model fitted on
# the training columns could not score the test frame.
data_test_c = pd.get_dummies(data_test)

# Columns expected by the model: every encoded training column except the target.
expected_cols = [c for c in data_train_c.columns if c != 'SalePrice']
# Keep the row identifier even if it was already dropped from the training frame.
if 'Id' in data_test_c.columns and 'Id' not in expected_cols:
    expected_cols.insert(0, 'Id')

# Levels seen only in train become all-zero columns; levels seen only in test
# are discarded.
data_test_c = data_test_c.reindex(columns=expected_cols, fill_value=0)

print('Dims', data_test_c.shape)
# Remaining missing values are flagged with the sentinel -1.
data_test_c.fillna(-1, inplace=True)

In [None]:
# Summary statistics of the encoded training frame.
data_train_c.describe()

In [None]:
# Summary statistics of the encoded test frame.
data_test_c.describe()

In [None]:
# Working copy of the encoded training frame for modelling.
# A real copy (not an alias) is taken because later cells drop 'Id' and
# 'SalePrice' in place; with a plain alias those drops would also strip the
# columns from data_train_c, and re-running from this cell would fail with a
# KeyError.
data = data_train_c.copy()

In [None]:
# Summary statistics of the modelling frame.
data.describe()

In [None]:
# (rows, columns) of the modelling frame.
print(data.shape)

In [None]:
# datax is a second name for the same DataFrame object as `data` (no copy is
# made), so in-place changes through either name are visible through both.
datax=data

In [None]:
# Remove the 'Id' column from the modelling frame, in place.
data.drop(columns=['Id'], inplace=True)

In [None]:
# Target variable; must be extracted before 'SalePrice' is dropped from the frame.
y=data['SalePrice']

In [None]:
# Remove the target column in place so only predictor columns remain.
datax.drop(columns=['SalePrice'], inplace=True)

In [None]:
# Feature matrix X (the frame without 'Id' and 'SalePrice') and target y.
X , y = datax , y

In [None]:
# Wrap all rows of X, labelled with y, in XGBoost's DMatrix container
# (used by the native-API cells further down).
data_dmatrix = xgb.DMatrix(X, label=y)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Gradient-boosted regressor (scikit-learn style API).
# "reg:squarederror" is the current name of the squared-error objective; the
# former alias "reg:linear" is deprecated.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=10,
)

In [None]:
# Fit the regressor on the training split.
xg_reg.fit(X=X_train, y=y_train)

In [None]:
# Predictions of the fitted regressor for the held-out split.
preds = xg_reg.predict(X_test)

In [None]:
# Root-mean-squared error of the predictions on the held-out split.
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
print("RMSE: %f" % rmse)

In [None]:
# Booster parameters for the native XGBoost API.
# "reg:squarederror" is the current name of the squared-error objective; the
# former alias "reg:linear" is deprecated.
params = {
    "objective": "reg:squarederror",
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
}

# 3-fold cross-validation over the full DMatrix: at most 50 boosting rounds,
# with early stopping after 10 rounds, tracking RMSE. The result comes back as
# a DataFrame because as_pandas=True.
cv_results = xgb.cv(
    dtrain=data_dmatrix,
    params=params,
    nfold=3,
    num_boost_round=50,
    early_stopping_rounds=10,
    metrics="rmse",
    as_pandas=True,
    seed=123,
)

In [None]:
# First rows of the cross-validation results.
cv_results.head()

In [None]:
# Mean test RMSE of the last cross-validation row.
print(cv_results["test-rmse-mean"].tail(1))

In [None]:
# Train a 10-round model on the full DMatrix with the native API.
# Note: this rebinds xg_reg, which until now held the XGBRegressor fitted above.
xg_reg = xgb.train(params, data_dmatrix, num_boost_round=10)

In [None]:
# Top-20 features ranked by gain.
# figure.figsize is read when a figure is created, so it has to be set before
# plot_importance draws; setting it afterwards only affects later figures.
plt.rcParams['figure.figsize'] = [5, 5]
xgb.plot_importance(
    xg_reg,
    importance_type='gain',
    max_num_features=20,
    title='Importância no modelo',
)
plt.show()

In [None]:
# Top-20 features using plot_importance's default importance type.
# figure.figsize is set before plotting so this cell does not depend on an
# earlier cell having changed the global setting.
plt.rcParams['figure.figsize'] = [5, 5]
xgb.plot_importance(
    xg_reg,
    max_num_features=20,
    title='Mais usadas na árvore de decisão',
)
plt.show()

In [None]:
# Importance plot with default settings (no limit on the number of features).
# figure.figsize is set before plotting so this cell does not depend on an
# earlier cell having changed the global setting.
plt.rcParams['figure.figsize'] = [5, 5]
xgb.plot_importance(xg_reg)
plt.show()

In [None]:
# Print the built-in help for the trained model object.
# NOTE(review): exploratory leftover that produces a long text dump; consider
# removing it from the finished notebook.
help(xg_reg)

In [None]:
# Print the built-in help for xgb.plot_importance.
# NOTE(review): exploratory leftover; consider removing it from the finished notebook.
help(xgb.plot_importance)