# Kaggle Competions

## House Prices: Advanced Regression Techniques

## Predict sales prices and practice feature engineering, RFs, and gradient boosting



In [1]:
import pandas as pd
import numpy as np

# 1 - Importando os dados de Treino

In [2]:
df = pd.read_csv("./train.csv")

# 2 - Tratando os dados

In [3]:
df = df.fillna(df.mean())#Tratando os nulos

In [4]:
#criando uma classificação especial para condições que os compradores olham de imediato
df['IdealCondition'] = np.where((df['Street']=="Pave")&(df['Utilities']=="AllPub"),1,0)
df['IdealCondition1'] = np.where((df['OverallCond']==9)&(df['ExterCond']=="Ex" )&(df['ExterQual']=="Ex"),1,0)
df['IdealCondition2'] = np.where((df['LotArea']>=13000),1,0)
df['IdealCondition3'] = np.where((df['TotRmsAbvGrd']>=10),1,0)
df['IdealCondition4'] = np.where((df['Fireplaces']>=2),1,0)

In [5]:
df['Fireplaces'].value_counts(dropna=False)

0    690
1    650
2    115
3      5
Name: Fireplaces, dtype: int64

In [6]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MoSold,YrSold,SaleType,SaleCondition,SalePrice,IdealCondition,IdealCondition1,IdealCondition2,IdealCondition3,IdealCondition4
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,2,2008,WD,Normal,208500,1,0,0,0,0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,5,2007,WD,Normal,181500,1,0,0,0,0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,9,2008,WD,Normal,223500,1,0,0,0,0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,2,2006,WD,Abnorml,140000,1,0,0,0,0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,12,2008,WD,Normal,250000,1,0,1,0,0


In [7]:
df = df.fillna('ni') #tratando os nan

In [8]:
df = pd.get_dummies(df, drop_first=True) #criando dummies para as categóricas

# 3 - Treinando o modelo de Machine Learning

### 3.1 - Importando o Modelo

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

### 3.2 Instanciar em variáveis os modelos desejados

In [10]:
modelo_linear_regression = LinearRegression()
modelo_randon_forest = RandomForestRegressor()
modelo_gradient = GradientBoostingRegressor()

### 3.3 - Separar os dados em variáveis explicativas X e explicadas/target Y

In [11]:
#criando o x e o y
df_numeric = df.select_dtypes(include=[np.number]) 
numericas = list(df_numeric)
X = df[numericas].drop(['SalePrice'], axis = 1)
y = df['SalePrice']

### 3.4 Separar os dados em treino e teste

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25)

In [13]:
X_train.shape

(1095, 267)

In [14]:
y_train.shape

(1095,)

In [15]:
X_train.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
1198,1199,20,70.0,9100,7,5,2001,2001,0.0,0,...,0,0,0,0,1,0,0,0,1,0
1194,1195,60,80.0,9364,6,7,1969,1969,143.0,371,...,0,0,0,0,1,0,0,0,1,0
990,991,60,82.0,9452,8,5,1997,1998,423.0,1074,...,0,0,0,0,1,0,0,0,1,0
1037,1038,60,70.049958,9240,8,5,2001,2002,396.0,0,...,0,0,0,0,1,0,0,0,1,0
1323,1324,30,50.0,5330,4,7,1940,1950,0.0,280,...,0,0,0,0,1,0,0,0,1,0


### 3.5 Treinar o modelo com o .fit

__Modelo Linear__

In [16]:
modelo_linear_regression.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

__Modelo Randon Forest__

In [17]:
modelo_randon_forest.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

__Modelo Gradient Boosting__

In [18]:
modelo_gradient.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

### 3.6 Analisando as Métricas

In [19]:
modelo_linear_regression.score(X_train, y_train)

0.9383835845608273

In [20]:
modelo_linear_regression.score(X_test, y_test)

0.10362779181799031

In [21]:
modelo_randon_forest.score(X_train, y_train)

0.972064251466445

In [22]:
modelo_randon_forest.score(X_test, y_test)

0.8585250919629958

In [23]:
modelo_gradient.score(X_train, y_train)

0.970231124672872

In [24]:
modelo_gradient.score(X_test, y_test)

0.8990914227513834

### 3.7 Medindo MSE (erros médios quadráticos) e o MAE (erros médios absolutos)

In [25]:
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error

In [26]:
# primeiro modelo - regressão linear
yhat_train = modelo_linear_regression.predict(X_train) # previsao dos dados de treino para calcular as métricas
yhat_test = modelo_linear_regression.predict(X_test) # previsao dos dados de teste para calcular métricas

print('TREINO - MSE:', mean_squared_error(y_train, yhat_train), 'MAE:', median_absolute_error(y_train, yhat_train),'R2:', r2_score(y_train, yhat_train))
print('TESTE - MSE:', mean_squared_error(y_test, yhat_test), 'MAE:', median_absolute_error(y_test, yhat_test), 'R2:', r2_score(y_train, yhat_train))

TREINO - MSE: 385633606.4158804 MAE: 9124.306291064713 R2: 0.9383835845608273
TESTE - MSE: 5780598120.096325 MAE: 11884.567362083588 R2: 0.9383835845608273


In [27]:
# segundo modelo -  randon forest
yhat_train = modelo_randon_forest.predict(X_train) # previsao dos dados de treino para calcular as métricas
yhat_test = modelo_randon_forest.predict(X_test) # previsao dos dados de teste para calcular métricas

print('TREINO - MSE:', mean_squared_error(y_train, yhat_train), 'MAE:', median_absolute_error(y_train, yhat_train), 'R2:', r2_score(y_train, yhat_train))
print('TESTE - MSE - test', mean_squared_error(y_test, yhat_test), 'MAE - test', median_absolute_error(y_test, yhat_test),'R2 - test',r2_score(y_test, yhat_test))


TREINO - MSE: 174839178.4581644 MAE: 4650.0 R2: 0.972064251466445
TESTE - MSE - test 912354912.361863 MAE - test 13075.0 R2 - test 0.8585250919629958


In [28]:
# terceiro modelo -  gradient boosting
yhat_train = modelo_gradient.predict(X_train) # previsao dos dados de treino para calcular as métricas
yhat_test = modelo_gradient.predict(X_test) # previsao dos dados de teste para calcular métricas

print('TREINO - MSE:', mean_squared_error(y_train, yhat_train), 'MAE:', median_absolute_error(y_train, yhat_train), 'R2:', r2_score(y_train, yhat_train))
print('TESTE - MSE - test', mean_squared_error(y_test, yhat_test), 'MAE - test', median_absolute_error(y_test, yhat_test),'R2 - test',r2_score(y_test, yhat_test))


TREINO - MSE: 186312018.79437384 MAE: 7807.404211096524 R2: 0.970231124672872
TESTE - MSE - test 650747453.5918511 MAE - test 11542.909658968594 R2 - test 0.8990914227513834


### 3.8 Calculando o Cross Value Score

In [29]:
from sklearn.model_selection import cross_val_score

In [30]:
cross_val_score(modelo_linear_regression, X, y, cv=10, )

array([ 0.33443889,  0.27540712,  0.91414828,  0.75200337,  0.89952061,
        0.66115497,  0.88966514,  0.90024345,  0.44333418, -1.02377811])

In [31]:
np.array([ 0.28656114,  0.24762271,  0.9134862 ,  0.75320585,  0.90147817,
        0.66279271,  0.88436995,  0.89679933,  0.41702768, -1.2538947]).mean()

0.470944904

In [32]:
cross_val_score(modelo_randon_forest, X, y, cv=10, )

array([0.84842961, 0.87142813, 0.90404396, 0.74224081, 0.8508516 ,
       0.87308717, 0.85467841, 0.89307141, 0.80089726, 0.86358004])

In [33]:
np.array([0.84893751, 0.87670874, 0.89451226, 0.76524144, 0.88060291,
       0.86309449, 0.87590685, 0.86169704, 0.81385006, 0.80195421]).mean()

0.8482505510000001

In [34]:
cross_val_score(modelo_gradient, X, y, cv=10, )

array([0.90671647, 0.90405049, 0.93384549, 0.81011995, 0.90197989,
       0.89238139, 0.87373673, 0.90519641, 0.88639343, 0.88043222])

In [35]:
np.array([0.90667346, 0.90807287, 0.93331885, 0.80130777, 0.89916021,
       0.88701293, 0.88773499, 0.90605457, 0.88907579, 0.87933776]).mean()

0.8897749200000001

# 4 - Treinando o Modelo Escolhido - Gradiente Boosting

In [36]:
X = df[numericas].drop(['SalePrice'], axis = 1)
Y = df['SalePrice']

modelo_final_gradient = GradientBoostingRegressor(max_depth=3).fit(X,Y)

print(modelo_gradient.score(X_train, y_train))
print(modelo_gradient.score(X_test, y_test))

0.970231124672872
0.8990914227513834


# 5 Rodar a Previsão de SalePrice

In [37]:
df_new = pd.read_csv('./test.csv')

In [38]:
df_new.shape

(1459, 80)

In [39]:
df_new['IdealCondition'] = np.where((df_new['Street']=="Pave")&(df_new['Utilities']=="AllPub"),1,0)
df_new['IdealCondition1'] = np.where((df_new['OverallCond']==9)&(df_new['ExterCond']=="Ex" )&(df_new['ExterQual']=="Ex"),1,0)
df_new['IdealCondition2'] = np.where((df_new['LotArea']>=13000),1,0)
df_new['IdealCondition3'] = np.where((df_new['TotRmsAbvGrd']>=10),1,0)
df_new['IdealCondition4'] = np.where((df_new['Fireplaces']>=2),1,0)

In [40]:
df_new = df_new.fillna(df_new.mean())#Tratando os nulos

In [41]:
df_new = df_new.fillna('ni') #tratando os nan

In [42]:
df_new = pd.get_dummies(df_new, drop_first=True) #criando dummies para as categóricas

In [43]:
df_new.shape

(1459, 255)

In [44]:
df_numeric_new = df_new.select_dtypes(include=[np.number]) 
numericas_new = list(df_numeric_new)

In [45]:
df_numeric_new.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleType_ni,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,0,0,0,1,0,0,0,0,1,0
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,0,0,0,1,0,0,0,0,1,0
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,0,0,0,1,0,0,0,0,1,0
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,0,0,0,1,0,0,0,0,1,0
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,0,0,0,1,0,0,0,0,1,0


In [46]:
df_new['Utilities_NoSeWa']='0'
df_new['Condition2_RRAe']='0'
df_new['Condition2_RRAn']='0'
df_new['Condition2_RRNn']='0'
df_new['HouseStyle_2.5Fin']='0'
df_new['RoofMatl_CompShg']='0'
df_new['RoofMatl_Membran']='0'
df_new['RoofMatl_Metal']='0'
df_new['RoofMatl_Roll']='0'
df_new['Exterior1st_ImStucc']='0'
df_new['Exterior1st_Stone']='0'
df_new['Exterior2nd_Other']='0'
df_new['Heating_GasA']='0'
df_new['Heating_OthW']='0'
df_new['Electrical_Mix']='0'
df_new['Electrical_ni']='0'
df_new['GarageQual_Fa']='0'
df_new['PoolQC_Fa']='0'
df_new['MiscFeature_TenC']='0'

In [47]:
X_Real = df_new[list(X)] #determinando o X

In [48]:
#Y_real = df_new['SalePrice'] #determinando o Y

In [49]:
yhat_real = modelo_final_gradient.predict(X_Real) #

In [50]:
yhat_real

array([121642.42639582, 164823.42632898, 180398.57722394, ...,
       164680.73270212, 122386.29131883, 245575.27527525])

In [51]:
df_new['SalePrice_New'] = yhat_real
df_new.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,Exterior1st_Stone,Exterior2nd_Other,Heating_GasA,Heating_OthW,Electrical_Mix,Electrical_ni,GarageQual_Fa,PoolQC_Fa,MiscFeature_TenC,SalePrice_New
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,0,0,0,0,0,0,0,0,0,121642.426396
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,0,0,0,0,0,0,0,0,0,164823.426329
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,0,0,0,0,0,0,0,0,0,180398.577224
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,0,0,0,0,0,0,0,0,0,182436.767554
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,0,0,0,0,0,0,0,0,0,188082.403848


In [52]:
df_resultado = pd.DataFrame(df_new, columns=['Id','SalePrice_New'])

In [53]:
df_resultado.head()

Unnamed: 0,Id,SalePrice_New
0,1461,121642.426396
1,1462,164823.426329
2,1463,180398.577224
3,1464,182436.767554
4,1465,188082.403848


In [54]:
df_resultado.to_csv(r'./sample_submission.csv', index=False)

In [55]:
df_resultado.shape

(1459, 2)

In [56]:
df_resultado.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 2 columns):
Id               1459 non-null int64
SalePrice_New    1459 non-null float64
dtypes: float64(1), int64(1)
memory usage: 22.9 KB
