# Projet Kaggle: House Price par ABBOUDI Mohammed Amine

In [None]:
#Librairies a importer

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import scipy.stats as stats
from scipy.stats import skew,norm
import missingno as msno

from sklearn.preprocessing import LabelEncoder, Imputer
from sklearn.metrics import mean_squared_error
import xgboost
from collections import OrderedDict

from sklearn.linear_model import LinearRegression, BayesianRidge, ElasticNet, Lasso, SGDRegressor, Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import ExtraTreesRegressor,GradientBoostingRegressor,RandomForestRegressor
from sklearn.svm import LinearSVR,SVR
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
import lightgbm as lgb



import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the Kaggle training and test sets; the training frame is kept without its 'Id' column
train = pd.read_csv('train.csv').drop(columns='Id')
test = pd.read_csv('test.csv')

In [None]:
# Report the dimensions of both frames (the format string no longer ends with a stray ')')
print("train: {} \ntest: {}".format(train.shape,test.shape))

Les volumes de donnees de train et de test sont presque equivalents.

In [None]:
# Summary statistics of train, with the column-display limit lifted for this cell only
with pd.option_context('display.max_columns', None):
    display(train.describe())

On remarque la presence de quelques outliers ; il faudra tracer les variables suspectes afin de conclure.

# Traitement des Outliers

In [None]:
# Scatter each surface variable against SalePrice to spot outliers visually.
# A single loop replaces three copy-pasted figure/scatter/title triplets, and
# every figure now carries axis labels so it can be read on its own.
for surface_col, point_color in [('1stFlrSF', 'Black'),
                                 ('TotalBsmtSF', 'red'),
                                 ('GrLivArea', 'green')]:
    fig, ax = plt.subplots(figsize=(7, 5))
    ax.scatter(x=train[surface_col], y=train['SalePrice'], c=point_color)
    ax.set_title(surface_col, size=15)
    ax.set_xlabel(surface_col)
    ax.set_ylabel('SalePrice')

In [None]:
# Remove the outliers: a row is dropped when it matches any of the three rules below
is_outlier = (
    ((train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000))
    | (train['TotalBsmtSF'] > 5000)
    | (train['1stFlrSF'] > 4000)
)
train.drop(index=train.index[is_outlier], inplace=True)
# Size after removal
train.shape

On a elimine 2 observations seulement ; cela peut etre du au fait que plusieurs variables partagent les memes points aberrants.

# Correlation des variables

In [None]:
# Pairwise correlations between the numeric columns only. Selecting them
# explicitly keeps the call valid on pandas versions where DataFrame.corr
# no longer skips non-numeric columns on its own.
corr = train.select_dtypes(include=[np.number]).corr()
fig, ax = plt.subplots(figsize=(20,20))
# Draw on the axes created above so the requested figure size is the one used
sns.heatmap(corr,
        xticklabels=corr.columns,
        yticklabels=corr.columns,
        cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
        ax=ax)

In [None]:
# Heatmap of the most strongly correlated variables (0.75 is the chosen threshold,
# also used as the upper bound of the colour scale)
strong_corr_cols = [
    'SalePrice', 'GrLivArea', 'TotalBsmtSF', 'OverallQual', 'FullBath',
    'TotRmsAbvGrd', 'YearBuilt', '1stFlrSF', 'GarageYrBlt', 'GarageCars',
    'GarageArea',
]
zoomCorr = corr.loc[strong_corr_cols, strong_corr_cols]
f , ax = plt.subplots(figsize = (14,12))
plt.title('Correlation des variables numeriques',size=15)
sns.heatmap(
    zoomCorr,
    square=True,
    linewidths=0.01,
    vmax=0.75,
    annot=True,
    cmap='viridis',
    linecolor="white",
    annot_kws={'size': 12},
)

    -GarageCars et GarageArea sont tres correlees (0.89)
    -SalePrice avec OverallQual (0.8)
    -TotalBsmtSF avec 1stFlrSF d'ou l'idee de creer une nouvelle variable qui combine toutes les surfaces.

In [None]:
# Drop GarageCars from both frames (it is strongly correlated with GarageArea, which is kept).
# errors='ignore' lets the cell be re-run once the column is already gone.
train.drop(columns='GarageCars', inplace=True, errors='ignore')
test.drop(columns='GarageCars', inplace=True, errors='ignore')
# The format string no longer ends with a stray ')'
print("train: {} \ntest: {}".format(train.shape,test.shape))

In [None]:
# Regression target, taken from the training frame before it is merged with test
y = train.SalePrice

# Traitement des valeurs manquantes

On commence par visualiser les colonnes numeriques avec des valeurs manquantes a l'aide de la librairie missingno , Merci MAJDOUBI!

In [None]:
# Stack train and test into one feature frame (fresh 0..n-1 index),
# without the identifier and the target columns
df = (
    pd.concat([train, test], ignore_index=True)
    .drop(columns=['Id', 'SalePrice'])
)

###### Variables numeriques

In [None]:
# Missing-value matrix on 200 sampled rows of the numeric columns.
# A fixed random_state makes the figure identical on every run.
msno.matrix(df.select_dtypes(include=[np.number]).sample(200, random_state=42))

In [None]:
# Same missing-value matrix for the test set alone (seeded sample for reproducibility)
msno.matrix(test.select_dtypes(include=[np.number]).sample(200, random_state=42))

Les memes trois variables du testing et training sets ont des valeurs manquantes.

In [None]:
# Non-null counts and dtypes of the numeric columns of the combined frame
df.select_dtypes(include=[np.number]).info()

In [None]:
# Columns are re-assigned instead of calling fillna(inplace=True) on a column
# selection: the in-place form acts on an intermediate object and is not
# guaranteed to write back into df on recent pandas versions.

# MasVnrArea: replace missing values by the column mean
df['MasVnrArea'] = df['MasVnrArea'].fillna(df['MasVnrArea'].mean())
# YearBuilt and GarageYrBlt are strongly correlated, so a missing garage year
# takes the build year of the same row
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['YearBuilt'])

# Basement and garage measurements: missing values become 0
zero_fill_cols = ['BsmtFullBath','BsmtHalfBath','BsmtUnfSF','TotalBsmtSF','BsmtFinSF2','BsmtFinSF1','GarageArea']
df[zero_fill_cols] = df[zero_fill_cols].fillna(0)
# LotFrontage: median of the rows sharing the same Neighborhood
df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(
    lambda x: x.fillna(x.median()))

###### Variables Categorielles

In [None]:
# Missing-value matrix of the object (categorical) columns on a seeded 200-row sample.
# The builtin `object` is used because the `np.object` alias no longer exists in recent NumPy.
msno.matrix(df.select_dtypes(include=[object]).sample(200, random_state=42))

In [None]:
# Same matrix for the test set alone (builtin `object` dtype, seeded sample)
msno.matrix(test.select_dtypes(include=[object]).sample(200, random_state=42))

Pour une grande partie des variables categorielles, une valeur manquante signifie que la maison ne possede pas cette propriete, comme pour PoolQC.

In [None]:
# Non-null counts of the object columns; builtin `object` replaces the removed `np.object` alias
df.select_dtypes(include=[object]).info()

In [None]:
# For these columns a missing value is replaced by the explicit category 'None'.
# The columns are re-assigned rather than filled in place on a column selection,
# which is not guaranteed to write back into df on recent pandas versions.
none_fill_cols = ['PoolQC','MiscFeature','Alley','Fence','FireplaceQu','GarageQual','GarageCond','GarageFinish','GarageType','BsmtExposure','BsmtCond','BsmtQual','BsmtFinType2','BsmtFinType1','MasVnrType']
df[none_fill_cols] = df[none_fill_cols].fillna('None')

Pour les autres, on remplace par la valeur la plus frequente, donnee par la fonction mode.

In [None]:
# Remaining categorical gaps: fill with the most frequent value of each column.
# Assignment is used instead of fillna(inplace=True) on a column selection so
# the result is written back into df on every pandas version.
for col in ['MSZoning','Functional','Utilities','KitchenQual','SaleType','Exterior2nd','Exterior1st','Electrical']:
    df[col] = df[col].fillna(df[col].mode()[0])

Voila, le dataset n'a plus de valeurs manquantes.

## Processing de Variables

Une grande partie des variables numeriques, telles que YearBuilt ou MSSubClass, n'ont pas de raison d'etre percues par le modele comme numeriques ; il faut donc y remedier en les rendant de type object.

In [None]:
# Date-like and class-code columns are turned into strings so they are treated as categories
cols = ['YrSold','YearRemodAdd','YearBuilt','MoSold','MSSubClass','GarageYrBlt']
df[cols] = df[cols].astype(str)

D'un autre cote, plusieurs variables categorielles ont un sens hierarchique : un rating Excellent n'est pas la meme chose qu'un rating Poor ; il est donc important d'en tenir compte.

In [None]:
# Ordinal encodings: each quality/finish label is mapped to an increasing integer
# and stored in a new '<column>_' variable; the original column is left untouched.
df["ExterQual_"] = df.ExterQual.map({'Fa':1, 'TA':2, 'Gd':3, 'Ex':4})
df["BsmtQual_"] = df.BsmtQual.map({'None':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5})
# Exposure levels are ordered None < No < Mn (minimum) < Av (average) < Gd (good);
# 'Mn' and 'Av' each get their own rank instead of sharing one.
df["BsmtExposure_"] = df.BsmtExposure.map({'None':1, 'No':2, 'Mn':3, 'Av':4, 'Gd':5})
df["HeatingQC_"] = df.HeatingQC.map({'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5})
df["KitchenQual_"] = df.KitchenQual.map({'Fa':1, 'TA':2, 'Gd':3, 'Ex':4})
df["FireplaceQu_"] = df.FireplaceQu.map({'None':1, 'Po':2, 'Fa':3, 'TA':4, 'Gd':5, 'Ex':6})
df["GarageFinish_"] = df.GarageFinish.map({'None':1, 'Unf':2, 'RFn':3, 'Fin':4})
df["PavedDrive_"] = df.PavedDrive.map({'N':1, 'P':2, 'Y':3})
# NOTE(review): a label missing from a mapping becomes NaN - confirm no column holds other labels.

###### Ajout de variables

In [None]:
# Total surface: first floor + second floor + basement
df['TotalSF'] = df['1stFlrSF'].add(df['2ndFlrSF']).add(df['TotalBsmtSF'])

# Distribution de Variables

In [None]:
# Probability plot of SalePrice, drawn through pyplot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()
# The distribution is skewed on both sides, slightly more on the right.

In [None]:
# Histogram of SalePrice with a fitted normal curve; the skew is easier to see here.
# `norm` comes from the notebook's import cell, so it is not re-imported in this cell.
sns.distplot(train.SalePrice,fit=norm)

In [None]:
# Skewness of every non-object column (NaN ignored), most right-skewed first
skewness = pd.DataFrame({'Skew' :df[df.dtypes[df.dtypes != "object"].index].apply(lambda x : skew (x.dropna())).sort_values(ascending=False)})
# Keep the rows whose absolute skewness exceeds 1. The mask is built on the
# 'Skew' column so that rows are filtered; masking with the whole frame only
# blanks cells to NaN and keeps every row, which inflates the count below.
skewness = skewness[skewness['Skew'].abs() > 1]
print ("{} variables necessitent une transformation.".format(skewness.shape[0]))

In [None]:
# SalePrice histogram with three candidate fitted distributions, one figure each.
# The loop replaces three copy-pasted figure/title/distplot statements.
for fig_number, (fit_title, fit_dist) in enumerate(
        [('Johnson', stats.johnsonsu),
         ('Normale', stats.norm),
         ('Log-Normale', stats.lognorm)],
        start=1):
    plt.figure(fig_number)
    plt.title(fit_title)
    sns.distplot(y, kde=False, fit=fit_dist)

# I could not find the fit for a Box-Cox transformation, so I will use
# the Normal transformation
# NOTE(review): the following cell applies np.log to the target - confirm that
# this is the transformation meant here.


In [None]:
# Log-transform the target, then apply log1p to every numeric feature whose
# absolute skewness is at least 1
y_trans = np.log(y)
skewness = df.select_dtypes(include=[np.number]).apply(skew)
skewness_features = skewness.index[skewness.abs() >= 1]
df[skewness_features] = np.log1p(df[skewness_features])

In [None]:
# Scaling of the numeric variables with RobustScaler.
# df_scaled is a real copy: a plain `df_scaled = df` only creates a second name
# for the same frame, so scaling would silently overwrite df as well.
df_scaled = df.copy()
cols = ['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'BedroomAbvGr', 'BsmtFinSF1',
        'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF', 'EnclosedPorch', 
        'Fireplaces', 'FullBath', 'GarageArea', 'GrLivArea',
        'HalfBath', 'KitchenAbvGr', 'LotArea', 'LotFrontage', 'LowQualFinSF',
        'MasVnrArea', 'MiscVal', 'OpenPorchSF', 'OverallCond', 'OverallQual',
        'PoolArea', 'ScreenPorch', 'TotRmsAbvGrd', 'TotalBsmtSF', 'WoodDeckSF','TotalSF']
from sklearn.preprocessing import RobustScaler
robust_scaler = RobustScaler()
# The scaler is fitted on the combined train+test frame and applied to the same columns
df_scaled[cols] = robust_scaler.fit_transform(df[cols])

# Encodage des variables categorielles

In [None]:
# Integer-encode the date-like columns (cast to str earlier); the same encoder
# object is re-fitted on each column in turn.
# NOTE(review): the values are strings, so the codes presumably follow string
# order (e.g. '10' before '2' for MoSold) - confirm this is acceptable.
labelencoder = LabelEncoder()
for date_col in ['YrSold', 'YearRemodAdd', 'YearBuilt', 'MoSold', 'GarageYrBlt']:
    df_scaled[date_col] = labelencoder.fit_transform(df_scaled[date_col])

In [None]:
# One-hot encode the remaining categorical columns of the combined frame
df_scaled = pd.get_dummies(df_scaled)

In [None]:
# The combined frame was built as train followed by test, so the first
# len(train) rows are the training matrix and the rest is the test matrix
n_train_rows = train.shape[0]
X_train = df_scaled.iloc[:n_train_rows]
X_test = df_scaled.iloc[n_train_rows:]

print("train: {} \ntest: {} \ny: {}".format(X_train.shape, X_test.shape, y_trans.shape))
# Sanity check: True would mean the test matrix still holds NaN values
np.isnan(X_test.values).any()

# Extraction des Variables les plus importantes

In [None]:
# XGBoost regressor used only to detect the most important variables
importance_params = dict(
    colsample_bytree=0.4,
    gamma=0,
    learning_rate=0.07,
    max_depth=3,
    min_child_weight=1.5,
    n_estimators=10000,
    reg_alpha=0.75,
    reg_lambda=0.45,
    subsample=0.6,
    seed=42,
)
model = xgboost.XGBRegressor(**importance_params)

In [None]:
# Fit the XGBoost regressor on the whole training matrix against the log-transformed target
model.fit(X_train,y_trans)

In [None]:
# Feature scores of the fitted booster, highest first. The booster is reached
# through the public get_booster() accessor rather than the private _Booster attribute.
OrderedDict(sorted(model.get_booster().get_fscore().items(), key=lambda t: t[1], reverse=True))

In [None]:
# Keep the features whose score is at least 10, then restrict both matrices to them.
# The booster is read through the public get_booster() accessor, and a list
# comprehension replaces the throw-away intermediate dict.
best_vars = [feature for feature, fscore in model.get_booster().get_fscore().items() if fscore >= 10]
print(best_vars)
X_train = X_train[best_vars]
X_test = X_test[best_vars]

# Entrainement de Modèles

In [None]:
def RMSLE (y, y_pred):
    """Return the square root of the mean squared error between y and y_pred.

    The notebook calls it on log-transformed targets, hence the RMSLE name.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

# Cross-validation helper

def RMSLE_CV(model):
    """Return the per-fold RMSE of `model` under shuffled 5-fold cross-validation.

    The model is evaluated on the notebook globals X_train / y_trans with the
    'neg_mean_squared_error' scorer; the sign is flipped before the square root.

    The KFold splitter itself is handed to cross_val_score. Calling
    .get_n_splits() on it would only yield the integer 5, so the shuffling and
    the fixed random_state would be discarded.
    """
    kf = KFold(5, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X_train.values, y_trans,
                                    scoring="neg_mean_squared_error", cv=kf))
    return(rmse)

In [None]:

# Candidate regressors compared by cross-validation, listed in the same order as `names`
models = [
             SVR(),
             # XGBRegressor receives XGBoost parameters only: scikit-learn
             # GradientBoostingRegressor arguments (loss, min_samples_split,
             # min_samples_leaf, max_features) are not part of its API and
             # are therefore not passed.
             xgboost.XGBRegressor(learning_rate=0.05, n_estimators=3000, max_depth=4, random_state=5),
             GradientBoostingRegressor(),
             RandomForestRegressor(),
             Lasso(alpha=0.01,max_iter=10000),
             Ridge(),
             BayesianRidge(),
             lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11),
             ElasticNet(),
             ElasticNet(alpha = 0.001,max_iter=10000),  
             ]

# Display names, index-aligned with `models`
names = ['Support vector regression','XGBoost','Gradient boosting','Random Forest','Custom Lasso','Ridge','Bayesian Ridge','LightGBM','Elastic Net Regularization','Elastic Net Regularization Custom']



In [None]:
from sklearn.model_selection import KFold,cross_val_score
warnings.filterwarnings('ignore')

# Evaluate every candidate with the cross-validation helper.
# The loop variable is named `estimator` so that the global `model` (the XGBoost
# regressor used for feature selection) is not overwritten by this cell.
for estimator, name in zip(models, names):
    # Per-fold root mean squared error
    score = RMSLE_CV(estimator)
    # '{:.4f}' for both numbers: '{:4f}' only sets a minimum width of 4
    print("- {} : moyenne : {:.4f}, ecart-type : {:.4f}".format(name, score.mean(),score.std()))

In [None]:
from sklearn.model_selection import GridSearchCV

class gridSearch():
    """Run a 5-fold grid search for one estimator and print the results as RMSE."""

    def __init__(self,model):
        # Estimator whose hyper-parameters will be searched
        self.model = model

    def grid_get(self,param_grid):
        """Fit GridSearchCV on the notebook globals X_train / y_trans and print a report.

        The mean test score (negative MSE) is converted to RMSE before printing;
        'std_test_score' is printed as produced by the search, without conversion.
        Nothing is returned.
        """
        searcher = GridSearchCV(self.model, param_grid, cv=5, scoring='neg_mean_squared_error')
        searcher.fit(X_train, y_trans)
        results = searcher.cv_results_
        # Overwrites the stored scores with their RMSE equivalent
        results['mean_test_score'] = np.sqrt(-results['mean_test_score'])
        report = pd.DataFrame(results)[['params','mean_test_score','std_test_score']]
        print(report)
        best_rmse = np.sqrt(-searcher.best_score_)
        print('\nBest parameters : {}, best score : {}'.format(searcher.best_params_, best_rmse))

In [None]:
# ElasticNet: search over alpha and l1_ratio with a fixed iteration budget
elastic_net_grid = {
    'alpha': [0.006, 0.0065, 0.007, 0.0075, 0.008],
    'l1_ratio': [0.070, 0.075, 0.080, 0.085, 0.09, 0.095],
    'max_iter': [10000],
}
gridSearch(ElasticNet()).grid_get(elastic_net_grid)

In [None]:
# BayesianRidge: search over alpha_1 only
# NOTE(review): newer scikit-learn releases name the iteration argument
# differently - confirm 'n_iter' against the installed version.
bayesian_ridge_grid = {
    'alpha_1': [0.01, 0.001, 0.0001, 0.0002, 0.0003, 0.0004,
                0.0005, 0.0006, 0.0007, 0.0008, 0.0009],
    'n_iter': [100000],
}
gridSearch(BayesianRidge()).grid_get(bayesian_ridge_grid)

In [None]:
# SVR with an RBF kernel: search over C, gamma and epsilon
svr_grid = {
    'C': [13, 15, 17, 19, 21],
    'kernel': ['rbf'],
    'gamma': [0.0005, 0.001, 0.002, 0.01],
    'epsilon': [0.01, 0.02, 0.03, 0.1],
}
gridSearch(SVR()).grid_get(svr_grid)

In [None]:
# Lasso: search over the regularisation strength alpha
lasso_grid = {
    'alpha': [0.01, 0.001, 0.0001, 0.0002, 0.0003, 0.0004,
              0.0005, 0.0006, 0.0007, 0.0008, 0.0009],
    'max_iter': [10000],
}
gridSearch(Lasso()).grid_get(lasso_grid)

In [None]:
# GradientBoostingRegressor: 5 learning rates x 6 ensemble sizes x 4 losses,
# i.e. 120 combinations, each fitted on 5 folds
# NOTE(review): newer scikit-learn releases use different names for the 'ls' and
# 'lad' losses - confirm against the installed version.
gradient_boosting_grid = {
    'learning_rate': [0.05, 0.1, 0.15, 0.025, 0.012],
    'n_estimators': [1000, 2000, 3000, 4000, 5000, 6000],
    'loss': ['ls', 'lad', 'huber', 'quantile'],
}
gridSearch(GradientBoostingRegressor()).grid_get(gradient_boosting_grid)

On choisit les meilleurs parametres de tous ces modeles, et definit les versions a utiliser.

In [None]:
# Final configuration of each model, used by the ensemble
elsnt = ElasticNet(alpha=0.006, l1_ratio=0.07, max_iter=100000)
bayes = BayesianRidge(alpha_1=0.0001, n_iter=100000)
svr = SVR(C=13, epsilon=0.03, gamma=0.001, kernel='rbf')
lasso = Lasso(alpha=0.0005, max_iter=100000)
GBoost = GradientBoostingRegressor(
    loss='huber',
    learning_rate=0.05,
    n_estimators=3000,
    min_samples_split=10,
    min_samples_leaf=15,
    max_depth=4,
    random_state=5,
    max_features='sqrt',
)
lgbm = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)

In [None]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor predicting the unweighted mean of its base models' predictions."""

    def __init__(self, models):
        # Unfitted base estimators; they are cloned in fit() and never fitted themselves
        self.models = models

    def fit(self, X, y):
        """Clone every base model, fit each clone on (X, y) and return self."""
        self.models_ = list(map(clone, self.models))
        for estimator in self.models_:
            estimator.fit(X, y)
        return self

    def predict(self, X):
        """Return, for each row of X, the mean of the fitted clones' predictions."""
        per_model = [estimator.predict(X) for estimator in self.models_]
        # One column per model, averaged across columns
        return np.column_stack(per_model).mean(axis=1)
    

# Ensemble of four of the tuned models, evaluated with the cross-validation helper
model_final = AveragingModels(models = (elsnt, bayes, lasso, GBoost))

score = RMSLE_CV(model_final)
# Both values passed to format() are shown: with a single placeholder the
# standard deviation is silently dropped.
print(" La moyenne des modeles est: {:.4f} (ecart-type : {:.4f})".format(score.mean(), score.std()))

In [None]:
# Fit the ensemble on the whole training matrix and score it on the same rows (in-sample error)
y_train_pred = model_final.fit(X_train.values, y_trans).predict(X_train.values)
print("Score du modele sur le train set:")
print(RMSLE(y_trans, y_train_pred))

In [None]:
# Build the submission file.
# - The identifier column is named 'Id', exactly as in test.csv.
# - Ids are taken from the `test` frame already in memory rather than re-reading the CSV.
# - Predictions are on the log scale of the target, so np.exp maps them back to prices.
submission = pd.DataFrame({
    'Id': test['Id'].values,
    'SalePrice': np.exp(model_final.predict(X_test.values)),
})
submission.to_csv('submission.csv', index = False)

submission.head()