# Introduction
**This will be your workspace for Kaggle's Machine Learning education track.**

You will build and continually improve a model to predict housing prices as you work through each tutorial.  Fork this notebook and write your code in it.

The data from the tutorial, the Melbourne data, is not available in this workspace.  You will need to translate the concepts to work with the data in this notebook, the Iowa data.

Come to the [Learn Discussion](https://www.kaggle.com/learn-forum) forum for any questions or comments. 

# Write Your Code Below



## Load Libraries and Dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the train and test splits from the notebook's input directory.
train = pd.read_csv("../input/train.csv",sep=',')
test = pd.read_csv("../input/test.csv")

## EDA

In [None]:
# Preview the first ten rows of the training frame.
train.head(10)

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Summary statistics of the SalePrice column (used as the label further down).
train['SalePrice'].describe()

In [None]:
# Histogram of SalePrice. The figure has to be created BEFORE plotting;
# creating it afterwards only produces an extra, empty 10x10 figure.
plt.figure(figsize=(10,10))
sns.distplot(train['SalePrice'],kde=False)

In [None]:
# Scatter of lot area against sale price.
# `data=` is only meaningful when x/y are given as column names; the Series are
# passed directly here, so it is dropped. Axis labels make the figure self-explanatory.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(train['LotArea'], train['SalePrice'])
ax.set(xlabel='LotArea', ylabel='SalePrice', title='SalePrice vs. LotArea');

In [None]:
# Scatter of total basement area against sale price.
# `data=` is only meaningful when x/y are given as column names; the Series are
# passed directly here, so it is dropped. Axis labels make the figure self-explanatory.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(train['TotalBsmtSF'], train['SalePrice'])
ax.set(xlabel='TotalBsmtSF', ylabel='SalePrice', title='SalePrice vs. TotalBsmtSF');

In [None]:
# Box plot of SalePrice for each OverallQual rating, drawn on an explicit axes.
quality_fig, quality_ax = plt.subplots(figsize=(10, 10))
sns.boxplot(x=train['OverallQual'], y=train['SalePrice'], ax=quality_ax)

In [None]:
# SalePrice distribution per construction year.
data = pd.concat([train['SalePrice'], train['YearBuilt']], axis=1)
f, ax = plt.subplots(figsize=(14, 9))
# sns.boxplot draws on the axes created above; `fig` holds its return value.
fig = sns.boxplot(x=train['YearBuilt'], y='SalePrice', data=train, ax=ax)
fig.set_ylim(0, 800000)
plt.xticks(rotation=45);

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# (rows, columns) of the train and test frames.
train.shape , test.shape

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Drop the Id column from the training frame.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because 'Id' is already gone.
train = train.drop(columns=['Id'], errors='ignore')
train.head()

In [None]:
# Drop the Id column from the test frame.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because 'Id' is already gone.
test = test.drop(columns=['Id'], errors='ignore')
test.head()

In [None]:
# Distribution of the raw SalePrice values.
sns.set_style("white")
sns.set_palette("husl")
f, ax = plt.subplots(figsize=(10, 8))
sns.distplot(train['SalePrice'], color="b", ax=ax)
ax.xaxis.grid(False)
ax.set(ylabel='Frequency', xlabel='SalePrice', title='SalePrice Distribution')
sns.despine(trim=True, left=True)
plt.show()

In [None]:
# Log-transform SalePrice with log1p; the final predictions are mapped back
# with expm1 before being written to the submission.
# NOTE: not idempotent — re-running this cell applies the transform a second time.
train['SalePrice'] = np.log1p(train['SalePrice'])

In [None]:
#import statistics libraries
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

In [None]:
# Distribution of the log-transformed SalePrice with a fitted normal curve.
sns.set_style("white")
sns.set_palette("deep")
f, ax = plt.subplots(figsize=(10,8))
sns.distplot(train['SalePrice'],fit=norm, color="b");

# Parameters of the normal distribution fitted to the transformed target.
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu,sigma))
# Raw string: "\m" and "\s" are invalid escape sequences in a normal string
# literal (a warning today, an error in future Python versions). The rendered
# legend text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f})'.format(mu,sigma)], loc='best')
ax.xaxis.grid(False)
ax.set(ylabel='Frequency')
ax.set(xlabel='SalePrice')
ax.set(title='SalePrice Distribution')
sns.despine(trim=True,left=True)
plt.show()


In [None]:
# Remove outliers.
# SalePrice has already been log1p-transformed at this point, so the price
# thresholds must be expressed on the same scale. Comparing the log price with
# raw numbers (> 200000, < 3) can never match, which silently removed nothing.
# NOTE(review): the second threshold is assumed to be 300000 on the original
# dollar scale — confirm the intended value.
low_quality_but_expensive = (train['OverallQual'] < 5) & (train['SalePrice'] > np.log1p(200000))
large_but_cheap = (train['GrLivArea'] > 4500) & (train['SalePrice'] < np.log1p(300000))
train = train[~(low_quality_but_expensive | large_but_cheap)].reset_index(drop=True)

In [None]:
# Separate the label column from the predictors.
train_labels = train['SalePrice'].reset_index(drop=True)
train_features = train.drop(columns=['SalePrice'])
test_features = test

# Stack train and test predictors so every feature transformation below is
# applied to both sets at once.
all_features = pd.concat([train_features, test_features], ignore_index=True)
all_features.shape

In [None]:
def percent_missing(df):
    """Map each column name to the percentage of missing values in that column.

    ``df`` is anything accepted by ``pd.DataFrame``. Percentages are rounded to
    two decimals and keys follow the column order.
    """
    frame = pd.DataFrame(df)
    return {
        column: round(frame[column].isnull().mean() * 100, 2)
        for column in frame.columns
    }


# Percentage of missing values per column, sorted in descending order;
# the ten columns with the most missing data are displayed.
missing = percent_missing(all_features)
df_miss = sorted(missing.items(), key = lambda x: x[1], reverse = True)
print('Percent of missing values')
df_miss[0:10]

In [None]:
# These columns are stored as numbers; cast them to strings so they are
# handled as non-numeric predictors from here on.
all_features['MSSubClass'] = all_features['MSSubClass'].apply(str)
for sale_date_col in ('YrSold', 'MoSold'):
    all_features[sale_date_col] = all_features[sale_date_col].astype(str)

In [None]:
def _fill_with_mode(series, fallback=None):
    """Fill missing values of ``series`` with its most frequent value.

    If the series has no non-missing value, ``fallback`` is used instead; if
    that is ``None`` too, the series is returned unchanged.
    """
    modes = series.mode()
    fill_value = modes.iloc[0] if len(modes) else fallback
    if fill_value is None:
        return series
    return series.fillna(fill_value)


def handle_missing(features):
    """Impute missing values in ``features`` in place and return the same frame.

    Order of operations:
      1. fixed defaults for Functional / Electrical / KitchenQual;
      2. column mode for Exterior1st / Exterior2nd / SaleType;
      3. MSZoning filled with the mode of its MSSubClass group (falling back to
         the overall MSZoning mode when a group has no known value, instead of
         raising on an empty mode);
      4. "None" / 0 for the pool and garage columns;
      5. LotFrontage filled with the median of its Neighborhood group;
      6. any remaining object column -> 'None', any remaining numeric column -> 0.
    """
    for col, default in (('Functional', 'Typ'), ('Electrical', 'SBrkr'), ('KitchenQual', 'TA')):
        features[col] = features[col].fillna(default)

    for col in ('Exterior1st', 'Exterior2nd', 'SaleType'):
        features[col] = _fill_with_mode(features[col])

    # Overall mode is the fallback for MSSubClass groups whose MSZoning is entirely missing.
    overall_zoning = features['MSZoning'].mode()
    zoning_fallback = overall_zoning.iloc[0] if len(overall_zoning) else None
    features['MSZoning'] = features.groupby('MSSubClass')['MSZoning'].transform(
        lambda zoning: _fill_with_mode(zoning, zoning_fallback))

    features['PoolQC'] = features['PoolQC'].fillna("None")
    for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
        features[col] = features[col].fillna(0)
    for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
        features[col] = features[col].fillna('None')

    features['LotFrontage'] = features.groupby('Neighborhood')['LotFrontage'].transform(
        lambda frontage: frontage.fillna(frontage.median()))

    # Catch-all: whatever is still missing becomes 'None' (object columns) or 0 (numeric columns).
    for col in features.columns:
        if features[col].dtype == object:
            features[col] = features[col].fillna('None')

    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    for col in features.columns:
        if features[col].dtype in numeric_dtypes:
            features[col] = features[col].fillna(0)
    return features

# Impute the combined train+test feature frame.
all_features = handle_missing(all_features)
    

In [None]:
# Re-compute the missing-value percentages after imputation and display the
# ten highest.
missing = percent_missing(all_features)
df_miss = sorted(missing.items(), key=lambda x: x[1], reverse=True)
print('Percent of missing data')
df_miss[0:10]

In [None]:
# Names of all columns whose dtype is one of the listed integer/float types.
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric = [
    column for column in all_features.columns
    if all_features[column].dtype in numeric_dtypes
]

In [None]:
# Horizontal box plots of every numeric feature on a logarithmic x-axis.
sns.set_style('white')
f, ax = plt.subplots(figsize=(10, 8))
ax.set_xscale("log")
ax = sns.boxplot(data=all_features[numeric], orient='h', palette='Set1', ax=ax)
ax.xaxis.grid(False)
ax.set(ylabel="Feature Names", xlabel="Numeric Values", title="Numeric Distribution of Features")
sns.despine(trim=True, left=True)
plt.show()

In [None]:
# Skewness of every numeric feature, highest first; those above 0.5 are
# selected for normalisation.
skew_features = all_features[numeric].apply(skew).sort_values(ascending=False)
high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index

print('There are {}  numerical features with Skew > 0.5:'.format(len(high_skew)))
skewness = pd.DataFrame({'Skew': high_skew})
skew_features.head(10)

In [None]:
# Normalize the highly skewed features with a Box-Cox transform of (1 + x).
# The lambda for each column is estimated by boxcox_normmax on the shifted
# values (x + 1).
# NOTE: not idempotent — re-running this cell transforms the columns again.
for i in skew_index:
    all_features[i] = boxcox1p(all_features[i], boxcox_normmax(all_features[i] + 1))    

In [None]:
# Horizontal box plots of the transformed (previously skewed) features on a
# logarithmic x-axis.
sns.set_style('white')
f, ax = plt.subplots(figsize=(10, 8))
ax.set_xscale("log")
ax = sns.boxplot(data=all_features[skew_index], orient='h', palette='Set1', ax=ax)
ax.xaxis.grid(False)
ax.set(ylabel="Feature Names", xlabel="Numeric Values", title="Numeric Distribution of Features")
sns.despine(trim=True, left=True)
plt.show()

## Feature Engineering 

In [None]:
# 1 when the first basement finish type is 'Unf'.
all_features['BsmtFinType1_Unf'] = 1*(all_features['BsmtFinType1'] == 'Unf')
# "Has*" indicators: 1 when the corresponding area is non-zero.
# (These were previously computed with `== 0`, i.e. 1 meant "does NOT have".)
all_features['HasWoodDeck'] = (all_features['WoodDeckSF'] != 0) * 1
all_features['HasOpenPorch'] = (all_features['OpenPorchSF'] != 0) * 1
all_features['HasEnclosedPorch'] = (all_features['EnclosedPorch'] != 0) * 1
all_features['Has3SsnPorch'] = (all_features['3SsnPorch'] != 0) * 1
all_features['HasScreenPorch'] = (all_features['ScreenPorch'] != 0) * 1
# YrSold was cast to str earlier, hence the explicit int conversion.
all_features['YearsSinceRemodel'] = all_features['YrSold'].astype(int) - all_features['YearRemodAdd'].astype(int)
all_features['Total_Home_Quality'] = all_features['OverallQual'] + all_features['OverallCond']
all_features = all_features.drop(['Utilities','Street','PoolQC'], axis= 1)
all_features['TotalSF'] = all_features['TotalBsmtSF'] + all_features['1stFlrSF'] + all_features['2ndFlrSF']
all_features['YrBltAndRemod'] = all_features['YearBuilt'] + all_features['YearRemodAdd']


In [None]:
all_features['Total_sqr_footage'] = (all_features['BsmtFinSF1'] + all_features['BsmtFinSF2']
                                    + all_features['1stFlrSF'] + all_features['2ndFlrSF'])
all_features['Total_Bathrooms'] = (all_features['FullBath'] + (0.5 * all_features['HalfBath'])
                                  + all_features['BsmtFullBath'] + (0.5 * all_features['BsmtHalfBath']))
all_features['Total_porch_sf'] = (all_features['OpenPorchSF'] + all_features['3SsnPorch']
                                 + all_features['EnclosedPorch'] + all_features['ScreenPorch']
                                 + all_features['WoodDeckSF'])
all_features['TotalBsmtSF'] = all_features['TotalBsmtSF'].apply(lambda x: np.exp(6) if x  <= 0.0 else x)
all_features['2ndFlrSF'] = all_features['2ndFlrSF'].apply(lambda x: np.exp(6.5) if x  <= 0.0 else x)
all_features['GarageArea'] = all_features['GarageArea'].apply(lambda x: np.exp(6) if x  <= 0.0 else x)
all_features['GarageCars'] = all_features['GarageCars'].apply(lambda x: 0 if x  <= 0.0 else x)
all_features['LotFrontage'] = all_features['LotFrontage'].apply(lambda x: np.exp(4.2) if x  <= 0.0 else x)
all_features['MasVnrArea'] = all_features['MasVnrArea'].apply(lambda x: np.exp(4) if x  <= 0.0 else x)
all_features['BsmtFinSF1'] = all_features['BsmtFinSF1'].apply(lambda x: np.exp(6.5) if x  <= 0.0 else x)

In [None]:
all_features['haspool'] = all_features['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
all_features['has2ndfloor'] = all_features['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
all_features['hasgarage'] = all_features['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
all_features['hasbsmt'] = all_features['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
all_features['hasfireplace'] = all_features['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

In [None]:
# Column names after feature engineering.
all_features.columns

In [None]:
def logs(res, ls):
    """Return a copy of ``res`` with a ``<name>_log`` column added for each name in ``ls``.

    Each new column holds ``log(1.01 + res[name])``; the 1.01 offset keeps the
    logarithm finite for zero-valued entries. New columns are appended in the
    order given by ``ls`` and ``res`` itself is not modified.

    The columns are created under their final name directly, rather than being
    added as a placeholder and renamed by writing into ``res.columns.values``:
    mutating an Index's backing array is unsupported and can leave the frame's
    column lookup inconsistent.
    """
    log_columns = {name + '_log': np.log(1.01 + res[name]).to_numpy() for name in ls}
    return res.assign(**log_columns)

# Columns that receive an additional "<name>_log" companion feature.
log_features = [
    'LotFrontage', 'LotArea', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageCars', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal', 'YearRemodAdd',
    'TotalSF',
]
all_features = logs(all_features, log_features)

In [None]:
# One-hot encode the remaining non-numeric columns of the combined frame.
all_features = pd.get_dummies(all_features).reset_index(drop=True)
all_features.shape

In [None]:
# Preview the encoded feature frame.
all_features.head()

In [None]:
# Keep only the first occurrence of every column name.
all_features = all_features.loc[:,~all_features.columns.duplicated()]

In [None]:
# Split the combined feature matrix back into train and test parts by row
# position: the first len(train_labels) rows are the training rows.
n_train = len(train_labels)
X = all_features.iloc[:n_train, :]
X_test = all_features.iloc[n_train:, :]
X.shape , train_labels.shape , X_test.shape

In [None]:
#Import sklearn libraries
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.linear_model import ElasticNet,ElasticNetCV,Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score, mean_squared_error
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline

In [None]:
#Models
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [None]:
#Using the Cross-Validation Score
n_folds = 10
# Keep the KFold splitter itself. Calling .get_n_splits() on it returns the
# plain integer 10, which made cross_val_score fall back to its default
# unshuffled folds and silently dropped shuffle=True / random_state=42.
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

def rmlse(y, y_pred):
    """Return the square root of the mean squared error between ``y`` and ``y_pred``."""
    squared_error = mean_squared_error(y, y_pred)
    return np.sqrt(squared_error)

def rmse_cv(model):
    """Cross-validate ``model`` on ``(X, train_labels)`` and return the per-fold RMSE array.

    Uses the notebook-level ``kf`` as the ``cv`` argument. scikit-learn reports
    the negated MSE, so the sign is flipped before the square root is taken.
    """
    neg_mse_per_fold = cross_val_score(
        model, X, train_labels,
        scoring="neg_mean_squared_error",
        cv=kf,
    )
    return np.sqrt(-neg_mse_per_fold)

## Model Definitions (LASSO, ElasticNet, Kernel Ridge, Boosting, SVR, Random Forest, Stacking)

In [None]:
from sklearn.linear_model import Lasso, LassoLarsIC

In [None]:
# Lasso regression preceded by RobustScaler.
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))

In [None]:
# ElasticNet (l1_ratio=0.9) preceded by RobustScaler.
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

In [None]:
# Kernel ridge regression with a degree-2 polynomial kernel (no scaling step).
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

In [None]:
# Gradient boosting regressor with Huber loss.
GBoost = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=5,
)

In [None]:
# XGBoost regressor.
# NOTE(review): 'silent' and 'nthread' look like legacy XGBoost argument names —
# confirm the installed version still accepts them.
model_xgb = XGBRegressor(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    n_estimators=2200,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    subsample=0.5213,
    silent=1,
    random_state=7,
    nthread=-1,
)

In [None]:
# LightGBM regressor with small trees (5 leaves), bagging and feature sub-sampling.
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)

In [None]:
# Support vector regression preceded by RobustScaler.
svr = make_pipeline(RobustScaler(), SVR(C=20, epsilon= 0.008, gamma = 0.0003))

In [None]:
# Random forest regressor; max_features=None lets every split consider all features.
rf = RandomForestRegressor(
    n_estimators=1200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=5,
    max_features=None,
    oob_score=True,
    random_state=100,
)

In [None]:
# Stacked ensemble over six base regressors. model_xgb is also passed as the
# meta-regressor, and use_features_in_secondary=True is set.
stack_gen = StackingCVRegressor(regressors =(model_xgb, model_lgb, svr, KRR, GBoost,rf),
                               meta_regressor = model_xgb,
                               use_features_in_secondary =True)

In [None]:
import warnings
# Ignore all warnings from here on (applies to every later cell).
warnings.simplefilter("ignore")

In [None]:
# Registry of (mean, std) cross-validation RMSE per model, starting with Lasso.
scores = {}
score = rmse_cv(lasso)
lasso_mean, lasso_std = score.mean(), score.std()
print("\nLasso score: {:.4f} ({:.4f})\n".format(lasso_mean, lasso_std))
scores['lasso'] = (lasso_mean, lasso_std)

In [None]:
# Cross-validated RMSE of the ElasticNet pipeline.
score = rmse_cv(ENet)
enet_mean, enet_std = score.mean(), score.std()
print("ElasticNet score: {:.4f} ({:.4f})\n".format(enet_mean, enet_std))
scores['Enet'] = (enet_mean, enet_std)

In [None]:
# Cross-validated RMSE of the kernel ridge model.
score = rmse_cv(KRR)
krr_mean, krr_std = score.mean(), score.std()
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(krr_mean, krr_std))
scores['krr'] = (krr_mean, krr_std)

In [None]:
# Cross-validated RMSE of the gradient boosting model.
score = rmse_cv(GBoost)
gboost_mean, gboost_std = score.mean(), score.std()
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(gboost_mean, gboost_std))
scores['gboost'] = (gboost_mean, gboost_std)

In [None]:
# Cross-validated RMSE of the XGBoost model.
score = rmse_cv(model_xgb)
xgb_mean, xgb_std = score.mean(), score.std()
print("Xgboost score: {:.4f} ({:.4f})\n".format(xgb_mean, xgb_std))
scores['xgb'] = (xgb_mean, xgb_std)

In [None]:
# Cross-validated RMSE of the LightGBM model.
score = rmse_cv(model_lgb)
lgb_mean, lgb_std = score.mean(), score.std()
print("LGBM score: {:.4f} ({:.4f})\n".format(lgb_mean, lgb_std))
scores['lgb'] = (lgb_mean, lgb_std)

In [None]:
# Cross-validated RMSE of the SVR pipeline.
score = rmse_cv(svr)
svr_mean, svr_std = score.mean(), score.std()
print("SVR score: {:.4f} ({:.4f})\n".format(svr_mean, svr_std))
scores['svr'] = (svr_mean, svr_std)

In [None]:
# Cross-validated RMSE of the random forest model.
score = rmse_cv(rf)
rf_mean, rf_std = score.mean(), score.std()
print("Random Forest score: {:.4f} ({:.4f})\n".format(rf_mean, rf_std))
scores['rf'] = (rf_mean, rf_std)

In [None]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor that predicts the unweighted mean of its base models."""

    def __init__(self, models):
        self.models = models

    def fit(self, X, train_labels):
        """Fit a fresh clone of every base model on ``(X, train_labels)`` and return ``self``.

        The originals in ``self.models`` stay unfitted; the fitted clones are
        kept in ``self.models_``.
        """
        self.models_ = list(map(clone, self.models))
        for fitted_clone in self.models_:
            fitted_clone.fit(X, train_labels)
        return self

    def predict(self, X):
        """Return the row-wise mean of the fitted clones' predictions for ``X``."""
        per_model = [fitted_clone.predict(X) for fitted_clone in self.models_]
        stacked = np.column_stack(per_model)
        return stacked.mean(axis=1)

In [None]:
# Simple average of ElasticNet, gradient boosting, kernel ridge and Lasso,
# scored with the same cross-validation helper. The result is only printed;
# it is not stored in `scores`.
averaged_models = AveragingModels(models = (ENet, GBoost, KRR, lasso))

score = rmse_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
# Fit the stacked ensemble on the full training data (passed as NumPy arrays).
print('stack_gen')
stack_gen_model = stack_gen.fit(np.array(X), np.array(train_labels))

In [None]:
# Fit LightGBM on the full training data.
print('lightgbm')
lgb_model_full = model_lgb.fit(X, train_labels)

In [None]:
# Fit XGBoost on the full training data.
print('xgboost')
xgb_model_full = model_xgb.fit(X, train_labels)

In [None]:
# Fit the SVR pipeline on the full training data.
print('SVR')
svr_model_full = svr.fit(X, train_labels)

In [None]:
# Fit the kernel ridge model (KRR) on the full training data.
print('Ridge')
ridge_model_full = KRR.fit(X, train_labels)

In [None]:
# Fit the random forest on the full training data.
print('Random Forest')
rf_model_full = rf.fit(X, train_labels)

In [None]:
# Fit gradient boosting on the full training data.
print('Gradient Boosting')
gbr_model_full = GBoost.fit(X, train_labels)

In [None]:
#Blend models to make final predictions robust to overfitting

def blended_prediction(X):
    """Return the weighted blend of the fitted models' predictions for ``X``.

    Weights: kernel ridge 0.1, SVR 0.2, gradient boosting 0.1, XGBoost 0.1,
    LightGBM 0.1, random forest 0.05, stacked ensemble 0.35. The stacked
    ensemble receives ``X`` as a NumPy array; the other models receive ``X``
    as passed.
    """
    weighted_parts = [
        0.1 * ridge_model_full.predict(X),
        0.2 * svr_model_full.predict(X),
        0.1 * gbr_model_full.predict(X),
        0.1 * xgb_model_full.predict(X),
        0.1 * lgb_model_full.predict(X),
        0.05 * rf_model_full.predict(X),
        0.35 * stack_gen_model.predict(np.array(X)),
    ]
    # Accumulate left to right so the floating-point summation order is fixed.
    blend = weighted_parts[0]
    for part in weighted_parts[1:]:
        blend = blend + part
    return blend

In [None]:
# Score the blended model on the training data itself (in-sample, not
# cross-validated), and record it with a standard deviation of 0.
blended_score = rmlse(train_labels, blended_prediction(X))
scores['blended'] = (blended_score, 0)
print('RMSLE score on train data:')
print(blended_score)

In [None]:
# Plot the mean score recorded for each model, annotated with its value.
sns.set_style("white")
fig = plt.figure(figsize=(25, 18))

model_names = list(scores.keys())
mean_scores = [stats[0] for stats in scores.values()]
ax = sns.pointplot(x=model_names, y=mean_scores, markers=['o'], linestyles=['-'])
for position, mean_score in enumerate(mean_scores):
    ax.text(position, mean_score + 0.002, '{:.6f}'.format(mean_score),
            horizontalalignment='left', size='large', color='black', weight='semibold')

plt.ylabel('Score (RMSE)', size=20, labelpad=12.5)
plt.xlabel('Model', size=20, labelpad=12.5)
plt.tick_params(axis='x', labelsize=13.5)
plt.tick_params(axis='y', labelsize=12.5)

plt.title('Scores of Models', size=20)

plt.show()

In [None]:
# Load the sample submission file; its second column is overwritten with predictions below.
submission = pd.read_csv("../input/sample_submission.csv")

In [None]:
# (rows, columns) of the submission frame.
submission.shape

In [None]:
# Write the blended predictions into the second column of the submission.
# expm1 undoes the log1p applied to SalePrice before training; the result is
# rounded down with floor.
submission.iloc[:,1] = np.floor(np.expm1(blended_prediction(X_test)))

In [None]:
# Adjust extreme predictions: values at or below the 0.45% quantile are
# scaled by 0.77, values at or above the 99% quantile are scaled by 1.1.
# Both quantiles are taken from the unadjusted predictions.
q1 = submission['SalePrice'].quantile(0.0045)
q2 = submission['SalePrice'].quantile(0.99)
adjusted = submission['SalePrice']
adjusted = adjusted.where(adjusted > q1, adjusted * 0.77)
adjusted = adjusted.where(adjusted < q2, adjusted * 1.1)
submission['SalePrice'] = adjusted
submission.to_csv("submission_pricesv1.csv", index=False)

In [None]:
# Scaling predictions
# NOTE: not idempotent — re-running this cell scales the predictions again.
submission['SalePrice'] *= 1.001619
# Write next to the other submission file. The previous target
# (r"C\Users\Aziz\Desktop\...") was a malformed, machine-specific Windows path
# (no colon after the drive letter) that does not exist in this workspace.
submission.to_csv("submission_prices.csv", index=False)