In [None]:


import pandas as pd
import numpy as np
import random as rnd
from scipy.stats import  skew,norm
import statsmodels.api as sm


# visualization
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff

#Some styling
sns.set_style("darkgrid")
plt.style.use("fivethirtyeight")

import plotly.io as pio
pio.templates.default = "gridon"

#Subplots
from plotly.subplots import make_subplots

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for path in full_paths:
        print(path)


In [None]:
# Load the training and test tables from the competition's input folder.
x_train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
x_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# Independent NumPy copy of the target column.
y_train = x_train['SalePrice'].to_numpy(copy=True)

# Both frames side by side, so preprocessing does not have to be written separately.
all_data = [x_train, x_test]

In [None]:
# x_train.head()

In [None]:
# x_test.head()

In [None]:


# print('-Number of dimensions of training dataset',x_train.ndim)

# print('-Training dataset dimensions ',x_train.shape)

# print('-Training dataset columns names',x_train.columns.values)

# print(' Training dataset infos',x_train.info(verbose=True))



In [None]:
# print('-Number of dimensions of test dataset',x_test.ndim)

# print('-test dataset dimensions ',x_test.shape)

# print('-test dataset columns names',x_test.columns.values)

# print(' test dataset infos',x_test.info(verbose=True))

# 3 data types float64(11), int64(26), object(43)
# Some int variables have discrete values 
==> it would be better to change them to categorical variables
# Some features contain a lot of missing values 



In [None]:
# Remove the Id column from both frames, in place.
for frame in (x_train, x_test):
    frame.drop(columns=['Id'], inplace=True)

In [None]:
#show duplicated rows
x_train[x_train.duplicated()]


In [None]:
#show duplicated rows

x_test[x_test.duplicated()]


# No duplicated rows in training and test dataset

In [None]:
# x_train.describe().T

# OverallCond 50 % of samples had 5 or less as rating 

In [None]:

# x_train.describe(include=['O']).T

# For some features, a single value is dominant
#  e.g. Street: 1454 'Pave' out of 1460

In [None]:
# Skewness measures how far a variable's distribution deviates from a symmetric
# (e.g. normal) distribution.
# numeric_only=True restricts the computation to the numeric columns: x_train
# still contains object (string) columns here, and pandas >= 2.0 raises a
# TypeError on them when numeric_only is left at its default.
x_train.skew(numeric_only=True)

# LotArea, MiscVal, PoolArea, LowQualFinSF and 3SsnPorch are highly positively skewed
==> most data points have low values ==> the model will perform better at predicting lower values
# Id is perfectly symmetrical

In [None]:
# x_train.info()

In [None]:
#Since we have many feature, to make detecting missing values easier, we will visualize it in a plot 

def missing_val (df):
    """Plot the number of missing values per column as a horizontal bar chart.

    Only columns with at least one missing value are drawn, sorted by their
    missing-value count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The chart is displayed with ``fig.show()``. When no column has missing
        values, a short message is printed and nothing is plotted.
    """
    # One row per column: its name and how many of its values are missing.
    temp_df = pd.DataFrame({'Column_Name': list(df.columns),
                            'Missing_Values': df.isna().sum().to_numpy()})
    temp_df = temp_df[temp_df['Missing_Values']>0]

    # Nothing to draw: avoid rendering an empty 1500px-high figure.
    if temp_df.empty:
        print('No missing values to plot.')
        return

    fig = px.bar(temp_df.sort_values(by='Missing_Values'),x='Missing_Values',y='Column_Name',
             orientation='h',height=1500,width=900,color='Missing_Values',text='Missing_Values',title='Missing values')
    fig.update_traces(textposition='outside')
    fig.show()


In [None]:
missing_val(x_train)

    


In [None]:
missing_val(x_test)

# Starting imputation of numerical features
If a feature contains outliers ==> fill its missing values with the median; if not, use the mean.
To know whether there are outliers ==> draw a box plot.

In [None]:

# Numeric features that still have missing values; one box plot each to spot outliers.
feat_check = ['LotFrontage','MasVnrArea','GarageYrBlt']
temp = x_train[feat_check]
# Index 0 is a placeholder because subplot positions are 1-based.
colors=['','grey','blue','white']

plt.figure(figsize=(15,5))
for i, col in enumerate(temp.columns, start=1):
    plt.subplot(1,3,i)
    a1 = sns.boxplot(data=temp,y=col,color=colors[i])

In [None]:
# Fill the missing values of each feature in feat_check with that frame's median.
# The result is assigned back to the column instead of calling
# x[col].fillna(..., inplace=True): the in-place form operates on an
# intermediate object (chained assignment), is deprecated in pandas 2.x and
# leaves the frame unchanged once Copy-on-Write is enabled.
# NOTE(review): x_test is filled with its own median, as before; consider
# reusing the training medians instead.
for frame in (x_train, x_test):
    for col in feat_check:
        frame[col] = frame[col].fillna(frame[col].median())

In [None]:

# x_train[feat_check].isna().sum()

In [None]:
# x_test[feat_check].isna().sum()

# Imputation of categorical features 

In [None]:
# Categorical features that contain missing values.
cat_missing = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtFinType2', 'BsmtExposure', 'BsmtFinType1', 'BsmtCond', 'BsmtQual',
    'MasVnrType', 'Electrical',
]

# PoolQC,MiscFeature,Alley and Fence contains more than 1k missing values 


In [None]:
# x_train['PoolQC'].value_counts().plot(kind='pie', autopct='%.2f')

 According to the dataset description : NA in PoolQC means No Pool available

In [None]:
# Turn missing PoolQC entries into the explicit category 'NA' in both frames.
for frame in (x_train, x_test):
    frame['PoolQC'] = frame['PoolQC'].fillna('NA')

In [None]:
# x_train['PoolQC'].value_counts()

In [None]:
# x_train['MiscFeature'].value_counts().plot(kind='pie', autopct='%.2f')

 According to the dataset description : NA in MiscFeature means No Miscellaneous feature available


In [None]:
# Turn missing MiscFeature entries into the explicit category 'NA' in both frames.
for frame in (x_train, x_test):
    frame['MiscFeature'] = frame['MiscFeature'].fillna('NA')

In [None]:
# x_train['MiscFeature'].value_counts()

In [None]:
# x_train['Alley'].value_counts().plot(kind='pie', autopct='%.2f')

According to the dataset description : NA in Alley means No Alley access

In [None]:
# Turn missing Alley entries into the explicit category 'NA' in both frames.
for frame in (x_train, x_test):
    frame['Alley'] = frame['Alley'].fillna('NA')

In [None]:
# x_train['Alley'].value_counts()

In [None]:
# x_train['Fence'].value_counts().plot(kind='pie', autopct='%.2f')


According to the dataset description : NA in Fence means No Fence available

In [None]:
# Turn missing Fence entries into the explicit category 'NA' in both frames.
for frame in (x_train, x_test):
    frame['Fence'] = frame['Fence'].fillna('NA')


In [None]:
# x_train['Fence'].value_counts()

In [None]:
# x_train['FireplaceQu'].value_counts()

According to the dataset description : NA means no fireplace available


In [None]:
# Turn missing FireplaceQu entries into the explicit category 'NA' in both frames.
for frame in (x_train, x_test):
    frame['FireplaceQu'] = frame['FireplaceQu'].fillna('NA')

In [None]:
# x_train['FireplaceQu'].value_counts().plot(kind='pie', autopct='%.2f')

In [None]:
# x_train['GarageType'].value_counts().plot(kind='pie', autopct='%.2f')

 According to the dataset description : NA in GarageType, GarageFinish, GarageQual and GarageCond means no Garage available

In [None]:
# Garage descriptors whose missing entries become the explicit category 'NA'.
fill_garage = ['GarageType','GarageFinish','GarageQual','GarageCond']

for frame in (x_train, x_test):
    for garage_col in fill_garage:
        frame[garage_col] = frame[garage_col].fillna('NA')

In [None]:
# x_train[fill_garage].isna().sum()

In [None]:
# x_test[fill_garage].isna().sum()

In [None]:
# x_train['BsmtQual'].value_counts().plot(kind='pie', autopct='%.2f')


 According to the dataset description : NA in BsmtFinType2, BsmtExposure, BsmtFinType1, BsmtCond and BsmtQual means no Basement available

In [None]:
# Basement descriptors whose missing entries become the explicit category 'NA'.
fill_Bsmt = ['BsmtFinType2','BsmtExposure','BsmtFinType1','BsmtCond','BsmtQual']

for frame in (x_train, x_test):
    for bsmt_col in fill_Bsmt:
        frame[bsmt_col] = frame[bsmt_col].fillna('NA')

In [None]:
# x_train[fill_Bsmt].isna().sum()

In [None]:
# x_test[fill_Bsmt].isna().sum()

In [None]:
# x_train['MasVnrType'].value_counts().plot(kind='pie', autopct='%.2f')


In [None]:
# x_train[x_train['MasVnrType'].isnull()][['MasVnrType','MasVnrArea']]

Samples with a NaN value for MasVnrType have a MasVnrArea of 0 
===> no masonry veneer 

In [None]:
# Missing MasVnrType entries become the category 'None' in both frames.
for frame in (x_train, x_test):
    frame['MasVnrType'] = frame['MasVnrType'].fillna('None')

In [None]:
# x_train['Electrical'].value_counts().plot(kind='pie', autopct='%.2f')


No information is available in the dataset description about NaN values in the Electrical feature.
Since it is a categorical feature ==> fill them with the most frequent value

In [None]:
# Fill missing Electrical entries with 'SBrkr' in both frames.
# The result is assigned back to the column: x['Electrical'].fillna(..., inplace=True)
# acts on an intermediate object (chained assignment), is deprecated in
# pandas 2.x and has no effect on the frame once Copy-on-Write is enabled.
for frame in (x_train, x_test):
    frame['Electrical'] = frame['Electrical'].fillna('SBrkr')

In [None]:
#check if there any missing values left 

# x_train.isna().sum().sum()

In [None]:
# x_test.isna().sum().sum()

Still some missing values left in the test dataset

In [None]:
missing_val(x_test)

In [None]:
# Test-set columns that still contain missing values.
miss_test = [
    'MSZoning', 'Utilities', 'BsmtFullBath', 'BsmtHalfBath', 'Functional',
    'Exterior1st', 'Exterior2nd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', 'KitchenQual', 'GarageCars', 'GarageArea', 'SaleType',
]
remaining_missing = x_test[miss_test]
remaining_missing.info()

In [None]:
# Columns of miss_test stored with object dtype (i.e. the categorical ones).
categorical = [name for name in miss_test if x_test[name].dtype=='O']
categorical

No information available about nan values in these features 

==> fill them with most frequent value

In [None]:
# Replace missing categorical entries with the column's most frequent value.
for cat_col in categorical:
    most_frequent = x_test[cat_col].mode()[0]
    x_test[cat_col] = x_test[cat_col].fillna(most_frequent)

In [None]:
x_test[categorical].isna().sum()

In [None]:
# Columns of miss_test that are not object dtype (i.e. the numeric ones).
numerical = [name for name in miss_test if x_test[name].dtype!='O']
numerical

In [None]:
# Replace missing numeric entries with the column's median.
for num_col in numerical:
    column_median = x_test[num_col].median()
    x_test[num_col] = x_test[num_col].fillna(column_median)

In [None]:
x_test[numerical].isna().sum()

In [None]:
x_test.isna().sum().sum()

In [None]:
# x_train['SalePrice'].describe()

# The average sale price is 180921
# Sale price is between 34900 and 755000
# half of samples have a Sale price <= 163000

In [None]:
# figure, ax = plt.subplots(1,2, figsize = (20,8))
# sns.stripplot(data=x_train,  y='SalePrice', ax = ax[0])
# sns.boxplot(data=x_train,y='SalePrice', ax = ax[1])
# plt.show()

In [None]:
#     print('mean', np.round(np.mean(x_train['SalePrice']), 2))
#     print('median', np.round(np.median(x_train['SalePrice']), 2))

In [None]:
# Getting the main parameters of the Normal Ditribution ()
# (mu, sigma) = norm.fit(x_train['SalePrice'])





# plt.suptitle(" SalePrice distribution vs Normal Distribution", fontsize= 15)
# plt.xlabel("House's sale Price in $", fontsize = 12)
# plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)])

# sns.distplot(x_train['SalePrice'], kde = True, hist=True, fit = norm)
# plt.show()

# The distribution does not seem to be normal, but highly right-skewed

 Skewness = 1.882876 

In [None]:
# x_train2 = np.log1p(x_train['SalePrice'])

# plt.suptitle("qq-plot & distribution SalePrice ", fontsize= 15)

# sns.distplot(x_train2, kde = True, hist=True, fit = norm)
# plt.show()

# Log of SalePrice resembles a normal distribution


In [None]:
# x_train2.skew()

In [None]:
# Split the feature names into categorical / date / discrete / continuous groups.
cat_summary = x_train.describe(include=[object]).T
categorical_columns = list(cat_summary.index)

numeric_frame = x_train.select_dtypes(include=[np.number])
numerical_columns = list(numeric_frame.columns.drop('SalePrice'))

# Numeric features with fewer than 50 distinct values are treated as discrete.
uniqueValCount = x_train[numerical_columns].nunique()
numerical_discrete = [name for name, n_unique in uniqueValCount.items() if n_unique < 50]

date_columns = ['YearBuilt','YearRemodAdd','GarageYrBlt','MoSold','YrSold']
discrete_columns = [name for name in numerical_discrete if name not in date_columns]
continuous_columns = [
    name for name in numerical_columns
    if name not in discrete_columns and name not in date_columns
]

In [None]:
discrete_columns

In [None]:
# def plot_bar(df, columns):
#     cols = 3
#     rows = len(columns) // 3 + 1

#     plt.figure(figsize=(cols * 6.7, rows * 3.75))
#     i = 0
#     for row in range(rows):
#         for col in range(cols):
#             index = cols * row + col
#             if index >= len(columns):
#                 break
#             plt.subplot(rows, cols, index + 1,)
#             df.groupby(columns[i]).size().plot(
#                 kind="bar",
#             )
            
#             i += 1
#             plt.tight_layout() 

# # plotting box
# def plot_box(df, y, columns):
#     cols = 3
#     rows = len(columns) // 3 + 1

#     plt.figure(figsize=(cols * 5.5, rows * 3.5))
#     i = 0
#     for row in range(rows):
#         for col in range(cols):
#             index = cols * row + col
#             if index >= len(columns):
#                 break
#             plt.subplot(rows, cols, index + 1)
#             sns.boxplot(x=columns[i], y=y, data=df)

#             i += 1
#             plt.tight_layout() 
# # plotting strip plot
       
# def strip_plot(df, y, columns):
#     cols = 3
#     rows = len(columns) // 3 + 1

#     plt.figure(figsize=(cols * 5.5, rows * 3.5))
#     i = 0
#     for row in range(rows):
#         for col in range(cols):
#             index = cols * row + col
#             if index >= len(columns):
#                 break
#             plt.subplot(rows, cols, index + 1)
#             sns.stripplot(x=columns[i], y=y, data=df)

#             i += 1
#             plt.tight_layout() 
# # plotting hist
# def plot_hist(df, columns):
#     cols = 3
#     rows = len(columns) // 3 + 1

#     plt.figure(figsize=(cols * 5.5, rows * 3.5))
#     i = 0
#     for row in range(rows):
#         for col in range(cols):
#             index = cols * row + col
#             if index >= len(columns):
#                 break
#             plt.subplot(rows, cols, index + 1)
#             df[columns[i]].hist(bins=50)
#             plt.ylabel(columns[i])

#             i += 1
#             plt.tight_layout() 




In [None]:
# plot_bar(x_train,categorical_columns+date_columns+discrete_columns)

In [None]:
# strip_plot(x_train,'SalePrice',categorical_columns+date_columns+discrete_columns)

In [None]:
# plot_hist(x_train,continuous_columns)


In [None]:
# plot_box(x_train,'SalePrice',categorical_columns+date_columns+discrete_columns)


Street, Utilities, PoolQC and PoolArea are not correlated with SalePrice, since a single value is dominant in each of them 

A pool in the house doesn't increase the price 
Central air in the house increases the price 
The house's price increases with TotRmsAbvGrd, FullBath, OverallQual,
GarageCars, YearBuilt, GarageYrBlt and YearRemodAdd


YrSold and MoSold don't increase the price

In [None]:
# fig, ax = plt.subplots(figsize=(20,7))
# sns.heatmap(x_train[numerical_columns+['SalePrice']].corr(), cmap='coolwarm', annot=True, annot_kws={'size':10}, )
# plt.show()

OverallQual / GrLivArea / GarageCars / GarageArea have a high correlation with the target

EnclosedPorch and KitchenAbvGr have a low negative correlation with the target 

 



In [None]:
# sns.regplot(x='OverallQual', y='SalePrice', data=x_train, robust=True)


it seems, there is a linear relationship between OverallQual and SalePrice


In [None]:
# sns.regplot(x='GrLivArea', y='SalePrice', data=x_train, robust=True)


it seems, there is a linear relationship between grlivarea and SalePrice


In [None]:
# sns.regplot(x='GarageArea', y='SalePrice', data=x_train, robust=True)


In [None]:
# sns.regplot(x='TotRmsAbvGrd', y='SalePrice', data=x_train, robust=True)



In [None]:
# sns.regplot(x='TotalBsmtSF', y='SalePrice', data=x_train, robust=True)


In [None]:
# New feature: basement + first-floor + second-floor surface, for both frames.
for frame in (x_train, x_test):
    frame['TotalSF'] = frame['TotalBsmtSF'] + frame['1stFlrSF'] + frame['2ndFlrSF']


In [None]:
# Two more derived features, added to both frames:
#   Total_Home_Quality = OverallQual + OverallCond
#   Total_Bathrooms    = full baths + half baths (weighted 0.5), above ground and in the basement
for frame in (x_train, x_test):
    frame['Total_Home_Quality'] = frame['OverallQual'] + frame['OverallCond']
    frame['Total_Bathrooms'] = (frame['FullBath'] + (0.5 * frame['HalfBath']) +
                                frame['BsmtFullBath'] + (0.5 * frame['BsmtHalfBath']))

In [None]:
discrete_columns = discrete_columns + ['Total_Bathrooms'] +  ['Total_Home_Quality'] 

In [None]:
continuous_columns = continuous_columns +['TotalSF']


In [None]:
# Skewness of every numeric feature (and of the target), largest first.
skew_candidates = continuous_columns + discrete_columns + date_columns + ['SalePrice']
skewed_features = x_train[skew_candidates].apply(stats.skew).sort_values(ascending=False)

# Keep only the columns whose absolute skewness exceeds 0.5 ...
skewed_features = skewed_features[skewed_features.abs() > 0.5]
skew_index = skewed_features.index

# ... and replace each of them by log(1 + x) in the training frame.
for feature in skew_index:
    x_train[feature] = np.log1p(x_train[feature])


In [None]:
skew_index

In [None]:
skew_index_test = skew_index.drop('SalePrice')

In [None]:
for i in skew_index_test:
    x_test[i] = np.log1p(x_test[i])

In [None]:
# Spearman rank correlation between all numeric features and the target.
heatmap_columns = discrete_columns + date_columns + continuous_columns + ['SalePrice']
spearman_corr = x_train[heatmap_columns].corr(method="spearman")

fig, ax = plt.subplots(figsize=(20,7))
sns.heatmap(spearman_corr, cmap='coolwarm', annot=True, annot_kws={'size':10})
plt.show()

In [None]:
# Names of all object-dtype columns of the training frame.
categorical = [col for col in x_train.columns if x_train[col].dtype=='O']

In [None]:
from sklearn.preprocessing import LabelEncoder  
le = LabelEncoder()

# One encoder per categorical column, stored under the column's name,
# then fitted on (and applied to) the training values of that column.
label_encoders = {column: LabelEncoder() for column in categorical}
for column, encoder in label_encoders.items():
    x_train[column] = encoder.fit_transform(x_train[column])

In [None]:
from sklearn.preprocessing import LabelEncoder  

# Encode the test set with the encoders that were fitted on the training set
# (label_encoders, built in the previous cell). Fitting fresh encoders on
# x_test assigns integer codes independently of x_train, so whenever the two
# frames do not contain exactly the same set of categories the same category
# ends up with different codes in train and test.
for column in categorical:
    # classes_ is the sorted array of training labels; a label's position is its code.
    code_of = {label: code for code, label in enumerate(label_encoders[column].classes_)}
    # Categories that never appeared in training have no code: mark them with -1
    # instead of failing.
    x_test[column] = x_test[column].map(code_of).fillna(-1).astype(int)

In [None]:

columns_x=x_train.columns
columns_x

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

# Features only (target removed), then keep the 40 columns with the best
# univariate F-score against SalePrice.
x_train2 = x_train.drop(columns=['SalePrice'])
select_reg = SelectKBest(score_func=f_regression, k=40)
X_train_housing_new = select_reg.fit_transform(x_train2, x_train['SalePrice'])
X_train_housing_new.shape


In [None]:
# Boolean mask of the columns retained by the selector.
support_mask = select_reg.get_support()

# Table of every candidate column with its kept / dropped flag.
kept_features = pd.DataFrame({'columns': x_train2.columns, 'Kept': support_mask})

# Training features restricted to the retained columns.
new_x_train = x_train2.iloc[:, support_mask]
new_x_train.columns


In [None]:
# Keep the same features in the test set, selected by column NAME rather than
# by boolean position: a positional mask silently picks the wrong columns if
# x_test's column order ever differs from x_train2's, whereas a name-based
# selection either returns the right columns or raises a KeyError.
x_test = x_test[x_train2.columns[select_reg.get_support()]]

In [None]:
# new_x_train2 = new_x_train.join(x_train['SalePrice'])
new_x_train2 = new_x_train
new_x_train2

In [None]:
columns_x = new_x_train2.columns

In [None]:
columns_x_test = x_test.columns
columns_x_test

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the selected training features.
scaler = StandardScaler()
scaled_train = scaler.fit(new_x_train2).transform(new_x_train2)

# Back to a DataFrame, labelled with the original column names.
new_x_train2 = pd.DataFrame(scaled_train, columns=[columns_x])

In [None]:
columns_test = x_test.columns

# Reuse the scaler that was fitted on the training features. Fitting a second
# StandardScaler on x_test would standardise the test set with its own
# mean/variance, so train and test values would not be on the same scale and
# the model would be applied to inputs it was not trained on.
x_test=scaler.transform(x_test)

# columns is kept wrapped in a list, exactly as for the training frame
# (new_x_train2), so both frames carry identical column labels.
x_test = pd.DataFrame(x_test,columns=[columns_test])

In [None]:
x_test.head()

In [None]:
# corr_mat = x_train2[continuous_columns+categorical_columns+date_columns].join(x_train['SalePrice']).corr(method='spearman')
# selected_continuous_columns = corr_mat['SalePrice'][abs(corr_mat['SalePrice'])>=0.5].index.tolist()
# nonselected_continuous_columns = corr_mat['SalePrice'][abs(corr_mat['SalePrice'])<0.5].index.tolist()
# selected_continuous_columns

In [None]:
         
# def calculateAnova(inpData,y, catCols, target):
#     inpData = inpData.join(y)
#     from scipy.stats import f_oneway
#     CatColumnList = []
#     for cat in catCols:
#         CatGroupList = inpData.groupby(cat)[target].apply(list)
#         anova = f_oneway(*CatGroupList)
#         if(anova[1]<0.05):
#             print('The column ', cat, ' is correlated with ', target, ' | P-Value: ',anova[1])
#             CatColumnList.append(cat)
#         else:
#             print('The column ', cat , ' is NOT correlated with ', target, ' | P-Value: ',anova[1])
    
#     return(CatColumnList)

In [None]:
# y = x_train.pop("SalePrice")
# selected_categorical_cols = calculateAnova(x_train,y,categorical_columns+ nonselected_continuous_columns  +date_columns+discrete_columns,'SalePrice')
# selected_categorical_cols

In [None]:
# selected_col = [col for col in X.columns \
#  if col in selected_continuous_columns + selected_categorical_cols]
# X = X[selected_col].copy()
# X_test = X_test[selected_col].copy()

In [None]:
# from sklearn.feature_selection import mutual_info_regression
# def make_mi_scores(X, y):
#     '''Estimate mutual information for a continuous target variable.'''
#     X = X.copy()
#     for colname in X.select_dtypes(["object", "category"]):
#         X[colname], _ = X[colname].factorize()
#     # All discrete features should now have integer dtypes
#     discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
#     mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
#     mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
#     mi_scores = mi_scores.sort_values(ascending=False)
#     return mi_scores
# mi_scores = make_mi_scores(X, y)
# mi_scores.head()
# def plot_mi_scores(scores):
#     scores = scores.sort_values(ascending=True)
#     width = np.arange(len(scores))
#     ticks = list(scores.index)
#     plt.figure(figsize=(10,15))
#     clrs = ['grey' if (x < 0.01) else 'blue' for x in scores ]
#     plt.barh(width, scores, color=clrs)
#     plt.yticks(width, ticks)
#     plt.title("Mutual Information Scores")
# plot_mi_scores(mi_scores)

In [None]:
X = new_x_train2
# X = new_x_train2.drop(columns=['SalePrice'],axis=1)

y= x_train['SalePrice']


In [None]:
X

In [None]:
selected_feat = X.columns
print(selected_feat)

In [None]:
X = X[selected_feat].reset_index(drop=True)

In [None]:
X

In [None]:
y


In [None]:
x_test = x_test[selected_feat].reset_index(drop=True)

In [None]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.linear_model import LinearRegression
# lm = LinearRegression()
# lm.fit(X, y )
# Y_pred = lm.predict(x_test)
# acc_log = round(lm.score(X , y) * 100, 2)
# acc_log

In [None]:
# from sklearn.ensemble import GradientBoostingRegressor  #GBM algorithm
# from sklearn.model_selection import cross_val_score,GridSearchCV  #Additional scklearn functions and Performing grid search
# from sklearn.metrics import mean_absolute_error,mean_squared_error 

In [None]:
# param_test1 = {'n_estimators':range(1000,4000,1000)}
# gsearch1 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.05, min_samples_split=10,min_samples_leaf=15,max_depth=4,max_features='sqrt',random_state=5), 
# param_grid = param_test1, cv=5,)
# gsearch1.fit(X, y)

# gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
# gbm1 = GradientBoostingRegressor(random_state=5,n_estimators=1000,learning_rate=0.05,
#                                 max_depth=9,min_samples_split=17,max_features='sqrt',
#                                 min_samples_leaf=13,loss='huber')
# gbm1.fit(X,y)

# Y_pred = gbm1.predict(x_test)


# print ("Accuracy on train: {}".format(gbm1.score(X, y)))


In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees with a fixed random_state for reproducibility.
xgb0 = XGBRegressor(n_estimators=1000, learning_rate=0.05, gamma=0, subsample=0.75,max_depth=7,random_state=5,
                   min_child_weight=1,colsample_bytree=0.8)
xgb0.fit(X,y)

# Predictions are on the same scale as y.
Y_pred = xgb0.predict(x_test)


# For a regressor, score() is the coefficient of determination (R^2), not an
# accuracy, and it is measured on the training data here, so it says nothing
# about generalisation.
print ("R^2 on train: {}".format(xgb0.score(X, y)))

In [None]:
# SalePrice was transformed with np.log1p (log(1 + x)) before training, so the
# predictions must be mapped back with its exact inverse, np.expm1
# (exp(x) - 1). np.exp would overestimate every price by 1.
Y_pred2 = np.expm1(Y_pred)
Y_pred2

In [None]:
sample = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv")
sample

In [None]:
from IPython.display import HTML
# Overwrite the SalePrice column of the sample submission with the predictions in Y_pred2.
sample['SalePrice'] = Y_pred2

# Save the submission as CSV, without the DataFrame index column.
sample.to_csv('submission.csv', index=False)

def create_download_link(title = "Download CSV file", filename = "data.csv"):  
    """Build a clickable link to a file.

    Parameters
    ----------
    title : str
        Text shown for the link.
    filename : str
        Path used as the link target (href).

    Returns
    -------
    IPython.display.HTML
        An ``<a>`` element pointing to ``filename``.
    """
    from html import escape

    # Quote the href and escape both values so that file names or titles
    # containing spaces, quotes, '<' or '&' still produce valid markup.
    link = '<a href="{filename}">{title}</a>'.format(
        title=escape(str(title)),
        filename=escape(str(filename), quote=True),
    )
    return HTML(link)

# create a link to download the dataframe which was saved with .to_csv method
create_download_link(filename='submission.csv')