In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV , train_test_split , cross_val_score
from sklearn.metrics import classification_report , confusion_matrix


from sklearn.linear_model import LogisticRegression


from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc
import os
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir("../input/"))
# Any results you write to the current directory are saved as output.

['house-prices-advanced-regression-techniques']


In [2]:
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv' , index_col= 'Id')
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv'  , index_col= 'Id')
train = train[train.GrLivArea < 4500]
train.reset_index(drop=True, inplace=True)
label = train[['SalePrice']]
train.drop('SalePrice' , axis = 1 , inplace=True)
train.head(3)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal


In [3]:
label.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458 entries, 0 to 1457
Data columns (total 79 columns):
MSSubClass       1458 non-null int64
MSZoning         1458 non-null object
LotFrontage      1199 non-null float64
LotArea          1458 non-null int64
Street           1458 non-null object
Alley            91 non-null object
LotShape         1458 non-null object
LandContour      1458 non-null object
Utilities        1458 non-null object
LotConfig        1458 non-null object
LandSlope        1458 non-null object
Neighborhood     1458 non-null object
Condition1       1458 non-null object
Condition2       1458 non-null object
BldgType         1458 non-null object
HouseStyle       1458 non-null object
OverallQual      1458 non-null int64
OverallCond      1458 non-null int64
YearBuilt        1458 non-null int64
YearRemodAdd     1458 non-null int64
RoofStyle        1458 non-null object
RoofMatl         1458 non-null object
Exterior1st      1458 non-null object
Exterior2nd      1458 non-

In [5]:
numerical_col = []
cat_col = []
for x in train.columns:
    if train[x].dtype == 'object':
        cat_col.append(x)
        print(x+': ' + str(len(train[x].unique())))
    else:
        numerical_col.append(x)
        
print('CAT col \n', cat_col)
print('Numerical col\n')
print(numerical_col)

MSZoning: 5
Street: 2
Alley: 3
LotShape: 4
LandContour: 4
Utilities: 2
LotConfig: 5
LandSlope: 3
Neighborhood: 25
Condition1: 9
Condition2: 8
BldgType: 5
HouseStyle: 8
RoofStyle: 6
RoofMatl: 7
Exterior1st: 15
Exterior2nd: 16
MasVnrType: 5
ExterQual: 4
ExterCond: 5
Foundation: 6
BsmtQual: 5
BsmtCond: 5
BsmtExposure: 5
BsmtFinType1: 7
BsmtFinType2: 7
Heating: 6
HeatingQC: 5
CentralAir: 2
Electrical: 6
KitchenQual: 4
Functional: 7
FireplaceQu: 6
GarageType: 7
GarageFinish: 4
GarageQual: 6
GarageCond: 6
PavedDrive: 3
PoolQC: 4
Fence: 5
MiscFeature: 5
SaleType: 9
SaleCondition: 6
CAT col 
 ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'Kitche

In [6]:
numerical_col.remove('MSSubClass')
cat_col.append('MSSubClass')

In [7]:
train_num = train[numerical_col]
train_num.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,65.0,8450,7,5,2003,2003,196.0,706,0,150,...,548,0,61,0,0,0,0,0,2,2008
1,80.0,9600,6,8,1976,1976,0.0,978,0,284,...,460,298,0,0,0,0,0,0,5,2007
2,68.0,11250,7,5,2001,2002,162.0,486,0,434,...,608,0,42,0,0,0,0,0,9,2008
3,60.0,9550,7,5,1915,1970,0.0,216,0,540,...,642,0,35,272,0,0,0,0,2,2006
4,84.0,14260,8,5,2000,2000,350.0,655,0,490,...,836,192,84,0,0,0,0,0,12,2008


In [8]:
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
#imputer = Imputer(missing_values='NaN' , strategy='median' , axis = 0)
imputer = imputer.fit(train_num)
train_num = imputer.transform(train_num)

In [9]:
test_num = imputer.transform(test[numerical_col])

In [10]:
print(train_num.shape)
print(test_num.shape)

(1458, 35)
(1459, 35)


In [11]:
X_train , X_test , y_train , y_test=  train_test_split(train_num , label , test_size= 0.2 , random_state=123)

In [12]:
from sklearn.linear_model import LinearRegression
clf = LinearRegression(normalize=True)
scores = cross_val_score(clf, X_train, y_train, cv=5).mean()
scores

0.8490223203167366

In [13]:
from sklearn.linear_model import Lasso
clf = Lasso(alpha=0.3, normalize=True)
scores = cross_val_score(clf, X_train, y_train, cv=5).mean()
scores

0.8490697977664062

In [14]:
from sklearn.linear_model import ElasticNet
clf = ElasticNet(alpha=1, l1_ratio=0.5, normalize=False)
scores = cross_val_score(clf, X_train, y_train, cv=5).mean()
scores

0.8393453001148075

In [15]:
import xgboost
clf=xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
colsample_bytree=1, max_depth=7)
scores = cross_val_score(clf, X_train, y_train, cv=5).mean()
scores

0.8876227723410528

In [16]:
train_cat = train[cat_col]
test_cat = test[cat_col]
print(train_cat.info())
print(test_cat.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458 entries, 0 to 1457
Data columns (total 44 columns):
MSZoning         1458 non-null object
Street           1458 non-null object
Alley            91 non-null object
LotShape         1458 non-null object
LandContour      1458 non-null object
Utilities        1458 non-null object
LotConfig        1458 non-null object
LandSlope        1458 non-null object
Neighborhood     1458 non-null object
Condition1       1458 non-null object
Condition2       1458 non-null object
BldgType         1458 non-null object
HouseStyle       1458 non-null object
RoofStyle        1458 non-null object
RoofMatl         1458 non-null object
Exterior1st      1458 non-null object
Exterior2nd      1458 non-null object
MasVnrType       1450 non-null object
ExterQual        1458 non-null object
ExterCond        1458 non-null object
Foundation       1458 non-null object
BsmtQual         1421 non-null object
BsmtCond         1421 non-null object
BsmtExposure     1420

In [17]:
dropp = ['MiscFeature' , 'PoolQC' , 'Fence' ,'Alley' ]
train_cat.drop(columns=dropp , axis=1, inplace=True)

In [18]:
train_cat = train_cat.astype('category')
print(train_cat.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458 entries, 0 to 1457
Data columns (total 40 columns):
MSZoning         1458 non-null category
Street           1458 non-null category
LotShape         1458 non-null category
LandContour      1458 non-null category
Utilities        1458 non-null category
LotConfig        1458 non-null category
LandSlope        1458 non-null category
Neighborhood     1458 non-null category
Condition1       1458 non-null category
Condition2       1458 non-null category
BldgType         1458 non-null category
HouseStyle       1458 non-null category
RoofStyle        1458 non-null category
RoofMatl         1458 non-null category
Exterior1st      1458 non-null category
Exterior2nd      1458 non-null category
MasVnrType       1450 non-null category
ExterQual        1458 non-null category
ExterCond        1458 non-null category
Foundation       1458 non-null category
BsmtQual         1421 non-null category
BsmtCond         1421 non-null category
BsmtExposure 

In [19]:
test_cat.drop(columns=dropp , axis=1, inplace=True)
test_cat = test_cat.astype('category')
test_cat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 1461 to 2919
Data columns (total 40 columns):
MSZoning         1455 non-null category
Street           1459 non-null category
LotShape         1459 non-null category
LandContour      1459 non-null category
Utilities        1457 non-null category
LotConfig        1459 non-null category
LandSlope        1459 non-null category
Neighborhood     1459 non-null category
Condition1       1459 non-null category
Condition2       1459 non-null category
BldgType         1459 non-null category
HouseStyle       1459 non-null category
RoofStyle        1459 non-null category
RoofMatl         1459 non-null category
Exterior1st      1458 non-null category
Exterior2nd      1458 non-null category
MasVnrType       1443 non-null category
ExterQual        1459 non-null category
ExterCond        1459 non-null category
Foundation       1459 non-null category
BsmtQual         1415 non-null category
BsmtCond         1414 non-null category
BsmtExposu

In [20]:
most_freq = {}
for col in train_cat.columns:
    p = train_cat[col].mode()[0] 
    train_cat[col].fillna(p, inplace=True)
    most_freq[col] = p

In [21]:
for col in train_cat.columns:
    test_cat[col].fillna(most_freq[col], inplace=True)

In [22]:
print(test_cat.info())
print(train_cat.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 1461 to 2919
Data columns (total 40 columns):
MSZoning         1459 non-null category
Street           1459 non-null category
LotShape         1459 non-null category
LandContour      1459 non-null category
Utilities        1459 non-null category
LotConfig        1459 non-null category
LandSlope        1459 non-null category
Neighborhood     1459 non-null category
Condition1       1459 non-null category
Condition2       1459 non-null category
BldgType         1459 non-null category
HouseStyle       1459 non-null category
RoofStyle        1459 non-null category
RoofMatl         1459 non-null category
Exterior1st      1459 non-null category
Exterior2nd      1459 non-null category
MasVnrType       1459 non-null category
ExterQual        1459 non-null category
ExterCond        1459 non-null category
Foundation       1459 non-null category
BsmtQual         1459 non-null category
BsmtCond         1459 non-null category
BsmtExposu

In [23]:
train_cat.head(2)

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition,MSSubClass
0,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,Typ,Gd,Attchd,RFn,TA,TA,Y,WD,Normal,60
1,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal,20


In [24]:
test_cat.head(2)

Unnamed: 0_level_0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition,MSSubClass
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,RH,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,...,Typ,Gd,Attchd,Unf,TA,TA,Y,WD,Normal,20
1462,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,...,Typ,Gd,Attchd,Unf,TA,TA,Y,WD,Normal,20


In [25]:
train_num =pd.DataFrame(train_num)
train_num.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,65.0,8450.0,7.0,5.0,2003.0,2003.0,196.0,706.0,0.0,150.0,...,548.0,0.0,61.0,0.0,0.0,0.0,0.0,0.0,2.0,2008.0
1,80.0,9600.0,6.0,8.0,1976.0,1976.0,0.0,978.0,0.0,284.0,...,460.0,298.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0


In [26]:
test_num =pd.DataFrame(test_num)
test_num.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,80.0,11622.0,5.0,6.0,1961.0,1961.0,0.0,468.0,144.0,270.0,...,730.0,140.0,0.0,0.0,0.0,120.0,0.0,0.0,6.0,2010.0
1,81.0,14267.0,6.0,6.0,1958.0,1958.0,108.0,923.0,0.0,406.0,...,312.0,393.0,36.0,0.0,0.0,0.0,0.0,12500.0,6.0,2010.0


In [27]:
for col in train_cat:
    train_cat[col] = train_cat[col].cat.codes
for col in test_cat:
    test_cat[col] = test_cat[col].cat.codes

In [28]:
train_cat.head(2)

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition,MSSubClass
0,3,1,3,3,0,4,0,5,2,2,...,6,2,1,1,4,4,2,8,4,5
1,3,1,3,3,0,2,0,24,1,2,...,6,4,1,1,4,4,2,8,4,0


In [29]:
train_num.index = train_cat.index

In [30]:
test_num.index = test_cat.index

In [31]:
train_cat = pd.get_dummies(train_cat)
test_cat = pd.get_dummies(test_cat)

In [32]:
train_ = train_num.join(train_cat)

In [33]:
test_ = test_num.join(test_cat)

In [34]:
scalar = MinMaxScaler()
train_ = scalar.fit_transform(train_)
test_ = scalar.transform(test_)

In [35]:
# import xgboost
# clf=xgboost.XGBRegressor(n_estimators=1000, learning_rate=0.07, gamma=0, subsample=0.75,
# colsample_bytree=1, max_depth=7)
# scores = cross_val_score(clf, train_, label, cv=5).mean()
# scores

In [36]:
import lightgbm as lgb
lightgbm = lgb.LGBMRegressor(objective='regression', 
                                       num_leaves=8,
                                       learning_rate=0.03, 
                                       n_estimators=4000,
                                       max_bin=200, 
                                       bagging_fraction=0.75,
                                       bagging_freq=5, 
                                       bagging_seed=7,
                                       feature_fraction=0.2,
                                       feature_fraction_seed=7,
                                       verbose=-1,
                                       )
scores = cross_val_score(lightgbm, train_, label, cv=5).mean()
scores

0.91299711137643

In [37]:
clf.fit(train_ , label)
pre = clf.predict(test_)
submit = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')
submit.head()


submit.SalePrice = pre

submit.to_csv('submit.csv', index = False)

In [38]:
submit.head()

Unnamed: 0,Id,SalePrice
0,1461,128762.476562
1,1462,155308.84375
2,1463,186021.328125
3,1464,182738.75
4,1465,191706.0
