In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
from scipy.special import boxcox1p
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso, LassoCV
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split
import warnings

In [2]:
warnings.filterwarnings("ignore")

In [3]:
train = pd.read_csv("/Users/aethertsai/Desktop/python/house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("/Users/aethertsai/Desktop/python/house-prices-advanced-regression-techniques/test.csv")
submit = pd.read_csv("/Users/aethertsai/Desktop/python/house-prices-advanced-regression-techniques/sample_submission.csv")
print(train.shape)
print(test.shape)

(1460, 81)
(1459, 80)


In [4]:
train = train[train['GrLivArea'] < 4000]

In [5]:
houses=pd.concat([train,test], sort=False)

In [6]:
category = [var for var in houses.columns if houses[var].dtype == "O"]
numerical = [var for var in houses.columns if houses[var].dtype != "O"]

In [7]:
houses.select_dtypes(include='object').head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [8]:
houses.select_dtypes(include=['float','int']).head()


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706.0,...,0,61,0,0,0,0,0,2,2008,208500.0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978.0,...,298,0,0,0,0,0,0,5,2007,181500.0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486.0,...,0,42,0,0,0,0,0,9,2008,223500.0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216.0,...,0,35,272,0,0,0,0,2,2006,140000.0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655.0,...,192,84,0,0,0,0,0,12,2008,250000.0


In [9]:
houses.select_dtypes(include='object').isnull().sum()[houses.select_dtypes(include='object').isnull().sum()>0]

MSZoning           4
Alley           2717
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        24
BsmtQual          81
BsmtCond          82
BsmtExposure      82
BsmtFinType1      79
BsmtFinType2      80
Electrical         1
KitchenQual        1
Functional         2
FireplaceQu     1420
GarageType       157
GarageFinish     159
GarageQual       159
GarageCond       159
PoolQC          2907
Fence           2345
MiscFeature     2810
SaleType           1
dtype: int64

In [10]:
houses = houses.drop(columns = ["Alley"])
category.remove("Alley")

In [11]:
for col in category:
    if houses[col].isnull().sum() > 30:
        houses[col]=houses[col].fillna('None')

In [12]:
houses.describe(include = "O")

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
count,2911,2915,2915,2915,2913,2915,2915,2915,2915,2915,...,2915,2915,2915,2915,2915,2915.0,2915.0,2915.0,2914,2915
unique,5,2,4,4,2,5,3,25,9,8,...,7,4,6,6,3,4.0,5.0,5.0,9,6
top,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal
freq,2261,2903,1859,2620,2912,2132,2774,443,2509,2886,...,1720,1230,2600,2650,2637,2907.0,2345.0,2810.0,2523,2401


In [13]:
for col in category:
    if houses[col].isnull().sum() < 30 and houses[col].isnull().sum() > 0:
        houses[col] = houses[col].fillna(houses[col].value_counts().idxmax())
        

In [14]:
houses.select_dtypes(include=['int','float']).isnull().sum()[houses.select_dtypes(include=['int','float']).isnull().sum()>0]


LotFrontage      486
MasVnrArea        23
BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
BsmtFullBath       2
BsmtHalfBath       2
GarageYrBlt      159
GarageCars         1
GarageArea         1
SalePrice       1459
dtype: int64

In [15]:
for col in (
    'MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF',
    'BsmtFullBath','BsmtHalfBath','GarageYrBlt','GarageCars','GarageArea'):
    houses[col] = houses[col].fillna(0)

In [16]:
houses['LotFrontage'] = houses['LotFrontage'].fillna(houses['LotFrontage'].mean())


In [3]:
#for i in numerical:
#    fig, axis = plt.subplots()
#    fig.set_size_inches(20,10)
#    plt.scatter(houses[i],houses["SalePrice"])
#    plt.xlabel(i)
#    plt.ylabel("SalePrice")

In [4]:
#for i in numerical:
#    fig, axis = plt.subplots()
#    fig.set_size_inches(10,20)
#    fit = sns.boxplot(y = houses[i])
#    fit.set_xlabel(i)

In [5]:
#for i in numerical:
#    fig, axis = plt.subplots()
#    fig.set_size_inches(20,10)
#    fit = sns.distplot(houses[i].dropna())
#    fit.set_xlabel(i)
    

In [6]:
#plt.figure(figsize=[30,15])
#sns.heatmap(houses.corr(), annot=True)

In [21]:
houses.drop(['GarageArea','1stFlrSF','TotRmsAbvGrd','2ndFlrSF'], axis=1, inplace=True)

In [22]:
houses['MSSubClass']=houses['MSSubClass'].astype(str)

In [23]:
skew_All=houses.select_dtypes(include=['int','float']).apply(
    lambda x: skew(x.dropna())).sort_values(ascending=False)

In [24]:
skew_df=pd.DataFrame({'Skew':skew_All})
skewed_df=skew_df[(skew_df['Skew']>0.5)|(skew_df['Skew']<0.5)]

In [25]:
skewed_df.index

Index(['MiscVal', 'PoolArea', 'LotArea', 'LowQualFinSF', '3SsnPorch',
       'KitchenAbvGr', 'BsmtFinSF2', 'EnclosedPorch', 'ScreenPorch',
       'BsmtHalfBath', 'MasVnrArea', 'OpenPorchSF', 'WoodDeckSF', 'SalePrice',
       'LotFrontage', 'GrLivArea', 'BsmtFinSF1', 'BsmtUnfSF', 'Fireplaces',
       'HalfBath', 'TotalBsmtSF', 'BsmtFullBath', 'OverallCond',
       'BedroomAbvGr', 'MoSold', 'OverallQual', 'FullBath', 'YrSold', 'Id',
       'GarageCars', 'YearRemodAdd', 'YearBuilt', 'GarageYrBlt'],
      dtype='object')

In [26]:
lam=0.1
for col in ('MiscVal', 'PoolArea', 'LotArea', 'LowQualFinSF', '3SsnPorch',
       'KitchenAbvGr', 'BsmtFinSF2', 'EnclosedPorch', 'ScreenPorch',
       'BsmtHalfBath', 'MasVnrArea', 'OpenPorchSF', 'WoodDeckSF',
       'LotFrontage', 'GrLivArea', 'BsmtFinSF1', 'BsmtUnfSF', 'Fireplaces',
       'HalfBath', 'TotalBsmtSF', 'BsmtFullBath', 'OverallCond', 'YearBuilt',
       'GarageYrBlt'):
    houses[col]=boxcox1p(houses[col],lam)
    

In [27]:
houses=pd.get_dummies(houses)

In [28]:
Train = houses[pd.notnull(houses["SalePrice"])].sort_values(by = ["Id"])
Test = houses[~pd.notnull(houses["SalePrice"])].sort_values(by = ["Id"])


In [29]:
Train.drop('Id', axis=1, inplace=True)
Test.drop('Id', axis=1, inplace=True)

In [30]:
x = Train.drop(columns = "SalePrice")
y = Train['SalePrice']
Test = Test.drop(columns = ["SalePrice"])

In [31]:
sc=RobustScaler()
x=sc.fit_transform(x)
Test=sc.transform(Test)

In [32]:
Train['SalePrice']=np.log(Train['SalePrice'])

In [33]:
model=Lasso(alpha =0.001, random_state=1)

In [34]:
model.fit(x,y)


Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=1,
      selection='cyclic', tol=0.0001, warm_start=False)

In [35]:
pred=model.predict(Test)
preds=np.exp(pred)

In [36]:
from sklearn.linear_model import LinearRegression

In [37]:
regr = LinearRegression()

In [38]:
submit['SalePrice'] = preds
submit['SalePrice'] = submit['SalePrice'].astype(int)
submit.to_csv('submit.csv', index= False)