In [1]:
# Data Analysis Libraries
import csv
import numpy as np
import pandas as pd

# Machine Learning Libraries
from sklearn.preprocessing import LabelEncoder, power_transform
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [4]:
all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'], test.loc[:,'MSSubClass':'SaleCondition']))

In [5]:
for i in all_data.columns.values:
    if all_data[i].dtypes != "object":
        # If the data is 10% of it then we can categorize it and make it to str type.
        if (len(all_data[i].unique()) < (len(all_data[i])/10)):
            all_data[i] = all_data[i].astype(str)

        if all_data[i].isna().sum() > 0:
            all_data[i] = all_data[i].fillna(0)

    elif all_data[i].dtypes == "object":
        if all_data[i].isna().sum() > 0:
            all_data[i] = all_data[i].replace({np.nan: "None"})        

In [6]:
for i in all_data.columns.values:
    if len(all_data[i].unique()) < 50:
        print (all_data[i].unique())
    else:
        print(all_data[i].dtypes)

['60' '20' '70' '50' '190' '45' '90' '120' '30' '85' '80' '160' '75' '180'
 '40' '150']
['RL' 'RM' 'C (all)' 'FV' 'RH' 'None']
object
int64
['Pave' 'Grvl']
['None' 'Grvl' 'Pave']
['Reg' 'IR1' 'IR2' 'IR3']
['Lvl' 'Bnk' 'Low' 'HLS']
['AllPub' 'NoSeWa' 'None']
['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']
['Gtl' 'Mod' 'Sev']
['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']
['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']
['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe']
['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs']
['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '2.5Fin']
['7' '6' '8' '5' '9' '4' '10' '3' '1' '2']
['5' '8' '6' '7' '4' '2' '3' '9' '1']
object
object
['Gable' 'Hip' 'Gambrel' 'Mansard' 'Flat' 'Shed']
['CompShg' 'WdShngl' 'Metal' 'WdShake' 'Memb

In [7]:
for c in all_data.columns.values:
    if all_data[c].dtypes == "object":
        lbl = LabelEncoder()
        lbl.fit(list(all_data[c].values)) 
        all_data[c] = lbl.transform(list(all_data[c].values))

In [8]:
# apply log transform to target
train["SalePrice"] = np.log1p(train["SalePrice"])

In [9]:
# Split the total data into train and test data set.
train_x = all_data[:train.shape[0]]
test_x = all_data[train.shape[0]:]
train_y = train.SalePrice

In [10]:
# Find average RMSE using k-fold cross validation
def rmse_cv(model, cv=5):
    rmse = np.sqrt(-cross_val_score(model, train_x, train_y, scoring="neg_mean_squared_error", cv = cv))
    return rmse

In [15]:
# Get RMSE for linear regression model
lr = LinearRegression()
rmse = rmse_cv(lr)
print("RMSE estimate: {}, std: {}".format(rmse.mean(), rmse.std()))
lrm = LinearRegression().fit(train_x, train_y)
lrm_price_prediction = np.expm1(lrm.predict(test_x))

RMSE estimate: 0.15230955945050031, std: 0.024829105944629358


In [16]:
with open('house_price_submission.csv', 'w') as outcsv:
    writer = csv.writer(outcsv)
    writer.writerow(["Id", "SalePrice"])
    writer.writerows(zip(test['Id'].tolist(), lrm_price_prediction))