### 1. Data Loading

In [75]:
import pandas as pd

df_train = pd.read_csv('data/house_prices/train.csv')
df_test = pd.read_csv('data/house_prices/test.csv')
df_train.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### 2. Feature Engineering

In [76]:
# drop features by unique value
df = df_train.copy()
df_unique = df.loc[:, df.nunique(dropna=True)>1]
print('selected features: {}/{}'.format(df_unique.shape[1], df.shape[1]))

selected features: 81/81


In [77]:
# drop features by missing value proportion
df = df_unique.copy()
df_null = df.loc[:, df.isnull().mean()<0.3]
print('selected features: {}/{}'.format(df_null.shape[1], df.shape[1]))

selected features: 76/81


In [78]:
# imputation
df = df_null.copy()
df_imp = df.fillna(df.median(), inplace=False)
print('filled values: {}'.format(df.isnull().sum().sum()))

filled values: 868


In [79]:
# label encoder
from sklearn import preprocessing

df_le = df_imp.copy()
df_object = df_le.select_dtypes(include=['object']).fillna('NA')
le = preprocessing.LabelEncoder()
for col in df_object.columns:
    le.fit(df_object[col])
    df_le[col] = le.transform(df_object[col])
print('encoded features: {}/{}'.format(df_object.shape[1], df.shape[1]))

encoded features: 38/76


In [80]:
# train/test data splitting
from sklearn.model_selection import train_test_split

df = df_le.copy()
X = df[df.columns.difference(['SalePrice'])]
y = df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train.to_csv('data/house_prices/X_train.csv', index=False)
X_test.to_csv('data/house_prices/X_test.csv', index=False)
y_train.to_csv('data/house_prices/y_train.csv', index=False)
y_test.to_csv('data/house_prices/y_test.csv', index=False)

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


### 3. Modeling

In [81]:
# linear regression
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [82]:
import math
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

r2_variance_weighted = r2_score(y_test, y_pred, multioutput='variance_weighted')
r2_uniform_average = r2_score(y_test, y_pred, multioutput='uniform_average')
print('R squared:{:.2f}'.format(r2_variance_weighted))
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
print('root mean square error: {:.2f}'.format(rmse))

R squared:0.81
root mean square error: 37590.67
