In [1]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import sklearn.model_selection as ms
import sklearn.preprocessing as pp
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

## Load Data Sets

In [2]:
# Read the training and test CSVs, using the 'Id' column as the row index.
df = pd.read_csv('./data/hp/train.csv', index_col='Id')
sdf = pd.read_csv('./data/hp/test.csv', index_col='Id')

# List every column of the training frame (only the last expression of a cell
# is displayed automatically, so the names are printed explicitly).
print(df.columns.values)

['MSSubClass' 'MSZoning' 'LotFrontage' 'LotArea' 'Street' 'Alley'
 'LotShape' 'LandContour' 'Utilities' 'LotConfig' 'LandSlope'
 'Neighborhood' 'Condition1' 'Condition2' 'BldgType' 'HouseStyle'
 'OverallQual' 'OverallCond' 'YearBuilt' 'YearRemodAdd' 'RoofStyle'
 'RoofMatl' 'Exterior1st' 'Exterior2nd' 'MasVnrType' 'MasVnrArea'
 'ExterQual' 'ExterCond' 'Foundation' 'BsmtQual' 'BsmtCond' 'BsmtExposure'
 'BsmtFinType1' 'BsmtFinSF1' 'BsmtFinType2' 'BsmtFinSF2' 'BsmtUnfSF'
 'TotalBsmtSF' 'Heating' 'HeatingQC' 'CentralAir' 'Electrical' '1stFlrSF'
 '2ndFlrSF' 'LowQualFinSF' 'GrLivArea' 'BsmtFullBath' 'BsmtHalfBath'
 'FullBath' 'HalfBath' 'BedroomAbvGr' 'KitchenAbvGr' 'KitchenQual'
 'TotRmsAbvGrd' 'Functional' 'Fireplaces' 'FireplaceQu' 'GarageType'
 'GarageYrBlt' 'GarageFinish' 'GarageCars' 'GarageArea' 'GarageQual'
 'GarageCond' 'PavedDrive' 'WoodDeckSF' 'OpenPorchSF' 'EnclosedPorch'
 '3SsnPorch' 'ScreenPorch' 'PoolArea' 'PoolQC' 'Fence' 'MiscFeature'
 'MiscVal' 'MoSold' 'YrSold' 'SaleType' 'SaleCondition' 'SalePrice']

In [3]:
# Regression target: the sale price column of the training frame.
y = df['SalePrice']

# Report the mean price as whole dollars with thousands separators.
mean_price = "${:,.0f}".format(y.mean())
print("Average sale price: " + mean_price)

Average sale price: $180,921


## Combine test and train for preprocessing

In [4]:
# Remove the target from the training frame so both frames share the same
# feature columns. errors='ignore' keeps the cell safe to re-run after
# 'SalePrice' has already been dropped.
df = df.drop('SalePrice', axis=1, errors='ignore')

# Stack training rows first, then test rows. DataFrame.append was removed in
# pandas 2.0; pd.concat is the supported equivalent.
all_df = pd.concat([df, sdf])
all_df.shape

(2919, 79)

## Create lists of categorical vs numeric features

In [5]:
# Every column of the training frame counts as a feature.
all_features = df.columns.values.tolist()

# Hand-picked columns treated as numeric; an alternative would be to derive
# them from the dtypes via df.select_dtypes(include=[np.number]).
# NOTE(review): '1stFlrSF' and '2ndFlrSF' are not listed here, so they end up
# in categorical_features below -- confirm that this is intended.
numeric_features = [
    'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
    'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'TotalBsmtSF', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'MiscVal',
]

# Anything not explicitly numeric is handled as categorical.
categorical_features = [name for name in all_features if name not in numeric_features]

(len(all_features), len(categorical_features), len(numeric_features))

(79, 53, 26)

## Preprocess numerical columns

In [6]:
# Keep only the numeric feature columns of the combined train+test frame.
numeric_df = all_df.loc[:, numeric_features]
numeric_df.shape

(2919, 26)

### Impute

In [7]:
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
X = numeric_df.to_numpy()

# Replace missing values with the most frequent value of each column.
# preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer is its
# successor and always works column-wise (the old axis=0 behaviour).
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X = imp.fit_transform(X)
X.shape

(2919, 26)

### Scale

In [8]:
scaler = pp.StandardScaler()
# Standardise every numeric column (zero mean, unit variance); the scaler is
# fitted on the same matrix it transforms.
X = scaler.fit_transform(X)

# Show the first row as a quick sanity check of the scaled values.
X[0, :]

array([-0.12767816, -0.21787869,  0.52903417,  0.58114542, -0.29302528,
       -0.93416481, -0.10119702,  0.41354722,  1.08733408, -0.24971861,
        0.78136581,  1.23259887,  0.16992709, -0.20769847,  0.98684937,
       -0.44307791, -0.92431121,  0.30641785,  0.34936362, -0.74076041,
        0.20000601, -0.35960075, -0.10333053, -0.28593546, -0.06315017,
       -0.08959196])

## Expand categorical into columns
Most of this code reorganizes the DataFrame so that each categorical value becomes its own binary (one-hot) column.

In [9]:
def process_categorical(ndf, df, categorical_features, prefix=False):
    """Append one-hot (dummy) columns for each categorical feature to ``ndf``.

    Parameters
    ----------
    ndf : pandas.DataFrame
        Frame the dummy columns are appended to (left-most columns of the
        result).
    df : pandas.DataFrame
        Frame holding the raw categorical columns.
    categorical_features : iterable of str
        Names of the columns of ``df`` to expand.
    prefix : bool, default False
        When False, each dummy column is named after the category value alone,
        so a value that occurs in several features yields repeated column
        labels. When True, columns are named ``<feature>_<value>``, which keeps
        every label unique.

    Returns
    -------
    pandas.DataFrame
        ``ndf`` followed by the dummy columns of every feature, in the order
        given, restricted to the index labels shared by ``ndf`` and ``df``.
    """
    # dtype=float keeps the indicators numeric so the combined frame converts
    # to a float matrix (recent pandas versions default to bool).
    dummy_frames = [
        pd.get_dummies(df[f], prefix=f if prefix else None, dtype=float)
        for f in categorical_features
    ]
    # One concat instead of a merge per feature: avoids re-copying the growing
    # frame on every iteration and the '_x'/'_y' suffixes (or MergeError) that
    # repeated merges produce when category labels collide.
    return pd.concat([ndf] + dummy_frames, axis=1, join='inner')

# Wrap the processed numeric matrix in a frame that shares all_df's index,
# then append the dummy columns of every categorical feature.
numeric_df = pd.DataFrame(X, index=all_df.index)
combined_df = process_categorical(numeric_df, all_df, categorical_features)

# Spot check: compare the raw 'SaleCondition' values with their dummy columns.
sale_condition = df['SaleCondition']
print(sale_condition.head())
print(set(sale_condition.values))
combined_df[['AdjLand', 'Family', 'Partial', 'Abnorml', 'Normal', 'Alloca']].head()

Id
1     Normal
2     Normal
3     Normal
4    Abnorml
5     Normal
Name: SaleCondition, dtype: object
{'Normal', 'Abnorml', 'Partial', 'Alloca', 'Family', 'AdjLand'}


Unnamed: 0_level_0,AdjLand,Family,Partial,Abnorml,Normal,Alloca
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0
5,0.0,0.0,0.0,0.0,1.0,0.0


In [10]:
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
X = combined_df.to_numpy()
X.shape

(2919, 2330)

## PCA

In [11]:
# PCA
# df holds only the training rows and they come first in X, so test_n (the row
# count of df) marks where the test rows begin.
test_n = df.shape[0]
x = X[:test_n, :]

# Fit the projection on the training rows only, then apply it to every row.
# PCA is unsupervised, so the labels are not needed for the fit.
pca = PCA()
pca.fit(x)
# NOTE: X is overwritten here; re-run the preceding cells before re-running
# this one, otherwise the already-projected matrix is projected again.
X = pca.transform(X)

X.shape

(2919, 1460)

## Split Data Sets

In [12]:
# Rows after the first test_n belong to the unlabelled test set.
x_test = X[test_n:, :]

# Hold out part of the labelled rows for validation (train_test_split's default
# hold-out fraction is used). A fixed random_state makes the split, and hence
# the scores reported below, reproducible between runs.
x_train, x_val, y_train, y_val = ms.train_test_split(
    X[:test_n, :], y, random_state=0
)


## Model testing

In [13]:
# Baseline model: ordinary least squares fitted on the training split.
# (linear_model is imported with the other dependencies at the top.)
lr = linear_model.LinearRegression()
lr.fit(x_train, y_train)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [14]:
# Ridge regression with scikit-learn's default settings, fitted on the same
# training split as the linear regression model.
ridge = linear_model.Ridge()
ridge.fit(x_train, y_train)


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

## Scoring

In [15]:
# Score both fitted models on the held-out validation split.
lr_score = lr.score(x_val, y_val)
print('Linear Regression score is %f' % lr_score)

ridge_score = ridge.score(x_val, y_val)
print('Ridge score is %f' % ridge_score)

Linear Regression score is 0.449709
Ridge score is 0.723280


## Write Submission Files

In [None]:
# Predict with the ridge model: `classifier` was never defined, and ridge is the
# model with the higher validation score in the recorded run.
y_submit = ridge.predict(x_test)

# Negative predictions are replaced with 1.
y_submit[y_submit < 0] = 1.

# Write only the prediction column; the 'Id' index becomes the first CSV column.
sdf['SalePrice'] = y_submit
sdf.to_csv('./submission.csv', columns=['SalePrice'])