In [None]:
# Render matplotlib figures inline, below the cell that produces them.
%matplotlib inline

In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import skew
from sklearn import metrics
from sklearn.linear_model import Lasso

In [None]:
# Read the training and test splits from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics of the training frame (pandas `describe` defaults).
train.describe()

## Correlation plot

In [None]:
# Pairwise correlations between the numeric (and boolean) columns only.
# Selecting them explicitly keeps this working on pandas >= 2.0, where
# `DataFrame.corr` no longer silently skips non-numeric columns.
corr = train.select_dtypes(include=['number', 'bool']).corr()

# Generate a mask for the upper triangle: the matrix is symmetric, so the
# upper half (diagonal included) would only repeat the lower half.
# The builtin `bool` replaces the `np.bool` alias, which was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio on the axes created
# above. NOTE(review): vmax=.3 caps the colour scale at 0.3, so stronger
# correlations presumably all render in the same saturated colour.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title("Correlation between numeric features");

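Using this correlation plot, we can see which variables are most strongly related to the sale price and are therefore likely to matter for the prediction.
We can also use it to spot groups of highly correlated features and drop some of them to avoid multicollinearity, which makes the coefficients of a linear model more stable and can improve its accuracy.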

## Feature selection

In [None]:
# Remove outliers: rows with GrLivArea above 4000 but SalePrice below 300000.
# `train` is only filtered here and never written to afterwards, so the price
# threshold always sees raw prices and re-running this cell is safe.
train = train[~((train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000))]

# Stack the train and test feature columns so that every transformation and
# the dummy encoding below are applied identically to both.
all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                      test.loc[:, 'MSSubClass':'SaleCondition']))

# drop some features to avoid multicollinearity
all_data = all_data.drop(columns=['1stFlrSF', 'GarageArea', 'TotRmsAbvGrd'])

# Targets: raw prices for evaluation, log1p prices for fitting.
# Both are derived from `train` without modifying it, so `train["SalePrice"]`
# stays on the original scale.
y_true = train['SalePrice'].values
y = np.log1p(train['SalePrice'])

# Numeric feature columns; everything else is dummy-encoded further down.
numeric_feats = all_data.select_dtypes(include='number').columns

# Skewness is measured on the training rows only (missing values ignored);
# features above the 0.65 cut-off are log1p-transformed in train and test alike.
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.65]
skewed_feats = skewed_feats.index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# One-hot encode the remaining non-numeric columns.
all_data = pd.get_dummies(all_data)

# Fill missing values with the column mean.
# NOTE(review): the mean is taken over train and test rows together.
all_data = all_data.fillna(all_data.mean())

# The first len(train) rows of the stacked frame are the training rows.
X_train = all_data.iloc[:train.shape[0]]
X_test = all_data.iloc[train.shape[0]:]

## Model training

Go to the [Lasso documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html) and change the hyperparameters of the model in the cell below (for example `alpha`, the regularization strength).

In [None]:
## Modify this cell with your model and hyperparameters
# Baseline so that the cells below run on a fresh kernel. The values of alpha
# and max_iter are untuned starting points and are meant to be changed.
model = Lasso(alpha=0.0005, max_iter=10000)


In [None]:
# Fit the model on the training features against the log1p-transformed prices.
model.fit(X_train, y)

In [None]:
# Predict on the training rows, then invert the log1p transform applied to the
# target so the predictions are on the same scale as the raw prices.
log_scale_preds = model.predict(X_train)
preds = np.expm1(log_scale_preds)
preds

In [None]:
# Raw (untransformed) training sale prices, for comparison with `preds`.
y_true

In [None]:
# Sanity check: the two arrays should have the same length.
len(y_true), len(preds)

In [None]:
# Root-mean-squared error between the actual and predicted prices.
# Both come from the training rows, so this is an in-sample error rather than
# an estimate of performance on unseen data.
np.sqrt(metrics.mean_squared_error(y_true, preds))

In [None]:
# Persist the fitted model. The context manager closes the file handle as soon
# as the dump finishes instead of leaving it open until garbage collection.
with open("data/model.pickle", "wb") as model_file:
    pickle.dump(model, model_file)