In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(font_scale=1.2)

sns.set(style="ticks", context="talk")
plt.style.use("dark_background")

%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LassoCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [33]:
housing_data_initial = pd.read_csv('datasets/train.csv')
test_data = pd.read_csv('datasets/test.csv')
housing_data_initial.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [6]:
housing_data_initial.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
Id                 2051 non-null int64
PID                2051 non-null int64
MS SubClass        2051 non-null int64
MS Zoning          2051 non-null object
Lot Frontage       1721 non-null float64
Lot Area           2051 non-null int64
Street             2051 non-null object
Alley              140 non-null object
Lot Shape          2051 non-null object
Land Contour       2051 non-null object
Utilities          2051 non-null object
Lot Config         2051 non-null object
Land Slope         2051 non-null object
Neighborhood       2051 non-null object
Condition 1        2051 non-null object
Condition 2        2051 non-null object
Bldg Type          2051 non-null object
House Style        2051 non-null object
Overall Qual       2051 non-null int64
Overall Cond       2051 non-null int64
Year Built         2051 non-null int64
Year Remod/Add     2051 non-null int64
Roof Style         20

In [8]:
housing_data_initial.drop(['Lot Frontage', 'Alley', 'Pool QC', 'Fence', 'Misc Feature'], axis=1, inplace=True)

In [10]:
X = housing_data_initial.drop('SalePrice',axis=1)
y = housing_data_initial['SalePrice']

In [12]:
X.fillna(0, inplace=True)

In [15]:
y.head()

0    130500
1    220000
2    109000
3    174000
4    138500
Name: SalePrice, dtype: int64

In [19]:
X_numerical = X.select_dtypes(['number'])

In [20]:
knn_pipe = Pipeline([
    ('ss' , StandardScaler()),
    ('knn' , KNeighborsClassifier())
])

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_numerical, y, test_size=.2,)

In [22]:
knn_pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('ss',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('knn',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=5, p=2,
                                      weights='uniform'))],
         verbose=False)

In [24]:
knn_pipe.score(X_train, y_train)

0.17865853658536585

In [26]:
knn_pipe_params = {
    'knn__leaf_size' : [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'knn__n_neighbors' : [1,2,3,4,5,6,7,8,9,10],
    'knn__weights' : ['uniform', 'distance'],
    'knn__p' : [1,2,3]
}

In [27]:
knn_pipe_gridsearch = GridSearchCV(knn_pipe,
                                   knn_pipe_params,
                                   cv=5,
                                   verbose=1)

In [29]:
knn_pipe_gridsearch.fit(X_train, y_train)

Fitting 5 folds for each of 600 candidates, totalling 3000 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 3000 out of 3000 | elapsed:  7.1min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('ss',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('knn',
                                        KNeighborsClassifier(algorithm='auto',
                                                             leaf_size=30,
                                                             metric='minkowski',
                                                             metric_params=None,
                                                             n_jobs=None,
                                                             n_neighbors=5, p=2,
                                                             weights='uniform'))],
                                verbose=False),
             ii

In [31]:
best_knn = knn_pipe_gridsearch.best_estimator_

In [32]:
best_knn.score(X_test, y_test)

0.014598540145985401

In [38]:
test_input = test_data.drop(['Lot Frontage', 'Alley', 'Pool QC', 'Fence', 'Misc Feature'], axis=1).select_dtypes(['number']).fillna(0)

In [40]:
predict = best_knn.predict(test_input)

In [42]:
predictions_df = pd.DataFrame({
    'Id':test_input['Id'],
    'SalePrice': predict
})
predictions_df.to_csv('test5_predictions.csv', index=False)

In [91]:
lasso_pipe = Pipeline([
    ('pn', PolynomialFeatures()),
    ('sc', StandardScaler()),
    ('ls', LassoCV())
])


In [93]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y)

In [96]:
lasso_pipe.fit(X_train, y_train)

  tol, rng, random, positive)


Pipeline(memory=None,
         steps=[('pn',
                 PolynomialFeatures(degree=2, include_bias=True,
                                    interaction_only=False, order='C')),
                ('sc',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('ls',
                 LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001,
                         fit_intercept=True, max_iter=1000, n_alphas=100,
                         n_jobs=None, normalize=False, positive=False,
                         precompute='auto', random_state=None,
                         selection='cyclic', tol=0.0001, verbose=False))],
         verbose=False)

In [97]:
lasso_pipe.score(X_test, y_test)

0.8575912622588329

In [87]:
best_lasso = lasso_gridsearch.best_estimator_

In [89]:
best_lasso.score(X_test1, y_test1)

ValueError: The number of features in X is different to the number of features of the fitted data. The fitted data had 24 features and the X has 75 features.

In [98]:
lasso1_predict = lasso_pipe.predict(test_input)

In [100]:
predictions_df = pd.DataFrame({
    'Id':test_input['Id'],
    'SalePrice': lasso1_predict
})
predictions_df.to_csv('test9_predictions.csv', index=False)