### Modeling Notebook for Ames, Iowa Housing data. 

### Purpose of notebook is to best predict sale price for homes using multiple features.  

### Question looking to be answered is, "does using data from Ames Iowa, can we construct a model that will be flexible enough to guide home purchases based on specific housing characteristics?"

In [64]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [14]:
train_ames  = pd.read_csv('../train.csv', index_col="Id")
test_kaggle = pd.read_csv('../test.csv', index_col="Id")

train_ames.shape, test_kaggle.shape

((2051, 80), (879, 79))

In [15]:
X = train_ames.drop('SalePrice', axis=1)

y = train_ames['SalePrice']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## Basic Cleaning of data

In [17]:
train_ames.columns

Index(['PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Street',
       'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config',
       'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'House Style', 'Overall Qual', 'Overall Cond', 'Year Built',
       'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st',
       'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2',
       'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air',
       'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF',
       'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath',
       'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual',
       'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Fireplace Qu',
       'Garage Type', 'Garage Yr Blt', 'Garage Finish'

In [18]:
def drop_data(data):
    noneed_features = data
    dropped = data[['PID', 'Enclosed Porch', '3Ssn Porch',
              'Screen Porch', 'Pool Area', 
              'Pool QC', 'Fence', 'Misc Feature',
              'Misc Val', 'Mo Sold', 'Kitchen AbvGr',
              'TotRms AbvGrd', 'Bedroom AbvGr', 
              'Low Qual Fin SF', 'Electrical',
              'Heating QC', 'Bsmt Exposure',
              'Bsmt Qual', 'Mas Vnr Area',
              'Condition 2', 'Condition 1',
              'Lot Config', 'Land Slope',
              'Alley', 'Lot Shape', 'Land Contour', 'Fireplace Qu',
              'Lot Frontage', 'Mas Vnr Type'
]]
    noneed_features = data.drop(dropped, axis=1)
    return noneed_features

X_train = drop_data(X_train)
X_test = drop_data(X_test)
test_kaggle = drop_data(test_kaggle)

In [19]:
X_train.shape, X_test.shape, test_kaggle.shape

((1538, 50), (513, 50), (879, 50))

In [20]:
X_train.isnull().sum()

MS SubClass        0
MS Zoning          0
Lot Area           0
Street             0
Utilities          0
Neighborhood       0
Bldg Type          0
House Style        0
Overall Qual       0
Overall Cond       0
Year Built         0
Year Remod/Add     0
Roof Style         0
Roof Matl          0
Exterior 1st       0
Exterior 2nd       0
Exter Qual         0
Exter Cond         0
Foundation         0
Bsmt Cond         44
BsmtFin Type 1    44
BsmtFin SF 1       1
BsmtFin Type 2    45
BsmtFin SF 2       1
Bsmt Unf SF        1
Total Bsmt SF      1
Heating            0
Central Air        0
1st Flr SF         0
2nd Flr SF         0
Gr Liv Area        0
Bsmt Full Bath     2
Bsmt Half Bath     2
Full Bath          0
Half Bath          0
Kitchen Qual       0
Functional         0
Fireplaces         0
Garage Type       80
Garage Yr Blt     81
Garage Finish     81
Garage Cars        1
Garage Area        1
Garage Qual       81
Garage Cond       81
Paved Drive        0
Wood Deck SF       0
Open Porch SF

In [30]:
def null_fill(data):
    filled_data = data.copy()
    fillna_dict = {'Garage Qual': 'No_garage', 'Garage Cond': 'No_garage', 
     'Garage Finish': 'No_garage','Garage Type': 'No_garage',
     'Garage Yr Blt': 'No_garage', 'Garage Finish': 'No_garage',
     'BsmtFin Type 1': 'No_bsmt', 'BsmtFin SF 1': 'No_bsmt', 
     'BsmtFin Type 2': 'No_bsmt', 'BsmtFin SF 2': 'No_bsmt',
     'Bsmt Unf SF': 'No_bsmt', 'Bsmt Cond': 'No_bsmt'
    }
    filled_data = filled_data.fillna(fillna_dict)
    filled_data = filled_data.fillna(0)
    return filled_data

In [31]:
X_train = null_fill(X_train)
X_test = null_fill(X_test)
test_kaggle = null_fill(test_kaggle)

In [32]:
X_train.shape, X_test.shape, test_kaggle.shape

((1538, 50), (513, 50), (879, 50))

In [34]:
string_cols = X_train.select_dtypes(exclude=[np.number]).columns

In [35]:
X_train = pd.get_dummies(X_train, columns=string_cols)
X_test = pd.get_dummies(X_test, columns=string_cols)
test_kaggle = pd.get_dummies(test_kaggle, columns=string_cols)

In [36]:
X_train.shape, X_test.shape, test_kaggle.shape

((1538, 1956), (513, 1036), (879, 1411))

In [37]:
model_cols = X_train.columns

def add_model_cols(data, model_cols):
    new_data = data.copy()
    for missing_col in [col for col in model_cols if col not in data.columns]:
        new_data[missing_col] = 0
    return new_data

X_test = add_model_cols(X_test, model_cols=model_cols)
test_kaggle = add_model_cols(test_kaggle, model_cols=model_cols)

In [39]:
test_kaggle = test_kaggle[model_cols]
X_test      = X_test[model_cols]

In [40]:
X_train.shape, X_test.shape, test_kaggle.shape

((1538, 1956), (513, 1956), (879, 1956))

In [41]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test  = ss.transform(X_test)
test_kaggle = ss.transform(test_kaggle)

In [42]:
from sklearn.feature_selection import VarianceThreshold, SelectPercentile, f_regression

In [44]:
feature_variances = np.apply_along_axis(np.var, axis=0, arr= X_train)

perc_thresh = np.percentile(feature_variances, 90)
perc_thresh

vt = VarianceThreshold(threshold=perc_thresh)
X_train = vt.fit_transform(X_train)
X_test  = vt.transform(X_test)
test_kaggle   = vt.transform(test_kaggle)
print(X_train.shape[1])

102


In [49]:
# r_alphas = np.logspace(0, 5, 200) # generates 200 points between 10^0, 10^5
# ridge_model = RidgeCV(alphas=r_alphas, store_cv_values=True)
# ridge_model = ridge_model.fit(X_train, y_train)

# ridge_optimal_alpha = ridge_model.alpha_

# alpha = ridge_optimal_alpha
# ridge_model = Ridge(alpha=alpha)  # This replaces LinearRegression
# ridge_cv_mean_mse = -cross_val_score(ridge_model,
#                                   X_train,
#                                   y_train,
#                                   cv=5,
#                                   scoring='neg_mean_squared_error').mean()

# ridge_cv_mean_mse, ridge_cv_mean_mse**(1/2)

(5552420437.281146, 74514.5652693562)

In [65]:
alphas = np.logspace(2,4,20)
las = LassoCV(alphas=alphas, n_jobs=-1)
las.fit(X_train, y_train)
cv_scores = cross_val_score(las, X_train, y_train, cv=3).mean()
best_alpha = las.alpha_

In [51]:
y_pred_ridge = ridge_model.fit(X_train, y_train).predict(test_kaggle)

y_pred_ridge.shape

(879,)

In [None]:
y_pred_lasso = las.fit(X_train, y_train).predict(test_kaggle)

y_pred_lasso.shape

In [55]:
submit = pd.read_csv('../test.csv', index_col="Id")

In [56]:
submit['SalePrice'] = y_pred_ridge

submit = submit[['SalePrice']]

submit.to_csv('Submission13.csv')