In [236]:
import numpy as np

import pandas as pd
from pandas.api.types import CategoricalDtype

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Plot settings
plt.rcParams['figure.figsize'] = (12, 9)
plt.rcParams['font.size'] = 12


In [237]:
training_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [238]:
display(training_data.shape)
display(test_data.shape)
training_data.head()
training_data.columns

(1460, 81)

(1459, 80)

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [239]:
from sklearn import linear_model as lm

linear_model = lm.LinearRegression(fit_intercept=True)
training_data.groupby("SaleType")['SalePrice'].mean().sort_values()

SaleType
Oth      119850.000000
ConLD    138780.888889
ConLw    143700.000000
COD      143973.255814
WD       173401.836622
ConLI    200390.000000
CWD      210600.000000
Con      269600.000000
New      274945.418033
Name: SalePrice, dtype: float64

In [240]:
def remove_outliers(data, variable, lower=-np.inf, upper=np.inf):
    """
    Input:
      data (data frame): the table to be filtered
      variable (string): the column with numerical outliers
      lower (numeric): observations with values lower than this will be removed
      upper (numeric): observations with values higher than this will be removed
    
    Output:
      a winsorized data frame with outliers removed
      
    Note: This function should not change mutate the contents of data.
    """  
    return data[(data[variable]<upper)&(lower<data[variable])]

In [247]:
def add_total_bathrooms(data):
    """
    Input:
      data (data frame): a data frame containing at least 4 numeric columns 
            Bsmt_Full_Bath, Full_Bath, Bsmt_Half_Bath, and Half_Bath
    """
    with_bathrooms = data.copy()
    bath_vars = ['BsmtFullBath', 'FullBath', 'BsmtHalfBath', 'HalfBath']
    weights = pd.Series([1, 1, 0.5, 0.5], index=bath_vars)
    with_bathrooms = with_bathrooms.fillna(0)
    with_bathrooms['TotalBathrooms'] = (with_bathrooms['BsmtFullBath']
                                        +with_bathrooms['FullBath']) + (0.5*
                                        (with_bathrooms['BsmtHalfBath']+
                                        with_bathrooms['HalfBath']))
    return with_bathrooms

In [248]:
def select_columns(data, *columns):
    """Select only columns passed as arguments."""
    return data.loc[:, columns]

def process_data_gm(data):
    """Process the data for a guided model."""
    #data = remove_outliers(data, 'GrLivArea', upper=5000)
    
    # Transform Data, Select Features
    data = add_total_bathrooms(data)
    data = select_columns(data, 
                          'SalePrice', 
                          'GrLivArea', 
                          'TotalBathrooms',
                          'TotRmsAbvGrd',
                         )
    
    # Return predictors and response variables separately
    X = data.drop(['SalePrice'], axis = 1)
    y = data.loc[:, 'SalePrice']
    return X, y

In [249]:
training_data = remove_outliers(training_data, "GrLivArea",upper=5000)
X_train, Y_train = process_data_gm(training_data)
X_test, y_test = process_data_gm(test_data)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_lowerdim(tup)


In [250]:
linear_model.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [251]:
y_fitted = linear_model.predict(X_train)

In [252]:
test_predictions = linear_model.predict(X_test)
test_predictions.size

1459

In [204]:
from datetime import datetime

submission_df = pd.DataFrame({
    "Id": test_data['Id'], 
    "SalePrice": test_predictions,
}, columns=['Id', 'SalePrice'])
display(submission_df)
timestamp = datetime.isoformat(datetime.now()).split(".")[0]
submission_df.to_csv("submission_{}.csv".format(timestamp), index=False)

Unnamed: 0,Id,SalePrice
0,1461,109492.976072
1,1462,148578.567022
2,1463,210670.027171
3,1464,198971.949183
4,1465,180898.471572
5,1466,204626.899218
6,1467,161660.460204
7,1468,183559.438304
8,1469,158835.187159
9,1470,116866.680504
