In [107]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings("ignore")

In [108]:
# read in training dataframe
train = pd.read_csv('datasets/train.csv')
# display first 5 rows to evaluate
train.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [109]:
# clean column names with no spaces and all lower case
train.columns = train.columns.str.lower().str.replace(' ', '_')

In [110]:
def custom_pipe(df):
    # set X or independent variables
    X = df[['overall_cond', 
                'overall_qual', 
                'exter_qual',
                'exter_cond',
                'garage_finish',
                'garage_cond',
                'garage_area', 
                'bsmt_qual',
                'bsmt_cond',
                'total_bsmt_sf', 
                'bsmt_exposure', 
                'kitchen_qual', 
                'fireplace_qu', 
                'year_built',
                'year_remod/add',
                'full_bath',
                'bedroom_abvgr',
                'totrms_abvgrd',
                'garage_qual'
               ]]
    X.fillna(0, inplace=True)
    # set up a dict for mapping of ordinal data columns
    ordinal_dict = {
        'Ex' : 5,
        'Gd' : 4,
        'TA' : 3,
        'Fa' : 2,
        'Po' : 1,
        'NA' : 0,
        0    : 0}
    garage_finish_dct = {
        'Fin':3,
        'RFn':2,
        'Unf':1,
        'NA' :0,
        0    :0}
    bmt_exposure_dct = {
        'Gd':4,
        'Av':3,
        'Mn':2,
        'No':1,
        'NA':0,
        0   :0}
    
    #casting values based on the dictionary above 
    X['bsmt_cond'] = X['bsmt_cond'].map(ordinal_dict)
    X['bsmt_qual'] = X['bsmt_qual'].map(ordinal_dict)
    X['bsmt_exposure'] = X['bsmt_exposure'].map(bmt_exposure_dct)
    X['exter_cond'] = X['exter_cond'].map(ordinal_dict)
    X['exter_qual'] = X['exter_qual'].map(ordinal_dict)
    X['garage_qual'] = X['garage_qual'].map(ordinal_dict)
    X['garage_cond'] = X['garage_cond'].map(ordinal_dict)
    X['kitchen_qual'] = X['kitchen_qual'].map(ordinal_dict)
    X['garage_finish'] = X['garage_finish'].map(garage_finish_dct)
    X['fireplace_qu'] = X['fireplace_qu'].map(ordinal_dict)

    #feature extraction 
    X['year_built'] = 2010 - X['year_built']
    X['year_remod/add'] = 2010 - X['year_remod/add']
    
    #feature interactions
    X['bsmt_expos_qual'] = X['bsmt_exposure'] * X['total_bsmt_sf']
    X['total_garage']=X['garage_cond']* X['garage_area'] * X['garage_qual']
    X['total_basement'] = X['bsmt_qual'] * X['total_bsmt_sf']
    X['bedbathrmkitchen'] = X['full_bath'] * X['bedroom_abvgr'] * X['totrms_abvgrd'] 
    X['overalls'] = X['overall_cond'] * X['overall_qual']
    X['exter_overall'] = X['exter_cond'] * X['exter_qual']
    X['ages'] = X['year_built'] * X['year_remod/add']
    
    try:
        y = df['saleprice']
    except:
        y = 0
    return(X,y)

In [111]:
X,y = custom_pipe(train)

In [112]:
X_train,X_test,y_train,y_test = train_test_split(X, y)

In [113]:
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [114]:
linreg=LinearRegression()

In [115]:
linreg.fit(X_train_sc, y_train)

LinearRegression()

In [116]:
linreg.score(X_train_sc,y_train)

0.8179728029468467

In [117]:
linreg.score(X_test_sc,y_test)

0.8523760271423503

In [118]:
print(f"Training R2: {linreg.score(X_train_sc, y_train)}")
print(f"Testing R2: {linreg.score(X_test_sc, y_test)}")
print(f"Cross val R2: {cross_val_score(linreg, X, y).mean()}")

Training R2: 0.8179728029468467
Testing R2: 0.8523760271423503
Cross val R2: 0.8103234389232601


In [119]:
y_preds = linreg.predict(X_train)

In [120]:
mrse = metrics.mean_squared_error(y_true=y_train, y_pred=y_preds)**0.5
mrse

176084574.7390033

In [121]:
test_df = pd.read_csv('datasets/test.csv')

In [122]:
test_df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [123]:
# clean column names with no spaces and all lower case
test_df.columns = test_df.columns.str.lower().str.replace(' ', '_')

In [124]:
X,y = custom_pipe(test_df)

In [125]:
linreg.predict(X)

array([ 4.29544972e+07,  3.10965314e+08,  1.08083037e+08,  1.17431122e+08,
        2.23237299e+08,  7.10219745e+07,  1.07044353e+08,  1.58482582e+08,
        1.61761027e+08,  1.49282780e+08,  2.02976170e+08,  1.19969173e+08,
        1.56469996e+08,  2.74684514e+08,  9.02699792e+07,  2.08089127e+07,
        1.39060036e+08,  1.20291325e+08,  1.56387578e+08,  1.12370125e+08,
        1.08717134e+08,  1.34506114e+08,  2.59297140e+08,  9.33874780e+07,
        2.11821474e+08,  8.52892664e+07,  9.79516660e+07,  3.21763967e+07,
        8.69573585e+07,  2.27855027e+07,  1.32683125e+08,  1.01729070e+08,
        1.85808641e+08,  1.93573284e+08,  2.33014789e+08,  1.20101846e+08,
        1.06593401e+08,  1.37791120e+07,  3.94439742e+07,  2.31937364e+08,
        1.11314074e+08,  1.61949080e+08,  1.50864743e+08,  1.17440364e+08,
        2.06149452e+08,  5.19609999e+07,  1.63206154e+08,  9.69509813e+07,
        1.12354898e+08,  1.16945335e+08,  7.72147032e+07,  6.85160184e+07,
        2.49559552e+08,  

In [126]:
test_df['SalePrice'] = linreg.predict(X)

In [127]:
test_df[['id', 'SalePrice']].to_csv('datasets/preds.csv')