<a href="https://colab.research.google.com/github/quinn-dougherty/DS-Unit-2-Sprint-2-Linear-Regression/blob/master/module2-polynomial-regression/Copy_of_Polynomial_Log_linear_Regression_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intermediate Linear Regression Practice


## Use a Linear Regression model to get the lowest RMSE possible on the following dataset:

[Dataset Folder](https://github.com/ryanleeallred/datasets/tree/master/Ames%20Housing%20Data)

[Raw CSV](https://raw.githubusercontent.com/ryanleeallred/datasets/master/Ames%20Housing%20Data/train.csv)

## Your model must include (at least):
- A log-transformed y variable
- Two polynomial features
- One interaction feature
- 10 other engineered features

What is the lowest Root-Mean-Squared Error that you are able to obtain? Share your best RMSEs in Slack!

In [14]:
import pandas as pd
import seaborn as sns
import altair as alt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


# Ames housing training set, read straight from the raw GitHub URL.
url = "https://raw.githubusercontent.com/ryanleeallred/datasets/master/Ames%20Housing%20Data/train.csv"
df = pd.read_csv(url)

# Placeholder string; empty for now.
docs = ''

# Report (rows, columns), then show the first rows as the cell's output.
# For a per-column count of missing values, run: df.isna().sum()
print(df.shape)
df.head()

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [0]:

from sklearn.preprocessing import LabelEncoder

def dummyEncode(df):
    """Label-encode every ``category``/``object`` column of ``df``.

    Each selected column is replaced, in place, by the integer codes
    produced by ``LabelEncoder.fit_transform``. Columns of any other
    dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the encodable columns replaced by
        integer codes.

    Notes
    -----
    A column that cannot be encoded is reported on stdout together with
    the reason and left as it was; the remaining columns are still
    processed (best-effort behaviour).
    """
    columnsToEncode = list(df.select_dtypes(include=['category', 'object']))
    for feature in columnsToEncode:
        try:
            # fit_transform refits on every call, so a fresh encoder per
            # column is equivalent to reusing a single instance.
            df[feature] = LabelEncoder().fit_transform(df[feature])
        except Exception as err:
            # Exception (not a bare except) so Ctrl-C / SystemExit still
            # propagate; str.format also copes with non-string labels.
            print('Error encoding {}: {}'.format(feature, err))
    # The previous `df.getDummies()` is not a DataFrame method and raised
    # AttributeError on every call; return the encoded frame instead.
    return df

In [0]:
def LR(X, y, testsize=0.3):
  """Fit a LinearRegression on a train/test split and report hold-out metrics.

  X and y are split with ``train_test_split`` (``random_state=42``), the
  model is fitted on the training part and scored on the held-out part.
  RMSE, R^2, the coefficients and the intercept are printed.

  Parameters:
    X: feature table passed to ``train_test_split``.
    y: target values aligned with ``X``.
    testsize: value forwarded as ``test_size`` (default 0.3).

  Returns:
    dict with keys 'RMSE', 'R2', 'coefficients' and 'intercept'.
  """
  # Fixed seed so the same rows land in the hold-out set on every call.
  split = train_test_split(X, y, test_size=testsize, random_state=42)
  features_fit, features_holdout, target_fit, target_holdout = split

  regressor = LinearRegression()
  regressor.fit(features_fit, target_fit)

  # Score only on rows the model has not seen.
  holdout_predictions = regressor.predict(features_holdout)
  rmse = np.sqrt(mean_squared_error(target_holdout, holdout_predictions))
  r_squared = r2_score(target_holdout, holdout_predictions)

  print('RMSE is {}'.format(rmse))
  print('R^2 is {}'.format(r_squared))
  print("coefficients: ", regressor.coef_)
  print("intercepts: ", regressor.intercept_)

  return {
      'RMSE': rmse,
      'R2': r_squared,
      'coefficients': regressor.coef_,
      'intercept': regressor.intercept_,
  }

In [140]:
# Baseline: keep only the float/int columns, then discard every column that
# contains at least one missing value.
df2 = (
    df.select_dtypes(include=['float', 'int'])
      .dropna(axis='columns')
)

# Raw sale price is the target; every remaining column is a predictor.
y = df2['SalePrice']
X = df2.drop(columns='SalePrice')

each_numeric_feature = LR(X, y)

RMSE is 35803.233385786545
R^2 is 0.8163006166451179
coefficients:  [-2.27069494e+00 -2.01032862e+02  3.95140782e-01  1.88008225e+04
  2.93995375e+03  3.21625450e+02  1.80848171e+02  9.08878872e+00
 -5.36329103e+00 -6.27105990e-01  3.09839170e+00  1.10658816e+01
  1.26726161e+01  8.16960684e+00  3.19081045e+01  1.19331017e+04
  6.76005166e+01  4.37161486e+03 -1.66529393e+03 -9.60549983e+03
 -9.47311483e+03  5.34326868e+03  4.80848587e+03  1.46598382e+04
 -5.36677168e+00  2.27111035e+01 -1.84670026e+01  1.36407856e-01
  4.37006963e+01  7.28167496e+01 -2.87978103e+01 -2.08178895e+00
 -3.64068313e+01 -1.43115696e+02]
intercepts:  -757306.5937962856


In [145]:
from numpy.testing import assert_almost_equal

# Target: natural log of the sale price.
df2['ln_price'] = np.log(df2.SalePrice)

# Bathrooms: half baths count as 0.5.
# According to the data docs, FullBath and HalfBath only count above ground,
# so the basement baths are added on top.
df2['Total_bath'] = df2.BsmtFullBath + 0.5 * df2.BsmtHalfBath + df2.FullBath + 0.5 * df2.HalfBath
dropbath = ['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath']

# Interior area: total above-ground square footage, with low-quality finished
# area discounted by a hand-chosen weight.
dropSF = ['1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'SF_aboveground_interior']
df2['SF_aboveground_interior'] = df2['1stFlrSF'] + df2['2ndFlrSF']
df2['weighted_SF_aboveground_interior'] = df2['SF_aboveground_interior'] - 0.2 * df2.LowQualFinSF

# Porches: exterior area matters, but less than interior. The weights are
# arbitrary (based on which porch types sound fancier) and are listed in the
# same order as `drop_porch`.
drop_porch = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
porch_weights = [0.23, 0.25, 0.6, 0.65, 0.3]
df2['porch_weighted_area'] = np.dot(porch_weights, [df2[feat] for feat in drop_porch])

# Sanity check against the weights actually used above: the dot product must
# equal the explicit per-column weighted sum. (The earlier check compared a
# different, stale set of weights and so said nothing about this feature.)
assert_almost_equal(
    df2['porch_weighted_area'].to_numpy(),
    sum(weight * df2[feat] for weight, feat in zip(porch_weights, drop_porch)),
)

# Total weighted area: interior + porches + pool + garage (hand-chosen weights).
df2['SF_aboveground_weighted'] = (
    df2.weighted_SF_aboveground_interior
    + df2.porch_weighted_area
    + 0.78 * df2.PoolArea
    + 0.845 * df2.GarageArea
)
outdoor = drop_porch + ['PoolArea', 'GarageArea']

# Quality/condition as it relates to time: quality x condition divided by a
# squared, weighted age (70% years since remodel, 30% years since build).
# NOTE(review): the `+ df2.YearBuilt` term keeps the denominator non-zero when
# a house was built, remodelled and sold in the same year -- presumably
# intentional; confirm.
time_and_qual = ['OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'YrSold']
df2['complicated_quality'] = (np.divide(df2.OverallQual * df2.OverallCond,
                                       (0.7*(df2.YrSold - df2.YearRemodAdd) + 0.3*(df2.YrSold - df2.YearBuilt))**2 + df2.YearBuilt))

# Half the Euclidean norm of (quality, condition).
df2['polynomial_overall'] = np.divide(np.sqrt(df2.OverallQual ** 2 + df2.OverallCond ** 2), 2)

# Model on the log target. Drop both price columns (to avoid leaking the
# target), the row Id, and every raw column already folded into an
# engineered feature.
y = df2.ln_price

TO_DROP = ['SalePrice', 'ln_price', 'Id'] + dropbath + dropSF + outdoor + ['LotArea'] + time_and_qual
X = df2.drop(TO_DROP, axis=1)
print(X.shape)

fancy_model = LR(X, y, 0.225)

#X.columns

#df2[time_and_qual]


(1460, 19)
RMSE is 0.15761604807549534
R^2 is 0.857466894311116
coefficients:  [-5.16281498e-04  3.55066952e-05  2.31338877e-05  2.98797085e-05
  8.85203048e-05  7.06505755e-05 -2.92927256e-03 -1.21679847e-01
  1.75813410e-02  6.51435732e-02  1.26592500e-01 -2.89522985e-06
  1.40451666e-03  9.06043528e-02  9.63364919e-05  2.22394904e-04
 -4.42766646e-05  1.71231247e+01  6.75400867e-02]
intercepts:  10.794812254628777


In [0]:
# from scipy import stats
# y[(np.abs(stats.zscore(df)) < 3).all(axis=1)]

In [156]:
# Fit the same features against y and against np.log(y), then print the
# difference in each metric (single-log minus double-log).
# NOTE(review): LR computes RMSE in the units of whatever target it is given,
# so the two RMSE values are on different scales (y vs. log of y); the RMSE
# difference printed below is therefore not a like-for-like comparison.
with_single_log = LR(X,y, 0.225)
with_double_log = LR(X,np.log(y), 0.225)


print('difference in RMSE when when applying a second log to log(y): {:.4}'.format(with_single_log['RMSE']-with_double_log['RMSE']))
print('difference in R2 when when applying a second log to log(y): {:.4}'.format(with_single_log['R2']-with_double_log['R2']))

RMSE is 0.15761604807549534
R^2 is 0.857466894311116
coefficients:  [-5.16281498e-04  3.55066952e-05  2.31338877e-05  2.98797085e-05
  8.85203048e-05  7.06505755e-05 -2.92927256e-03 -1.21679847e-01
  1.75813410e-02  6.51435732e-02  1.26592500e-01 -2.89522985e-06
  1.40451666e-03  9.06043528e-02  9.63364919e-05  2.22394904e-04
 -4.42766646e-05  1.71231247e+01  6.75400867e-02]
intercepts:  10.794812254628777
RMSE is 0.0135011274810375
R^2 is 0.8494035153072341
coefficients:  [-4.11413086e-05  2.75197547e-06  2.17021508e-06  2.45203664e-06
  7.37422830e-06  5.55019722e-06  1.37236498e-04 -1.00547664e-02
  1.35488750e-03  5.52226975e-03  1.06844792e-02 -2.20364580e-07
  1.25059258e-04  7.67352317e-03  7.66669844e-06  1.81340114e-05
 -3.64232661e-06  1.41209331e+00  5.85273936e-03]
intercepts:  2.3830870767876693
difference in RMSE when when applying a second log to log(y): 0.1441
difference in R2 when when applying a second log to log(y): 0.008063


In [153]:
# Refit with a 22.5% hold-out and show only the RMSE as the cell's value.
LR(X, y, testsize=0.225)['RMSE']

RMSE is 0.15761604807549534
R^2 is 0.857466894311116
coefficients:  [-5.16281498e-04  3.55066952e-05  2.31338877e-05  2.98797085e-05
  8.85203048e-05  7.06505755e-05 -2.92927256e-03 -1.21679847e-01
  1.75813410e-02  6.51435732e-02  1.26592500e-01 -2.89522985e-06
  1.40451666e-03  9.06043528e-02  9.63364919e-05  2.22394904e-04
 -4.42766646e-05  1.71231247e+01  6.75400867e-02]
intercepts:  10.794812254628777


0.15761604807549534

In [0]:
from numpy.testing import assert_almost_equal
# Check that np.dot over the porch columns (in `drop_porch` order) matches the
# explicit per-column weighted sum for the same weights.
assert_almost_equal(
    np.dot([0.2, 0.25, 0.6, 0.55, 0.2], [df2[feat] for feat in drop_porch]),
    0.2 * df2.WoodDeckSF
    + 0.25 * df2.OpenPorchSF
    + 0.6 * df2.EnclosedPorch
    + 0.55 * df2['3SsnPorch']
    + 0.2 * df2.ScreenPorch,
)

In [105]:
# Scratch check: an expression wrapped in parentheses may continue across
# lines; this one evaluates to 10.
(1 + 2 + 
  3 + 4)

10