### ***Import All Required Libraries***  

In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso, LassoCV

### ***Load Train and Test Data***  

In [97]:
# Lets bring in trained df for the 'SalePrice'
# train_df = pd.read_csv('../project_2-master/data/train.csv')
train_df = pd.read_csv('../data/train.csv')

In [98]:
# For the baseline score
test_df = pd.read_csv('../data/test.csv')

### ***Load Merged Train and Merged Test Data***  

In [99]:
# Lets read in the cleaned trained df
merged_train = pd.read_csv('../data/merged_train.csv')
# Lets read in the cleaned test df
merged_test = pd.read_csv('../data/merged_test.csv')

In [100]:
# SET 'Id' AS THE INDEX
merged_train.set_index('Id', inplace=True)
merged_test.set_index('Id', inplace=True)

### ***Baseline Score***  

In [558]:
# CREATING X AND y
X = merged_train
y = train_df['SalePrice']


# TRAIN, TEST, SPLIT
# train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, train_size = .70)

# BASELINE MODEL TO BEAT 
y_train_baseline = [y_train.mean()]*len(y_train)  
y_test_baseline  = [y_train.mean()]*len(y_test)

print('These are the scores to beat:')
print(f"Baseline RMSE - Train: {mean_squared_error(y_train, y_train_baseline)**0.5}")
print(f"Baseline RMSE - Test: {mean_squared_error(y_test, y_test_baseline)**0.5}")

These are the scores to beat:
Baseline RMSE - Train: 80039.93732744697
Baseline RMSE - Test: 77354.3252026887


### ***Investment Linear Regression***  

In [103]:
def linear_regression(train, test, saleprice):

# CREATING X AND y
    X = train
    y = saleprice['SalePrice']
    

# TRAIN, TEST, SPLIT
    # train, test, split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, train_size = .70)
    
    
# BACK TO OUR REAL MODEL
    #  Instantiate the model
    lr = LinearRegression()
    
    # Fit the model
    lr.fit(X_train, y_train)
    
    # Predict the model
    y_pred_train = lr.predict(X_train)
    y_pred_test = lr.predict(X_test)
    
    
    # Get the RMSE and R2
    print(f"The RMSE for X_train is {mean_squared_error(y_train, y_pred_train, squared=False)}")
    print(f"The RMSE for X_test is {mean_squared_error(y_test, y_pred_test, squared=False)}")
    print('='*60)
    print(f"The R2 for X_train is {lr.score(X_train, y_train)}")
    print(f"The R2 for X_test is {lr.score(X_test, y_test)}")

    
# KAGGLE SUBMISSION STEPS

    # Set my merged_test to kaggle
    kaggle_predict = merged_test
    
    # Predict 'SalePrice' with our model
    kaggle_predict['SalePrice'] = lr.predict(kaggle_predict)
        
    output = kaggle_predict[['SalePrice']]
    
    output.to_csv('../data/new_submission.csv')
    
    return output

In [104]:
linear_regression(merged_train, merged_test, train_df)

The RMSE for X_train is 28091.213742019958
The RMSE for X_test is 28408.522507882135
The R2 for X_train is 0.8768235938749571
The R2 for X_test is 0.8649846910579715


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 41 is different from 42)

### ***Ridge Regression***  

In [70]:
def ridge_regression(train, test, saleprice):

# CREATING X AND y
    X = train 
    y = saleprice['SalePrice']

# TRAIN, TEST, SPLIT
    # train, test, split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, train_size = .70)
    
    
# STANDARD SCALER
    # Relabeling scaled data as "Z" is common.
    sc = StandardScaler()
    Z_train = sc.fit_transform(X_train)
    Z_test = sc.transform(X_test)


    # Instantiate.
    ridge_model = Ridge(alpha=10)

    # Fit.
    ridge_model.fit(Z_train, y_train)

    # Evaluate model using R2.
    print(ridge_model.score(Z_train, y_train))
    print(ridge_model.score(Z_test, y_test))


# KAGGLE SUBMISSION STEPS

    # Set my merged_test to kaggle
    kaggle_predict = sc.transform(merged_test)
    
    # Predict 'SalePrice' with our model
    merged_test['SalePrice'] = ridge_model.predict(kaggle_predict)
    
    output = merged_test[['SalePrice']]
    
    output.to_csv('../data/ridge_submission.csv')
    
    return output

In [71]:
ridge_regression(merged_train, merged_test, train_df)

0.8726016016060045
0.8665882068256449


Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,138631.788525
2718,194434.925504
2414,191585.854340
1989,117213.745922
625,173131.096506
...,...
1662,175737.853329
1234,200463.739456
1373,137416.187101
1672,115707.872640


### ***Lasso Regression***  

In [105]:
# 4.02-lesson-regularization
# CREATING X AND y
X = merged_train 
y = train_df['SalePrice']

In [106]:
# TRAIN, TEST, SPLIT
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, train_size = .70)

In [107]:
# STANDARD SCALER
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)

In [108]:
# Set up a list of Lasso alphas to check.
l_alphas = np.logspace(-3, 0, 100)

# Cross-validate over our list of Lasso alphas.
lasso_cv = LassoCV(alphas=l_alphas, cv=5, max_iter=5000)

# Fit model using best ridge alpha!
lasso_cv.fit(Z_train, y_train);

In [109]:
# Here is the optimal value of alpha
lasso_cv.alpha_

1.0

In [110]:
print(lasso_cv.score(Z_train, y_train))
print(lasso_cv.score(Z_test, y_test))

0.8768234647262396
0.8650073225030833
