# Find best alpha by looping through random states

In [1]:
# Necessary imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import statsmodels.api as sm
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, lars_path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

sns.set(context='notebook', style='whitegrid', font_scale=1.2)

## Data Import and Setup

In [2]:
sf = pd.read_csv('data_files/sf_clean.csv')

# Reformatting hood_district names
sf['hood_district'] = sf['hood_district'].astype(int)
sf['hood_district'] = sf['hood_district'].astype(str)

In [3]:
sf = pd.get_dummies(sf)

In [4]:
cols_to_drop = ['laundry_(c) no laundry',
               'pets_(d) no pets', 
               'housing_type_(c) multi',
               'parking_(d) no parking',
               'hood_district_10']

sf = sf.drop(cols_to_drop, axis=1)

In [5]:
sf.head()

Unnamed: 0,price,sqft,beds,bath,laundry_(a) in-unit,laundry_(b) on-site,pets_(a) both,pets_(b) dogs,pets_(c) cats,housing_type_(a) single,...,parking_(c) off-street,hood_district_1,hood_district_2,hood_district_3,hood_district_4,hood_district_5,hood_district_6,hood_district_7,hood_district_8,hood_district_9
0,6800,1600.0,2.0,2.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,3500,550.0,1.0,1.0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,5100,1300.0,2.0,1.0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,9000,3500.0,3.0,2.5,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,3100,561.0,1.0,1.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Function to evaluate alphas

In [6]:
def find_best_alpha(rando_state_list, alphavec, X, y, model_type):
    '''
    Function to evalaute alphas at different random states for lasso or ridge models
    
    Inputs:  * list of random states
             * vector of possible alphas to evaluate
             * X - independent variables
             * y - target variable
             * model type: 'ridge' or 'lasso'
             
    Returns: For each random state
                * Best alpha for the given random state
                * Trains model using best alpha and returns: 
                    * Training r^2 value
                    * Validation r^2 value
                    * Training/Validation r^2 ratio
    '''
    
    print("Model Type: ", model_type)
    print("Alpha evaluation in progress...\n")    
    
    for r_state in rando_state_list:
        
        X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=r_state+1)
        
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.20, random_state=r_state+2)
        
        # Scalilng features
        scaler = StandardScaler()

        X_train_scaled = scaler.fit_transform(X_train.values)
        X_val_scaled = scaler.transform(X_val.values)
        X_test_scaled = scaler.transform(X_test.values)
            
        
        if model_type == 'lasso':
        
            # Lasso Model Evaluation
            lasso_model = LassoCV(alphas = alphavec, cv=5)
            lasso_model.fit(X_train_scaled, y_train)
            
            # Find best alpha 
            best_alpha = lasso_model.alpha_
        
            # Fit Model
            las = Lasso(alpha=best_alpha)

            las.fit(X_train_scaled, y_train)
            train_score = las.score(X_train_scaled, y_train)
            val_score = las.score(X_val_scaled, y_val)
            
        elif model_type == 'ridge':
            
            # Ridge Model evaluation
            # Lasso Model Evaluation
            ridge_model = RidgeCV(alphas = alphavec, cv=5)
            ridge_model.fit(X_train_scaled, y_train)
            
            # Find best alpha 
            best_alpha = ridge_model.alpha_
        
            # Fit Model
            rid = Ridge(alpha=best_alpha)

            rid.fit(X_train_scaled, y_train)
            train_score = rid.score(X_train_scaled, y_train)
            val_score = rid.score(X_val_scaled, y_val)
            
        else:
            print("~~~ No Other Models Supported ~~~")
            break
        
        print("Random State: ", r_state)
        print("~~~~~~~~~~~~~~~~~~")
        print("Best Alpha: ", best_alpha)
        print(f'Lasso Regression train R^2: {train_score:.5f}')
        print(f'Lasso Regression val R^2: {val_score:.5f}')
        print("")
        print(f'Train/Val R^2 Ratio: {train_score/val_score: .5f}')
        print("")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("")
        
    print("Evaluation complete.")    
        


## Model evaluations

In [7]:
X, y = sf.drop('price',axis=1), sf['price']

In [8]:
rando_states = list(range(1, 200, 42))
rando_states

[1, 43, 85, 127, 169]

### Ridge

In [9]:
alphavec = 10**np.linspace(-2,2,500)

In [10]:
find_best_alpha(rando_states, alphavec, X, y, model_type='ridge')

Model Type:  ridge
Alpha evaluation in progress...

Random State:  1
~~~~~~~~~~~~~~~~~~
Best Alpha:  6.1599779603801625
Lasso Regression train R^2: 0.73314
Lasso Regression val R^2: 0.81107

Train/Val R^2 Ratio:  0.90391

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  43
~~~~~~~~~~~~~~~~~~
Best Alpha:  6.75552831533164
Lasso Regression train R^2: 0.74926
Lasso Regression val R^2: 0.74660

Train/Val R^2 Ratio:  1.00356

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  85
~~~~~~~~~~~~~~~~~~
Best Alpha:  9.076446072885357
Lasso Regression train R^2: 0.75687
Lasso Regression val R^2: 0.76281

Train/Val R^2 Ratio:  0.99221

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  127
~~~~~~~~~~~~~~~~~~
Best Alpha:  78.66681790071581
Lasso Regression train R^2: 0.73434
Lasso Regression val R^2: 0.77242

Train/Val R^2 Ratio:  0.95069

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  169
~~~~~~~~~~~~~~~~~~
Best Alpha:  57.480301581253514
Lasso Regression train R^2: 0.73398
Lasso Regressi

### Lasso

In [11]:
alphavec = 10**np.linspace(-2,2,1000)

In [12]:
find_best_alpha(rando_states, alphavec, X, y, model_type='lasso')

Model Type:  lasso
Alpha evaluation in progress...

Random State:  1
~~~~~~~~~~~~~~~~~~
Best Alpha:  0.5317723177850969
Lasso Regression train R^2: 0.73391
Lasso Regression val R^2: 0.81202

Train/Val R^2 Ratio:  0.90381

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  43
~~~~~~~~~~~~~~~~~~
Best Alpha:  0.9593608287093142
Lasso Regression train R^2: 0.75033
Lasso Regression val R^2: 0.74244

Train/Val R^2 Ratio:  1.01062

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  85
~~~~~~~~~~~~~~~~~~
Best Alpha:  9.615746001432095
Lasso Regression train R^2: 0.75431
Lasso Regression val R^2: 0.76344

Train/Val R^2 Ratio:  0.98803

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  127
~~~~~~~~~~~~~~~~~~
Best Alpha:  100.0
Lasso Regression train R^2: 0.72881
Lasso Regression val R^2: 0.75444

Train/Val R^2 Ratio:  0.96603

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  169
~~~~~~~~~~~~~~~~~~
Best Alpha:  13.904108340900697
Lasso Regression train R^2: 0.74748
Lasso Regression val R^2