# Find best alpha by looping through random states

In [1]:
# Necessary imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import statsmodels.api as sm
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, lars_path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

sns.set(context='notebook', style='whitegrid', font_scale=1.2)

In [2]:
sf = pd.read_csv('data_files/sf_clean.csv')

# Reformatting hood_district names
sf['hood_district'] = sf['hood_district'].astype(int)
sf['hood_district'] = sf['hood_district'].astype(str)

In [3]:
sf = pd.get_dummies(sf)

In [4]:
cols_to_drop = ['laundry_(c) no laundry',
               'pets_(d) no pets', 
               'housing_type_(c) multi',
               'parking_(d) no parking',
               'hood_district_10']

sf = sf.drop(cols_to_drop, axis=1)

In [5]:
sf.head()

Unnamed: 0,price,sqft,beds,bath,laundry_(a) in-unit,laundry_(b) on-site,pets_(a) both,pets_(b) dogs,pets_(c) cats,housing_type_(a) single,...,parking_(c) off-street,hood_district_1,hood_district_2,hood_district_3,hood_district_4,hood_district_5,hood_district_6,hood_district_7,hood_district_8,hood_district_9
0,6800,1600.0,2.0,2.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,3500,550.0,1.0,1.0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,5100,1300.0,2.0,1.0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,9000,3500.0,3.0,2.5,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,3100,561.0,1.0,1.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Function to evaluate alphas

In [6]:
def find_best_alpha(rando_state_list, alphavec, X, y, model_type):
    '''
    Function to evalaute alphas at different random states for lasso or ridge models
    
    Inputs:  * list of random states
             * vector of possible alphas to evaluate
             * X - independent variables
             * y - target variable
             * model type: 'ridge' or 'lasso'
             
    Returns: For each random state
                * Best alpha for the given random state
                * Trains model using best alpha and returns: 
                    * Training r^2 value
                    * Validation r^2 value
                    * Training/Validation r^2 ratio
    '''
    
    print("Model Type: ", model_type)
    print("Alpha evaluation in progress...\n")    
    
    for r_state in rando_state_list:
        
        X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=r_state+1)
        
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.20, random_state=r_state+2)
        
        # Scalilng features
        scaler = StandardScaler()

        X_train_scaled = scaler.fit_transform(X_train.values)
        X_val_scaled = scaler.transform(X_val.values)
        X_test_scaled = scaler.transform(X_test.values)
            
        
        if model_type == 'lasso':
        
            # Lasso Model Evaluation
            lasso_model = LassoCV(alphas = alphavec, cv=5)
            lasso_model.fit(X_train_scaled, y_train)
            
            # Find best alpha 
            best_alpha = lasso_model.alpha_
        
            # Fit Model
            las = Lasso(alpha=best_alpha)

            las.fit(X_train_scaled, y_train)
            train_score = las.score(X_train_scaled, y_train)
            val_score = las.score(X_val_scaled, y_val)
            
        elif model_type == 'ridge':
            
            # Ridge Model evaluation
            # Lasso Model Evaluation
            ridge_model = RidgeCV(alphas = alphavec, cv=5)
            ridge_model.fit(X_train_scaled, y_train)
            
            # Find best alpha 
            best_alpha = ridge_model.alpha_
        
            # Fit Model
            rid = Ridge(alpha=best_alpha)

            rid.fit(X_train_scaled, y_train)
            train_score = rid.score(X_train_scaled, y_train)
            val_score = rid.score(X_val_scaled, y_val)
            
        else:
            print("~~~ No Other Models Supported ~~~")
            break
        
        print("Random State: ", r_state)
        print("~~~~~~~~~~~~~~~~~~")
        print("Best Alpha: ", best_alpha)
        print(f'Lasso Regression train R^2: {train_score:.5f}')
        print(f'Lasso Regression val R^2: {val_score:.5f}')
        print("")
        print(f'Train/Val R^2 Ratio: {train_score/val_score: .5f}')
        print("")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("")
        
    print("Evaluation complete.")    
        


## Model evaluations

In [7]:
X, y = sf.drop('price',axis=1), sf['price']

### Ridge

In [8]:
rando_states = [6, 11, 14, 27, 32, 37, 42, 400]

alphavec = 10**np.linspace(-2,2,500)

In [9]:
find_best_alpha(rando_states, alphavec, X, y, model_type='ridge')

Model Type:  ridge
Alpha evaluation in progress...

Random State:  6
~~~~~~~~~~~~~~~~~~
Best Alpha:  2.315951492743152
Lasso Regression train R^2: 0.79214
Lasso Regression val R^2: 0.73203

Train/Val R^2 Ratio:  1.08212

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  11
~~~~~~~~~~~~~~~~~~
Best Alpha:  24.591304254680438
Lasso Regression train R^2: 0.77178
Lasso Regression val R^2: 0.77619

Train/Val R^2 Ratio:  0.99433

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  14
~~~~~~~~~~~~~~~~~~
Best Alpha:  11.971713434189656
Lasso Regression train R^2: 0.75764
Lasso Regression val R^2: 0.80055

Train/Val R^2 Ratio:  0.94641

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  27
~~~~~~~~~~~~~~~~~~
Best Alpha:  2.151123430182165
Lasso Regression train R^2: 0.78149
Lasso Regression val R^2: 0.81222

Train/Val R^2 Ratio:  0.96217

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  32
~~~~~~~~~~~~~~~~~~
Best Alpha:  33.65534975507091
Lasso Regression train R^2: 0.75900
Lasso Regressio

In [10]:
rando_states = [6, 11, 14, 27, 32, 37, 42, 400]

alphavec = 10**np.linspace(-2,2,1000)

In [11]:
find_best_alpha(rando_states, alphavec, X, y, model_type='lasso')

Model Type:  lasso
Alpha evaluation in progress...

Random State:  6
~~~~~~~~~~~~~~~~~~
Best Alpha:  1.101645949633657
Lasso Regression train R^2: 0.79215
Lasso Regression val R^2: 0.73216

Train/Val R^2 Ratio:  1.08193

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  11
~~~~~~~~~~~~~~~~~~
Best Alpha:  2.0244465099768036
Lasso Regression train R^2: 0.77691
Lasso Regression val R^2: 0.77291

Train/Val R^2 Ratio:  1.00518

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  14
~~~~~~~~~~~~~~~~~~
Best Alpha:  0.8354528058382867
Lasso Regression train R^2: 0.75973
Lasso Regression val R^2: 0.80255

Train/Val R^2 Ratio:  0.94665

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  27
~~~~~~~~~~~~~~~~~~
Best Alpha:  0.9862658461312821
Lasso Regression train R^2: 0.78163
Lasso Regression val R^2: 0.81254

Train/Val R^2 Ratio:  0.96196

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  32
~~~~~~~~~~~~~~~~~~
Best Alpha:  45.67301270168747
Lasso Regression train R^2: 0.75107
Lasso Regressi