# Find best alpha by looping through random states

In [2]:
# Necessary imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

sns.set(context='notebook', style='whitegrid', font_scale=1.2)

## Data Import and Setup

In [3]:
sf = pd.read_csv('data_files/sf_clean.csv')

# Reformatting hood_district names
sf['hood_district'] = sf['hood_district'].astype(int)
sf['hood_district'] = sf['hood_district'].astype(str)

### Feature Selection & Engineering

#### Dropping Pets & Housing Type

In [4]:
sf = sf.drop(['pets', 'housing_type'], axis=1)

#### Dummy Variable Setup

In [5]:
sf = pd.get_dummies(sf)

In [6]:
cols_to_drop = ['laundry_(c) no laundry',
               #'pets_(d) no pets', 
               #'housing_type_(c) multi',
               'parking_(d) no parking',
               'hood_district_10']

sf = sf.drop(cols_to_drop, axis=1)

#### Quadratic Manipulation on Beds and Baths

In [7]:
sf['beds^2'] = sf['beds']**2
sf['bath^2'] = sf['bath']**2

In [8]:
sf.head()

Unnamed: 0,price,sqft,beds,bath,laundry_(a) in-unit,laundry_(b) on-site,parking_(a) valet,parking_(b) protected,parking_(c) off-street,hood_district_1,hood_district_2,hood_district_3,hood_district_4,hood_district_5,hood_district_6,hood_district_7,hood_district_8,hood_district_9,beds^2,bath^2
0,6800,1600.0,2.0,2.0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,4.0,4.0
1,3500,550.0,1.0,1.0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1.0,1.0
2,5100,1300.0,2.0,1.0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,4.0,1.0
3,9000,3500.0,3.0,2.5,1,0,0,1,0,0,0,0,0,0,0,1,0,0,9.0,6.25
4,3100,561.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1.0,1.0


## Function to evaluate alphas

In [18]:
def find_best_alpha(rando_state_list, alphavec, X, y, model_type):
    '''
    Function to evalaute alphas at different random states for lasso or ridge models
    
    Inputs:  * list of random states
             * vector of possible alphas to evaluate
             * X - independent variables
             * y - target variable
             * model type: 'ridge' or 'lasso'
             
    Returns: For each random state
                * Best alpha for the given random state
                * Trains model using best alpha and returns: 
                    * Training r^2 value
                    * Validation r^2 value
                    * Training/Validation r^2 ratio
    '''
    
    train_scores = []
    test_scores = []
    alphas = []
    rmse_list = []
    
    print("Model Type: ", model_type)
    print("Alpha evaluation in progress...\n")    
    
    for r_state in rando_state_list:
        
        X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=r_state)
        
        # Scalilng features
        scaler = StandardScaler()

        X_train_scaled = scaler.fit_transform(X.values)
        X_test_scaled = scaler.transform(X_test.values)   
            
        
        if model_type == 'lasso':
        
            # Lasso Model Evaluation
            model = LassoCV(alphas = alphavec, cv=3)
            model.fit(X_train_scaled, y)
            
            # Find best alpha 
            best_alpha = model.alpha_
        
            # Fit Model
            model = Lasso(alpha=best_alpha)

            model.fit(X_train_scaled, y)
            train_score = model.score(X_train_scaled, y)
            test_score = model.score(X_test_scaled, y_test)
            
            
        elif model_type == 'ridge':
            
            # Ridge Model evaluation
            # Lasso Model Evaluation
            model = RidgeCV(alphas = alphavec, cv=3)
            model.fit(X_train_scaled, y)
            
            # Find best alpha 
            best_alpha = model.alpha_
        
            # Fit Model
            model = Ridge(alpha=best_alpha)

            model.fit(X_train_scaled, y)
            train_score = model.score(X_train_scaled, y)
            test_score = model.score(X_test_scaled, y_test)
            
        else:
            print("~~~ No Other Models Supported ~~~")
            break
        
        
        # Add scores & alphas to list
        train_scores.append(train_score)
        test_scores.append(test_score)
        alphas.append(best_alpha)
        
        rmse = mean_squared_error(y_test, model.predict(X_test_scaled), squared=False)
        rmse_list.append(rmse)
        
        print("Random State: ", r_state)
        print("~~~~~~~~~~~~~~~~~~")
        print("Best Alpha: ", best_alpha)
        print(f'Lasso Regression train R^2: {train_score:.5f}')
        print(f'Lasso Regression val R^2: {test_score:.5f}')
        print("")
        print(f'Train/Val R^2 Ratio: {train_score/test_score: .5f}')
        print("")
        print("RMSE : ", rmse)
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("")
        
    print("Evaluation complete.\n")
    print("Summary\n")
    
    # Alphas: 
    print('Alphas: ', alphas)
    print(f'Avg alpha: {np.mean(alphas):.5f} +- {np.std(alphas):.5f}')
    print("")
    
    # Scores: 
    print(f'Mean training r^2: {np.mean(train_scores):.5f} +- {np.std(train_scores):.5f}')
    print(f'Mean test r^2: {np.mean(test_scores):.5f} +- {np.std(test_scores):.5f}')
    print(f'Avg train/test r^2 ratio: {(np.mean(train_scores)/np.mean(test_scores)):.5f}')
    print("")
    print(f'Avg RMSE: {np.mean(rmse_list):.5f}')
    

## Model evaluations

In [11]:
X, y = sf.drop('price',axis=1), sf['price']

In [23]:
rando_states = list(range(1, 600, 42))
rando_states

[1, 43, 85, 127, 169, 211, 253, 295, 337, 379, 421, 463, 505, 547, 589]

### Ridge

In [13]:
alphavec = 10**np.linspace(-2,2,500)

In [19]:
find_best_alpha(rando_states, alphavec, X, y, model_type='ridge')

Model Type:  ridge
Alpha evaluation in progress...

Random State:  1
~~~~~~~~~~~~~~~~~~
Best Alpha:  10.916317341936148
Lasso Regression train R^2: 0.77470
Lasso Regression val R^2: 0.78648

Train/Val R^2 Ratio:  0.98503

RMSE :  733.392495152199
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  43
~~~~~~~~~~~~~~~~~~
Best Alpha:  17.968383907677193
Lasso Regression train R^2: 0.76152
Lasso Regression val R^2: 0.79259

Train/Val R^2 Ratio:  0.96080

RMSE :  723.509671056913
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  85
~~~~~~~~~~~~~~~~~~
Best Alpha:  0.01
Lasso Regression train R^2: 0.79704
Lasso Regression val R^2: 0.66750

Train/Val R^2 Ratio:  1.19406

RMSE :  1117.0973395727324
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  127
~~~~~~~~~~~~~~~~~~
Best Alpha:  10.916317341936148
Lasso Regression train R^2: 0.80421
Lasso Regression val R^2: 0.74619

Train/Val R^2 Ratio:  1.07775

RMSE :  740.3069464490119
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  169
~~~~~~~~

### Lasso

In [20]:
alphavec = 10**np.linspace(-2,2,1000)

In [21]:
find_best_alpha(rando_states, alphavec, X, y, model_type='lasso')

Model Type:  lasso
Alpha evaluation in progress...

Random State:  1
~~~~~~~~~~~~~~~~~~
Best Alpha:  6.064329395408062
Lasso Regression train R^2: 0.77385
Lasso Regression val R^2: 0.78650

Train/Val R^2 Ratio:  0.98393

RMSE :  733.3619223282626
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  43
~~~~~~~~~~~~~~~~~~
Best Alpha:  13.27770829355429
Lasso Regression train R^2: 0.75355
Lasso Regression val R^2: 0.78465

Train/Val R^2 Ratio:  0.96036

RMSE :  737.2347085632913
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  85
~~~~~~~~~~~~~~~~~~
Best Alpha:  0.01
Lasso Regression train R^2: 0.79704
Lasso Regression val R^2: 0.66748

Train/Val R^2 Ratio:  1.19411

RMSE :  1117.1391010094062
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  127
~~~~~~~~~~~~~~~~~~
Best Alpha:  7.850456200204509
Lasso Regression train R^2: 0.80201
Lasso Regression val R^2: 0.73964

Train/Val R^2 Ratio:  1.08432

RMSE :  749.7943698903985
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Random State:  169
~~~~~~~~~