In [213]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import plotly.express as px
import time

In [214]:
import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv("train.csv")
val = pd.read_csv("val.csv")
# get training, validation, and testing data
x_train, y_train = train.drop(columns = ["interest_rate"]), train["interest_rate"]
x_val, y_val = val.drop(columns = ["interest_rate"]), val["interest_rate"]
x_train.shape, x_val.shape, y_train.shape, y_val.shape

train.isna().sum()
x_train.shape[1]

48

In [215]:
# Linear Regression
from sklearn.linear_model import LinearRegression
linreg = LinearRegression().fit(x_train, y_train)
pred = linreg.predict(x_val)
mse(pred, y_val)


0.4272442699890523

In [216]:
# Random Forest with no hypterparameter tuning
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(x_train, y_train)
pred = regr.predict(x_val)
mse(pred, y_val)

0.5094010171106792

In [231]:
# finding hyperparameters
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestRegressor(random_state = 42)

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

bootstrap = [True, False]

# Create the random search grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 1, cv = 3, verbose=10, random_state=42, n_jobs = -1)

rf_random.fit(x_train, y_train)



Fitting 3 folds for each of 1 candidates, totalling 3 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(random_state=42),
                   n_iter=1, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=10)

In [232]:
optimised_random_forest = rf_random.best_params_

start = time.time()
bestRF = RandomForestRegressor(**optimised_random_forest)
bestRF.fit(x_train, y_train)
pred = bestRF.predict(x_train)
print(mse(pred, y_train))
pred = bestRF.predict(x_val)
mse(pred, y_val)


0.18778800408627808


0.3423353532337942

In [233]:

person1 = x_val.iloc[0].copy()
person1['White'] = 0

person2 = person1.copy()
person2['Black or African American'] = 1

person3 = person1.copy()
person3['Asian'] = 1

person4 = person3 = person1.copy()
person4['Native Hawaiian or Other Pacific Islander'] = 1

person5 = person3 = person1.copy()
person5['2 or more minority races'] = 1

person1['White'] = 1

people = pd.DataFrame([person1, person2, person3, person4, person5])

people.fillna(0, inplace=True)

bestRF.predict(people)

#pred1 = bestRF.predict(person1.values.reshape(1, -1))
#pred2 = bestRF.predict(person2.values.reshape(1, -1))
#pred1, pred2
#person1

array([3.2256404 , 3.21662381, 3.20565427, 3.22533376, 3.20565427])

In [219]:
# Principal Component Analysis
from sklearn.decomposition import PCA
pca = PCA(n_components=3, svd_solver='full')
pca.fit(x_train)
print(sum(pca.explained_variance_ratio_))

0.9998253533715785


In [220]:
x_train_reduced = pca.transform(x_train)
print(x_train_reduced.shape)
PCR = LinearRegression()
PCR.fit(x_train_reduced, y_train)
x_val_reduced = pca.transform(x_val)
pred = PCR.predict(x_val_reduced)
mse(pred, y_val)

(89884, 3)


0.8673074000933698

In [224]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit only to the training data
scaler.fit(x_train)

x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_val)

In [222]:
#Neural net
start = time.time()
from sklearn.neural_network import MLPRegressor

net1 = MLPRegressor(activation='relu',
       hidden_layer_sizes=(48,10,10,10,10), solver='adam', max_iter=1000, learning_rate_init=0.005)
net1.fit(x_train_scaled, y_train)
pred = net1.predict(x_train_scaled)
print(f"time: {time.time() - start}")
mse(pred, y_train)


time: 72.91286015510559


0.29571959874033305

In [226]:
x_val_scaled = scaler.transform(x_val)
x_val_scaled = pd.DataFrame(x_val_scaled)
pred = net1.predict(x_val_scaled)
mse(pred, y_val)

0.34809801900744075

In [112]:
scaler1 = StandardScaler()
# Fit only to the training data
scaler1.fit(x_train_reduced)

#Neural net with pca reduced data
start = time.time()
x_train_reduced = scaler1.transform(x_train_reduced)
x_val_reduced = scaler1.transform(x_val_reduced)
x_train_reduced.shape

(89884, 20)

In [114]:

net2 = MLPRegressor(activation='relu',
       hidden_layer_sizes=(20,10,10,10,10,10,10,10), solver='adam', max_iter=1000, learning_rate_init=0.01)
net2.fit(x_train_reduced, y_train)
pred = net2.predict(x_train_reduced)
print(f"time: {time.time() - start}")
mse(pred, y_train)

time: 80.4990611076355


0.3390581039201234

In [118]:
x_val_reduced.shape, x_train_reduced.shape

((11235, 20), (89884, 20))

In [119]:
pred_val = net2.predict(x_val_reduced)
mse(pred_val, y_val)

0.36374361696130897