In [1]:
import warnings
warnings.filterwarnings("ignore")



import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
sns.set(color_codes = True)

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

# Single seed shared by the train/test split and the tree-ensemble estimators
# in this notebook, so a fresh run reproduces the same split and fits.
RANDOM_STATE = 2341

In [2]:
# Load the previously cleaned dataset; the path is relative to the working
# directory the notebook is run from.
df_dummy = pd.read_csv('./datasets/cleaned_dataframe.csv')

In [5]:
# Feature matrix: every column except the target (SalaryUSD) and the
# Timestamp / Survey Year columns.
X = df_dummy.drop(
    columns=['SalaryUSD', 'Timestamp', 'Survey Year'],
)

#Create separate object for target variable

In [6]:
# Target vector: the column the models below are trained to predict.
y = df_dummy['SalaryUSD']

#### Train and Test Splits

# Hold out a test set (train_test_split's default proportions) so each model
# can be scored on rows it was not fitted on.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)

#### Data Scaling

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to the test rows.
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

#With all our models, we will use RMSE as the regression metric to measure the success of our models. RMSE works well for us because it is given in the units of our target variable and can easily tell us the error in our predictions. Our goal is to minimize the RMSE as much as possible to get it close to 0.

def rmse(model, X, y):
    """Return the root-mean-squared error of ``model``'s predictions on ``X``.

    Parameters
    ----------
    model : object with a ``predict`` method
        Already-fitted estimator (or search object) used to predict from ``X``.
    X : feature matrix handed to ``model.predict``.
    y : true target values compared against the predictions.

    Returns
    -------
    The square root of ``mean_squared_error(y, model.predict(X))``, i.e. an
    error expressed in the same units as ``y``.
    """
    return np.sqrt(mean_squared_error(y, model.predict(X)))

In [7]:
# Baseline: a regressor that always predicts the mean of the training target.
# ``fit`` returns the estimator itself, so it can be chained onto the constructor.
base = DummyRegressor(strategy="mean").fit(X_train, y_train)

# Baseline error on the training and the held-out rows; the tuned models
# below are compared against these numbers.
print("Baseline Train RMSE: ", rmse(base, X_train, y_train))
print("Baseline Test RMSE: ", rmse(base, X_test, y_test))

Baseline Train RMSE:  57965.304395185034
Baseline Test RMSE:  57179.03514526216


In [8]:
# Hyper-parameter grid for the gradient-boosting search.
tuned_params = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [20, 50, 70],
    'max_depth': [None, 3, 5, 7],
    # 'auto' is intentionally not listed: for GradientBoostingRegressor it
    # means "use all features", exactly like None, so it only duplicated a
    # fifth of the fits (and newer scikit-learn releases reject the value).
    'max_features': [.25, .5, .75, None],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 5],
    # alpha is only used by the huber / quantile losses, so with the default
    # loss it has no effect; kept as a single value so best_params_ keeps
    # the same keys.
    'alpha': [0.9],
}

# Instantiate model
# Candidates are ranked by (negated) RMSE so the search optimises the same
# metric that is reported for every model; without `scoring`, GridSearchCV
# falls back to the estimator's default score (R^2).
gradient_model = GridSearchCV(
    GradientBoostingRegressor(random_state=RANDOM_STATE),
    param_grid=tuned_params,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# Fit model
gradient_model.fit(X_train_sc, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_n...
                            

In [9]:
# Display the hyper-parameter combination the gradient-boosting grid search selected.
gradient_model.best_params_

{'alpha': 0.9,
 'learning_rate': 0.1,
 'max_depth': 3,
 'max_features': 0.25,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 70}

In [10]:
# Train vs. test RMSE for the gradient-boosting search; `rmse` calls
# `predict` on the search object (which, with the default refit, uses the
# refitted best estimator). Inputs are the scaled matrices it was fitted on.
print(f'Gradient Regressor Model Train RMSE: {rmse(gradient_model, X_train_sc, y_train)}')
print(f'Gradient Regressor Model Test RMSE: {rmse(gradient_model, X_test_sc, y_test)}')

Gradient Regressor Model Train RMSE: 44523.41733157255
Gradient Regressor Model Test RMSE: 49570.674646423344


In [11]:
# Hyper-parameter grid for the random-forest search.
tuned_params = {
    'n_estimators': [300, 400, 450, 500],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [2, 4, 5],
    'max_depth': [None, 3, 5, 7],
    # None = consider every feature at each split, which is what 'auto' meant
    # for a forest regressor; 'auto' itself is rejected by newer scikit-learn.
    'max_features': [None, .25, .02, .15],
    # Fractions of the training rows drawn for each tree. Integer values are
    # absolute row counts, so the earlier 1/3/5/7 trained every tree on at most
    # seven rows -- too few to split under the min_samples_* settings above.
    # NOTE(review): fractions are presumably what was intended -- confirm.
    'max_samples': [None, 0.1, 0.3, 0.5, 0.7],
}

# Instantiate model
# Rank candidates by (negated) RMSE so model selection uses the same metric
# that is reported for every model; the default would be R^2.
rf_model = GridSearchCV(
    RandomForestRegressor(random_state=RANDOM_STATE),
    tuned_params,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# Fit model
rf_model.fit(X_train_sc, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=2341,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [12]:
# Display the hyper-parameter combination the random-forest grid search selected.
rf_model.best_params_

{'max_depth': None,
 'max_features': 0.25,
 'max_samples': None,
 'min_samples_leaf': 2,
 'min_samples_split': 15,
 'n_estimators': 450}

In [13]:
# Train vs. test RMSE for the random-forest search, computed on the same
# scaled matrices the search was fitted on.
print(f'Random Forest Model Train RMSE: {rmse(rf_model, X_train_sc, y_train)}')
print(f'Random Forest Model Test RMSE: {rmse(rf_model, X_test_sc, y_test)}')

Random Forest Model Train RMSE: 42744.2735089099
Random Forest Model Test RMSE: 49264.696703187255
