In [1]:
__author__ = 'John Brugman'
__date__ = 'August 18 2020'
__website__ = 'www.johnbrugman.com'

### Import Packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.utils import shuffle

### Define Data Class

In [3]:
class Data:
    def __init__(self, train_feature_file, train_target_file, test_file, cat_cols, num_cols, target_col, id_col):
        '''creates train and test dataframes'''
        self.cat_cols = list(cat_cols)
        self.num_cols = list(num_cols)
        self.feature_cols = num_cols
        self.target_col = target_col
        self.id_col = id_col
        self.label_encoders = {}
        self.train_df = self._create_train(train_feature_file, train_target_file)
        self.test_df = self._create_test(test_file)
        
    def label_encoder(self, df, cols):
        '''encodes Labels'''
        pass
    
    def inverse_encode_df(self, df, cols):
        pass
    
    def _create_train(self, train_feature_file, train_target_file):
        '''creates train dataframe from train feature and target files'''
        train_feature_df = pd.read_csv(train_feature_file)
        train_target_df = pd.read_csv(train_target_file)
        train_df = pd.merge(train_feature_df, train_target_df)
        return train_df
    
    def _create_test(self, test_file):
        '''creates test dataframe from test_file'''
        test_df = pd.read_csv(test_file)
        return test_df
    

### Define ModelContainer Class

In [12]:
class ModelContainer:
    def __init__(self, models = []):
        '''initializes models and dictionaries'''
        self.models = models
        self.best_model = None
        self.predictions = None
        self.mean_mse = {}
        
    def add_model(self, model):
        '''adds model to list of models'''
        self.models.append(model)
        
    def cross_validate_model(self, data, k=3, num_procs=1):
        '''calculates mse for models'''
        feature_df = data.train_df[data.feature_cols]
        target_df = data.train_df[data.target_col]
        for model in self.models:
            neg_mse = cross_val_score(model, X=feature_df, y=target_df, scoring='neg_mean_squared_error', cv = k, n_jobs=num_procs)
            self.mean_mse[model] = np.mean(neg_mse) * -1.0
        
        
    def select_best_model(self):
        '''selects the best model based off mse'''
        self.best_model = min(self.mean_mse, key=self.mean_mse.get)
    
    def best_model_fit(self, features, targets):
        '''fits best model'''
        self.best_model.fit(features, targets)
            
    def best_model_predict(self, features):
        '''makes predictions with best model'''
        self.predictions = self.best_model.predict(features)
        
    def save_results(self):
        '''saves model and results'''
        pass
    
    def print_summary(self):
        print('The best model was', models.best_model)
        print(f'This model had an mse of {models.mean_mse[models.best_model]}')
        

#### Define Parameters

In [5]:
# Input Files
train_feature_file = 'data/train_features.csv'
train_target_file = 'data/train_salaries.csv'
test_file = 'data/test_features.csv'

# Feature Columns
cat_cols = ['companyId', 'jobType', 'degree', 'major', 'industry']
num_cols = ['yearsExperience', 'milesFromMetropolis']
target_col = "salary"
id_col = "jobId"

# Parameters Needed
num_procs = 4
verbose_lvl = 0

#### Create Data Object

In [6]:
data = Data(train_feature_file, train_target_file, test_file, cat_cols, num_cols, target_col, id_col)

### Creating modelcontainer and adding models

In [7]:
# Create ModelContainer
models = ModelContainer()

# Adding Models to ModelContainer
models.add_model(LinearRegression())
models.add_model(RandomForestRegressor(n_estimators = 60, n_jobs=num_procs, max_depth=15, \
                                       min_samples_split=80, verbose=verbose_lvl))
models.add_model(GradientBoostingRegressor(n_estimators=40, max_depth=7, loss='ls', verbose=verbose_lvl))

### Cross Validate Models, Selecting best_model and fitting/predicting

In [9]:
models.cross_validate_model(data, k=2, num_procs=num_procs)
models.select_best_model()

In [10]:
# Fitting and predicting with best_model
models.best_model_fit(data.train_df[data.feature_cols], data.train_df[data.target_col])
models.best_model_predict(data.test_df[data.feature_cols])

## Summarizing Results

In [13]:
models.print_summary()

The best model was  GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=7,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=40,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=7,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
             