In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold

from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Location of the salary table (CSV hosted on GitHub); kept in one named place
# so the source of the data is easy to spot and change.
DATA_URL = "https://raw.githubusercontent.com/benvictoria17/MachineLearning/master/dataset/University%20Salaries/salaries_final.csv"

data = pd.read_csv(DATA_URL)

In [3]:
# Preview the raw table; pandas abbreviates the display to the first and last few rows.
data

Unnamed: 0,Year,Name,Primary Job Title,Base Pay,Department,College
0,2010,"Abaied, Jamie L.",Assistant Professor,64000.0,Department of Psychological Science,CAS
1,2011,"Abaied, Jamie L.",Assistant Professor,64000.0,Department of Psychological Science,CAS
2,2012,"Abaied, Jamie L.",Assistant Professor,65229.0,Department of Psychological Science,CAS
3,2013,"Abaied, Jamie L.",Assistant Professor,66969.0,Department of Psychological Science,CAS
4,2014,"Abaied, Jamie L.",Assistant Professor,68658.0,Department of Psychological Science,CAS
...,...,...,...,...,...,...
14465,2016,"van der Vliet, Albert",Professor,163635.0,Department of Pathology&Laboratory Medicine,COM
14466,2017,"van der Vliet, Albert",Professor,175294.0,Department of Pathology&Laboratory Medicine,COM
14467,2018,"van der Vliet, Albert",Professor,191000.0,Department of Pathology&Laboratory Medicine,COM
14468,2019,"van der Vliet, Albert",Professor,196000.0,Department of Pathology&Laboratory Medicine,COM


In [4]:
# Summarise column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14470 entries, 0 to 14469
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               14470 non-null  int64  
 1   Name               14470 non-null  object 
 2   Primary Job Title  14470 non-null  object 
 3   Base Pay           14470 non-null  float64
 4   Department         14470 non-null  object 
 5   College            14470 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 678.4+ KB


In [5]:
def preprocess_inputs(df, random_state=42):
    """Split the raw salary table into a shuffled feature frame and target.

    The 'Name' column is dropped, the rows are shuffled, the index is reset
    to 0..n-1, and 'Base Pay' is separated out as the prediction target.
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table; must contain the 'Name' and 'Base Pay' columns.
    random_state : int, numpy.random.Generator/RandomState or None, default 42
        Seed forwarded to ``DataFrame.sample``. A fixed default makes the row
        order - and therefore any unshuffled K-fold split taken afterwards -
        identical on every run. Pass ``None`` for a different shuffle each call.

    Returns
    -------
    X : pandas.DataFrame
        Every column except 'Name' and 'Base Pay', in shuffled row order.
    y : pandas.Series
        The 'Base Pay' column, row-aligned with ``X``.

    Raises
    ------
    KeyError
        If 'Name' or 'Base Pay' is missing from ``df``.
    """
    # drop() already returns a new frame, so the caller's frame stays untouched
    # without an explicit copy.
    shuffled = (
        df.drop('Name', axis=1)
          .sample(frac=1.0, random_state=random_state)
          .reset_index(drop=True)
    )

    y = shuffled['Base Pay']
    X = shuffled.drop('Base Pay', axis=1)

    return X, y
# Split the raw table into shuffled features and the 'Base Pay' target,
# then display the feature frame.
X, y = preprocess_inputs(data)
X

Unnamed: 0,Year,Primary Job Title,Department,College
0,2010,Lecturer I,Department of Physics,CAS
1,2012,Office/Prgm Support Generalist,Department of Med-Cardiology,COM
2,2020,Associate Professor,Department of Radiology,COM
3,2019,Professor,Department of Psychiatry,COM
4,2011,Assistant Professor,Department of Family Medicine,COM
...,...,...,...,...
14465,2011,Research Assistant Prof,Department of Biochemistry,CAS
14466,2019,Associate Professor,Department of Biology,CAS
14467,2012,Associate Professor,Department of Radiation-Oncology,COM
14468,2016,Researcher/Analyst,Department of COM Microbio & Molec Genetics,COM


In [9]:
# Display the target Series ('Base Pay') returned by preprocess_inputs.
y

In [7]:
def build_pipeline(regressor):
    """Wrap ``regressor`` in the notebook's shared preprocessing pipeline.

    Steps, in order:
      1. 'preprocessor' - one-hot encode 'Primary Job Title', 'Department'
         and 'College' (dense output); all other columns pass through as-is.
      2. 'scaler'       - standardise every resulting column.
      3. 'regressor'    - the estimator supplied by the caller.

    Parameters
    ----------
    regressor : estimator
        Any scikit-learn style regressor exposing ``fit`` / ``predict``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        An unfitted pipeline ending in ``regressor``.
    """
    # The dense-output keyword was renamed from `sparse` to `sparse_output` in
    # scikit-learn 1.2 and the old name was removed in 1.4. Try the current
    # name first and fall back only for releases that predate it.
    try:
        onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        onehot = OneHotEncoder(sparse=False, handle_unknown='ignore')

    nominal_transformer = Pipeline(steps=[
        ('onehot', onehot)
    ])

    # remainder='passthrough' keeps the columns not listed here (e.g. 'Year').
    preprocessor = ColumnTransformer(transformers=[
        ('nominal', nominal_transformer, ['Primary Job Title', 'Department', 'College'])
    ], remainder='passthrough')

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()),
        ('regressor', regressor)
    ])

    return model
# One fixed seed for every estimator that draws random numbers. Without it the
# RMSE and R^2 cells, which each refit the models, would report metrics from
# different fits and the results would change from run to run.
MODEL_SEED = 42

# Display name -> unfitted pipeline. The names are left-padded so the printed
# metric columns line up. Ridge is left unseeded: its random_state only
# matters for the stochastic 'sag'/'saga' solvers, which are not requested here.
models = {
    "Linear Regression (Ridge)": build_pipeline(Ridge()),
    "            Decision Tree": build_pipeline(DecisionTreeRegressor(random_state=MODEL_SEED)),
    "           Neural Network": build_pipeline(MLPRegressor(random_state=MODEL_SEED)),
    "            Random Forest": build_pipeline(RandomForestRegressor(random_state=MODEL_SEED)),
    "        Gradient Boosting": build_pipeline(GradientBoostingRegressor(random_state=MODEL_SEED))
}

In [8]:
def evaluate_model(model, X, y, n_splits=5):
    """Cross-validate ``model`` and return its mean RMSE and mean R^2.

    The rows are split into ``n_splits`` consecutive (unshuffled) folds. For
    each fold the model is refit on the remaining rows and scored on the
    held-out rows; the per-fold scores are then averaged.

    Parameters
    ----------
    model : estimator
        Object with ``fit(X, y)`` and ``predict(X)``. It is refit in place, so
        on return it holds the fit from the last fold.
    X : pandas.DataFrame
        Feature rows (indexed positionally via ``.iloc``).
    y : pandas.Series
        Target values, row-aligned with ``X``.
    n_splits : int, default 5
        Number of folds passed to ``KFold``.

    Returns
    -------
    (float, float)
        Mean RMSE and mean R^2 across the folds. R^2 is measured against the
        mean of each held-out fold.
    """
    kf = KFold(n_splits=n_splits)
    rmses = []
    r2s = []

    for train_idx, test_idx in kf.split(X):
        model.fit(X.iloc[train_idx, :], y.iloc[train_idx])

        # Slice the held-out target once and reuse it for both metrics.
        y_test = y.iloc[test_idx]
        squared_error = (y_test - model.predict(X.iloc[test_idx, :]))**2

        rmses.append(np.sqrt(np.mean(squared_error)))

        total_variation = np.sum((y_test - y_test.mean())**2)
        r2s.append(1 - (np.sum(squared_error) / total_variation))

    return np.mean(rmses), np.mean(r2s)
# Report the cross-validated mean RMSE of every candidate model.
for name, model in models.items():
    mean_rmse, _ = evaluate_model(model, X, y)
    print(name + " RMSE: {:.2f}".format(mean_rmse))

Linear Regression (Ridge) RMSE: 28441.42
            Decision Tree RMSE: 30055.08
           Neural Network RMSE: 40190.27
            Random Forest RMSE: 28792.14
        Gradient Boosting RMSE: 31634.76


In [10]:
# Report the cross-validated mean R^2 of every candidate model.
for name, model in models.items():
    _, mean_r2 = evaluate_model(model, X, y)
    print(name + " R^2: {:.5f}".format(mean_r2))

Linear Regression (Ridge) R^2: 0.63796
            Decision Tree R^2: 0.59514
           Neural Network R^2: 0.56919
            Random Forest R^2: 0.62752
        Gradient Boosting R^2: 0.55209
