# Salary Predictions Based on Job Descriptions

In [1]:
#import your libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

#your info here
__author__ = "Albert Ntiri"
__email__ = "albert.ntiri@gmail.com"

In [2]:
# Read the three raw tables, then attach each training job's salary to its features.
train_features, train_salaries, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train_features.csv',
        'data/train_salaries.csv',
        'data/test_features.csv',
    )
)

# Inner join on jobId keeps only jobs present in both training tables.
training_combined = train_features.merge(train_salaries, on='jobId', how='inner')

## Define Classes

In [3]:
class process_data():
    """Apply cleaning and feature-engineering steps to a single DataFrame.

    The frame passed to the constructor is copied, and every step replaces
    ``self.data`` with its result, so the caller's frame is never modified.
    Read the processed frame back from ``self.data``.
    """

    def __init__(self, data):
        # Work on a private copy. Without it, some steps edited the caller's
        # frame in place while one_hot_encode rebound self.data, leaving the
        # caller holding a half-processed frame.
        self.data = data.copy()

    # Drop rows for 0-salary jobs
    def clean_data(self):
        """Remove rows whose salary equals 0.

        A frame without a ``salary`` column (e.g. unlabeled data) is left
        untouched instead of raising.
        """
        if 'salary' not in self.data.columns:
            return
        zero_salary_rows = self.data.index[self.data['salary'] == 0]
        self.data = self.data.drop(index=zero_salary_rows)

    # Convert categorical variables to numeric based on hierarchy
    def label_encode(self, col, new_col, map_dict):
        """Write ``map_dict[value]`` of column ``col`` into ``new_col``.

        Values missing from ``map_dict`` become NaN (``Series.map`` semantics).
        """
        self.data[new_col] = self.data[col].map(map_dict)

    # Convert non-hierarchical categorical variables to dummy variables
    def one_hot_encode(self, col, drop_first=False):
        """Replace ``col`` with one ``<col>_<value>`` indicator column per value.

        With ``drop_first=True`` the first category's indicator is omitted.
        """
        self.data = pd.get_dummies(self.data, columns=[col], drop_first=drop_first)

    # drop single or list of columns
    def drop_columns(self, col):
        """Drop one column name or a list of column names."""
        self.data = self.data.drop(columns=col)

    # Use function to convert old values in column to new values in new column
    def transform(self, col, new_col, function):
        """Store ``function(value)`` for every value of ``col`` in ``new_col``."""
        self.data[new_col] = self.data[col].apply(function)

    # Create indicator value
    def convert_to_bool(self, string):
        """Return 0 for the literal ``'NONE'`` and 1 for anything else."""
        return 0 if string == 'NONE' else 1

    # Scale numeric variables to 0-1 range
    def normalize(self, col, new_col, max_value=None):
        """Divide ``col`` by its largest absolute value and store it in ``new_col``.

        Parameters
        ----------
        col : str
            Numeric source column.
        new_col : str
            Name of the scaled column to create.
        max_value : number, optional
            Divisor to use instead of the column's own maximum, so that a
            second frame can be scaled exactly like the first one.

        A divisor of 0 is treated as 1 so an all-zero column stays all zeros.
        """
        peak = self.data[col].abs().max() if max_value is None else max_value
        if peak == 0:
            peak = 1
        self.data[new_col] = self.data[col] / peak

In [15]:
class model():
    """Fit an estimator on a train/test split, log its scores, and run predictions."""

    def __init__(self):
        pass

    def build_model(self, data, model_name, model_type, features, test_size=.2, random_state=None):
        """Fit ``model_type`` on a random split of ``data`` and record its scores.

        Parameters
        ----------
        data : DataFrame
            Must contain every column in ``features`` plus a ``salary`` column.
        model_name : str
            Row label under which the scores are stored.
        model_type : estimator
            Object exposing ``fit``, ``score`` and ``predict``; it is fitted in
            place, so passing the same instance again refits it.
        features : list of str
            Columns used as predictors.
        test_size : float
            Share of rows held out for scoring.
        random_state : int, optional
            Seed forwarded to ``train_test_split``. Pass the same value on
            every call so that different models are scored on the same
            hold-out rows; the default keeps the split unseeded.

        Returns
        -------
        Whatever ``model_type.fit`` returns (the fitted estimator).

        Side effect: writes ``[training R^2, test R^2, test MSE]`` into the
        notebook-level ``model_comparison`` frame, which must already exist
        when this method is called.
        """
        features_train, features_test, salary_train, salary_test = train_test_split(
            data[features], data['salary'], test_size=test_size, random_state=random_state
        )

        fitted = model_type.fit(features_train, salary_train)
        training_r2 = fitted.score(features_train, salary_train)

        predicted_salary = fitted.predict(features_test)
        test_r2 = r2_score(y_true=salary_test, y_pred=predicted_salary)
        mse = mean_squared_error(y_true=salary_test, y_pred=predicted_salary)

        model_comparison.loc[model_name] = [training_r2, test_r2, mse]

        return fitted

    def predict(self, data, model):
        """Return ``model.predict(data)`` for an already fitted ``model``."""
        return model.predict(data)

## Part 3 - DEVELOP

You will cycle through creating features, tuning models, and training/validating models (steps 7-9) until you've reached your efficacy goal.

#### Your metric will be MSE and your goal is:
 - <360 for entry-level data science roles
 - <320 for senior data science roles

### ---- 7 Engineer features  ----

In [16]:
#make sure that data is ready for modeling
#create any new features needed to potentially enhance model

In [None]:
# Wrap the merged training frame, then run the preparation steps in order.
training_data = process_data(training_combined)
training_data.clean_data()

# Ordinal rank per job type; CFO and CTO deliberately share a rank.
job_levels = dict(
    JANITOR=1,
    JUNIOR=2,
    SENIOR=3,
    MANAGER=4,
    VICE_PRESIDENT=5,
    CFO=6,
    CTO=6,
    CEO=7,
)
training_data.label_encode('jobType', 'jobLevel', job_levels)

# Ordinal rank per degree, from no degree (0) up to doctorate (4).
degree_levels = dict(NONE=0, HIGH_SCHOOL=1, BACHELORS=2, MASTERS=3, DOCTORAL=4)
training_data.label_encode('degree', 'degreeLevel', degree_levels)

# Flag whether a major is present before the column is expanded into dummies.
training_data.transform('major', 'majorInd', training_data.convert_to_bool)
training_data.one_hot_encode('major')
training_data.drop_columns('major_NONE')

training_data.one_hot_encode('industry', drop_first=True)

# Add a scaled "<name>_norm" twin for each numeric column.
for numeric_col in ('yearsExperience', 'milesFromMetropolis'):
    training_data.normalize(numeric_col, numeric_col + '_norm')

# Identifiers and the raw columns that were just encoded or scaled.
old_columns = ['jobId', 'companyId', 'jobType', 'degree', 'yearsExperience', 'milesFromMetropolis']
training_data.drop_columns(old_columns)

### ---- 8 Create models ----

In [None]:
#create and tune the models that you brainstormed during part 2

In [None]:
# Filter to include columns that will go into the model
model_features = ['jobLevel', 'degreeLevel', 'yearsExperience', 'milesFromMetropolis', 'EDUCATION', 'FINANCE', 'HEALTH', 'OIL', 'SERVICE', 'WEB', 'BIOLOGY', 'BUSINESS', 'CHEMISTRY', 'COMPSCI', 'ENGINEERING', 'LITERATURE', 'MATH', 'PHYSICS']
# model_features = ['jobLevel', 'degreeLevel', 'yearsExperience', 'milesFromMetropolis', 'EDUCATION', 'FINANCE', 'HEALTH', 'OIL', 'SERVICE', 'WEB', 'majorInd']
model_columns = model_features + ['salary']


def _to_model_column(name):
    """Map an engineered column name to the short name used in the feature lists.

    'major_MATH' -> 'MATH', 'industry_OIL' -> 'OIL',
    'yearsExperience_norm' -> 'yearsExperience'; other names are unchanged.
    """
    for prefix in ('major_', 'industry_'):
        if name.startswith(prefix):
            return name[len(prefix):]
    if name.endswith('_norm'):
        return name[:-len('_norm')]
    return name


# `train_full` was referenced but never defined: build it from the frame
# engineered in step 7, renaming the encoded columns to the names the feature
# lists use.
# NOTE(review): this makes 'yearsExperience' / 'milesFromMetropolis' refer to
# the max-scaled columns, since step 7 drops the raw ones - confirm intended.
train_full = training_data.data.rename(columns=_to_model_column)

missing_columns = [name for name in model_columns if name not in train_full.columns]
assert not missing_columns, f"engineered frame lacks columns: {missing_columns}"

model_train_full = train_full[model_columns]
model_train_full.head()

In [None]:
# Split data into training and test sets

# A fixed seed keeps the 80/20 split, and every score computed from it,
# identical across kernel restarts.
features_train, features_test, salary_train, salary_test = train_test_split(
    model_train_full[model_features],
    model_train_full['salary'],
    test_size=.2,
    random_state=42,
)
# Preview only the first rows rather than rendering the whole frame.
features_train.head()

#### Linear Regression

In [None]:
# Ordinary least squares baseline, fitted on the training split.

linearmodel = linear_model.LinearRegression().fit(features_train, salary_train)

# Coefficients first, then R^2 on the same rows the model was trained on.
print(linearmodel.coef_)
print(linearmodel.score(features_train, salary_train))

#### Lasso & Ridge Regression

In [None]:
# Grid-search the L1 penalty strength for Lasso on the training split.
lasso = linear_model.Lasso()
lasso_alpha_grid = {'alpha': [.01, .1, .2, .5, 1]}
gridl = GridSearchCV(lasso, lasso_alpha_grid).fit(features_train, salary_train)
gridl.best_params_

In [None]:
# Refit Lasso with alpha=.01 and report coefficients and training R^2.
lasso = linear_model.Lasso(alpha=.01)
lasso.fit(features_train, salary_train)
print(lasso.coef_)
print(lasso.score(features_train, salary_train))

In [None]:
# Grid-search the L2 penalty strength for Ridge on the training split.
# The `normalize=` argument was removed from scikit-learn's linear models in
# version 1.2, so passing it raises TypeError on current releases.
# NOTE(review): without it the penalty acts on the features at their current
# scale, so the selected alpha can differ from earlier runs.
ridge = linear_model.Ridge()
gridr = GridSearchCV(ridge, {'alpha':[.01,.1,.2,.5,1]})
gridr.fit(features_train, salary_train)
gridr.best_params_

In [None]:
# Refit Ridge with alpha=.01 and report coefficients and training R^2.
# `normalize=True` is no longer accepted by scikit-learn >= 1.2 (TypeError),
# so the estimator is built without it.
# NOTE(review): alpha=.01 was chosen with normalization on - re-check it
# against the grid search result.
ridge = linear_model.Ridge(alpha=.01).fit(features_train, salary_train)
print(ridge.coef_)
print(ridge.score(features_train, salary_train))

#### Stochastic Gradient Descent

In [None]:
# Linear model trained by stochastic gradient descent with an L1 penalty.
SGD = linear_model.SGDRegressor(penalty='l1', alpha=.01, max_iter=1000)
SGD.fit(features_train, salary_train)
print(SGD.coef_)
print(SGD.score(features_train, salary_train))

#### Decision Tree with Gradient Boosting

In [None]:
# Gradient-boosted trees: 500 boosting stages, each tree at most 6 levels deep.
GB = GradientBoostingRegressor(n_estimators=500, max_depth=6)
GB.fit(features_train, salary_train)
print(GB.score(features_train, salary_train))

### ---- 9 Test models ----

In [None]:
#do 5-fold cross validation on models and measure MSE

In [None]:
# Predict salaries for the held-out rows with the fitted linear regression.

predicted_salary_test = linearmodel.predict(X=features_test)
predicted_salary_test

In [None]:
# Score the linear regression on the held-out rows: MSE first, then R^2.

linreg_test_mse = mean_squared_error(salary_test, predicted_salary_test)
linreg_test_r2 = r2_score(salary_test, predicted_salary_test)
print(linreg_test_mse)
print(linreg_test_r2)

In [None]:
# Combine into 1 cell and wrap into a function for multiple models

def build_model(model_name, model_type, features, test_size=.2, random_state=None):
    """Fit ``model_type`` on a random split of ``train_full`` and log its scores.

    Parameters
    ----------
    model_name : str
        Row label under which the scores are stored in ``model_comparison``.
    model_type : estimator
        Object exposing ``fit``, ``score`` and ``predict``; it is fitted in
        place, so reusing one instance across calls refits it each time.
    features : list of str
        Columns of ``train_full`` used as predictors.
    test_size : float
        Share of rows held out for scoring.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. Pass the same value on every
        call to score all candidates on identical hold-out rows; the default
        keeps the split unseeded.

    Returns
    -------
    Whatever ``model_type.fit`` returns (the fitted estimator), so the caller
    can predict with it - the same contract as ``model.build_model``.

    Reads the notebook-level ``train_full`` frame and writes
    ``[training R^2, test R^2, test MSE]`` into ``model_comparison``; both
    must exist before the call.
    """
    features_train, features_test, salary_train, salary_test = train_test_split(
        train_full[features], train_full['salary'], test_size=test_size, random_state=random_state
    )

    fitted = model_type.fit(features_train, salary_train)
    training_r2 = fitted.score(features_train, salary_train)

    predicted_salary = fitted.predict(features_test)
    test_r2 = r2_score(y_true=salary_test, y_pred=predicted_salary)
    mse = mean_squared_error(y_true=salary_test, y_pred=predicted_salary)

    model_comparison.loc[model_name] = [training_r2, test_r2, mse]

    return fitted

In [None]:
# Set up model comparison dataframe and initialize different models

# One row per fitted model, filled in by build_model.
model_comparison = pd.DataFrame(columns=['Training_R2', 'Test_R2', 'Mean_Squared_Error'])

# `normalize=True` was removed from scikit-learn's linear models in 1.2 and
# raises TypeError there, so the estimators are created without it.
# NOTE(review): for Ridge this changes how strongly alpha penalizes each
# coefficient - re-tune alpha if the comparison matters.
lin_reg = linear_model.LinearRegression()
lasso_reg = linear_model.Lasso(alpha=.01)
ridge_reg = linear_model.Ridge(alpha=.01)
SGD_reg = linear_model.SGDRegressor(penalty='l1', alpha=.01, max_iter=1000)
GB_reg = GradientBoostingRegressor(n_estimators=500, max_depth=6)

In [None]:
# Named feature subsets, from single columns up to the full set.

# Single-column subsets
features1_joblevel = ['jobLevel']
features2_degree = ['degreeLevel']
features3_yearsexperience = ['yearsExperience']
features4_milesfrommetropolis = ['milesFromMetropolis']

# Indicator-column groups
features5_industries = ['EDUCATION', 'FINANCE', 'HEALTH', 'OIL', 'SERVICE', 'WEB']
features6_majors = ['BIOLOGY', 'BUSINESS', 'CHEMISTRY', 'COMPSCI', 'ENGINEERING', 'LITERATURE', 'MATH', 'PHYSICS']
features7_majorind = ['majorInd']

# Everything except the has-a-major flag, then everything including it.
features8_nomajorind = [
    *features1_joblevel,
    *features2_degree,
    *features3_yearsexperience,
    *features4_milesfrommetropolis,
    *features5_industries,
    *features6_majors,
]
features9_all = [*features8_nomajorind, *features7_majorind]

In [None]:
# Fit the linear regression once per feature subset and log each run.

linreg_runs = {
    'LIN-REG_joblevel': features1_joblevel,
    'LIN-REG_degreeLevel': features2_degree,
    'LIN-REG_yearsExperience': features3_yearsexperience,
    'LIN-REG_milesFromMetropolis': features4_milesfrommetropolis,
    'LIN-REG_industries': features5_industries,
    'LIN-REG_majors': features6_majors,
    'LIN-REG_majorInd': features7_majorind,
    'LIN-REG_noMajorInd': features8_nomajorind,
    'LIN-REG_all': features9_all,
}
# Dict order is insertion order, so runs happen in the order listed above.
for run_label, feature_subset in linreg_runs.items():
    build_model(run_label, lin_reg, features=feature_subset)

model_comparison

# Using all features yields the best results

In [None]:
# Fit each remaining algorithm on the full feature set and log its scores.

algorithm_runs = {
    'LASSO_all': lasso_reg,
    'RIDGE_all': ridge_reg,
    'SGD_all': SGD_reg,
    'GB_all': GB_reg,
}
# Dict order is insertion order, so runs happen in the order listed above.
for run_label, estimator in algorithm_runs.items():
    build_model(run_label, estimator, features=features9_all)

model_comparison

# Gradient boosting yields the best results

In [None]:
# 5-fold cross validation of the gradient boosting model.
# NOTE: this loop rebinds features_train, features_test, salary_train and
# salary_test, so after it they hold the last fold, not the earlier 80/20 split.

X = model_train_full[model_features]
y = model_train_full['salary']
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(X):
    features_train, features_test = X.iloc[train_index], X.iloc[test_index]
    # KFold yields row positions, so the target must be sliced with .iloc too.
    # Plain y[...] is a label lookup, and the index has gaps where 0-salary
    # rows were dropped, so it would raise or pair features with the wrong
    # salaries.
    salary_train, salary_test = y.iloc[train_index], y.iloc[test_index]
    GB_reg.fit(features_train, salary_train)
    training_r2 = GB_reg.score(features_train, salary_train)
    print(training_r2)

    predicted_salary = GB_reg.predict(features_test)
    test_r2 = r2_score(y_true=salary_test, y_pred=predicted_salary)
    mse = mean_squared_error(y_true=salary_test, y_pred=predicted_salary)
    print(test_r2)
    print(mse)

In [None]:
# Cross-validated error for gradient boosting over 3 folds. The scorer returns
# negated MSE, so the mean is flipped back to a positive value.
neg_mse = cross_val_score(GB_reg, X, y, scoring='neg_mean_squared_error', cv=3)
avg_mse = -1.0 * (sum(neg_mse) / len(neg_mse))
print(avg_mse)

### ---- 10 Select best model  ----

In [None]:
#select the model with the lowest error as your "production" model

Based on the comparisons, the model that will be used for production is gradient boosting with 500 iterations and a max depth of 6.  The R^2 is around .76 and the mean squared error is around 356.

## Part 4 - DEPLOY

### ---- 11 Automate pipeline ----

In [None]:
#write script that trains model on entire training set, saves model to disk,
#and scores the "test" dataset

In [None]:
# Process training data

def engineer_features(raw_frame):
    """Run the step-7 feature engineering on a raw frame and return the result.

    Works on a copy, so ``raw_frame`` is left untouched. Rows with a salary of
    0 are removed only when the frame has a ``salary`` column, which lets the
    same function prepare unlabeled test data. Encoded columns are renamed to
    the short names used by the feature lists ('major_MATH' -> 'MATH',
    'industry_OIL' -> 'OIL', 'yearsExperience_norm' -> 'yearsExperience').

    Relies on ``process_data``, ``job_levels``, ``degree_levels`` and
    ``old_columns`` defined in earlier cells.

    NOTE(review): numeric columns are scaled by this frame's own maximum;
    that matches the training scale only if both frames share the same
    maxima - confirm.
    """
    prepared = process_data(raw_frame.copy())
    if 'salary' in raw_frame.columns:
        prepared.clean_data()

    prepared.label_encode('jobType', 'jobLevel', job_levels)
    prepared.label_encode('degree', 'degreeLevel', degree_levels)
    prepared.transform('major', 'majorInd', prepared.convert_to_bool)
    prepared.one_hot_encode('major')
    prepared.drop_columns('major_NONE')
    prepared.one_hot_encode('industry', drop_first=True)
    prepared.normalize('yearsExperience', 'yearsExperience_norm')
    prepared.normalize('milesFromMetropolis', 'milesFromMetropolis_norm')
    prepared.drop_columns(old_columns)

    def _short_name(column):
        for prefix in ('major_', 'industry_'):
            if column.startswith(prefix):
                return column[len(prefix):]
        if column.endswith('_norm'):
            return column[:-len('_norm')]
        return column

    return prepared.data.rename(columns=_short_name)


# The training frame was already cleaned and engineered in step 7; the
# free functions clean_data() / engineer_features() previously called here
# were never defined, so only the model fit is repeated.
build_model('production_model', GB_reg, features=features9_all)

# Make corresponding changes to test data

# The test set must go through the same engineering before scoring:
# predicting on the raw frame fails because it still holds string columns.
test_full = engineer_features(test_features)
test_predictions = GB_reg.predict(test_full[features9_all])
test_predictions

### ---- 12 Deploy solution ----

In [None]:
#save your prediction to a csv file or optionally save them as a table in a SQL database
#additionally, you want to save a visualization and summary of your prediction and feature importances
#these visualizations and summaries will be extremely useful to business stakeholders

### ---- 13 Measure efficacy ----

We'll skip this step since we don't have the outcomes for the test data