# Generate Linear Regression Model to predict Salary

In [253]:
# imports

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from matplotlib import pyplot as plt 
import seaborn as sns 

import pickle as pkl

### Preprocess the data

In [254]:
# read the data

df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,name,password,employee_id,phone_number,job_role,work_location,salary,manager_id
0,Alan Schnitzer,Password123!,10000,730-415-0968,CEO,"Hartford, CT",16778800.0,10000
1,Evelyn Smith,Password123!,10001,604-745-4842,Data Engineer Manager,"Hartford, CT",189612.01,10000
2,Noah Hernandez,Password123!,10002,689-440-5017,Senior Data Engineer,"Saint Paul, MN",149812.69,10001
3,Theodore Martinez,Password123!,10003,886-613-9930,Senior Data Engineer,"Hunt Valley, MD",164232.48,10001
4,Noah Jackson,Password123!,10004,860-141-3394,Junior Data Engineer,"Saint Paul, MN",99611.94,10001


In [255]:
df = df.iloc[1:,] # drop Alan for graphing purposes (hes an outlier)
df.head()

Unnamed: 0,name,password,employee_id,phone_number,job_role,work_location,salary,manager_id
1,Evelyn Smith,Password123!,10001,604-745-4842,Data Engineer Manager,"Hartford, CT",189612.01,10000
2,Noah Hernandez,Password123!,10002,689-440-5017,Senior Data Engineer,"Saint Paul, MN",149812.69,10001
3,Theodore Martinez,Password123!,10003,886-613-9930,Senior Data Engineer,"Hunt Valley, MD",164232.48,10001
4,Noah Jackson,Password123!,10004,860-141-3394,Junior Data Engineer,"Saint Paul, MN",99611.94,10001
5,William Downey,Password123!,10005,685-012-2885,Senior Data Engineer,"Hartford, CT",180185.58,10001


In [256]:
df_data = df[['job_role','work_location']] # values used to make prediction
df_labels = df[['salary']] # value to predict

In [257]:
# one hot encode data to be able to run model

df_dummy = pd.get_dummies(df_data)
df_dummy.head()

Unnamed: 0,job_role_Data Engineer Manager,job_role_HR Manager,job_role_HR Representative,job_role_HR Specialist,job_role_Junior Data Engineer,job_role_Junior Software Engineer,job_role_Senior Data Engineer,job_role_Senior Software Engineer,job_role_Software Engineer Manager,"work_location_Hartford, CT","work_location_Hunt Valley, MD","work_location_New York, NY","work_location_Saint Paul, MN"
1,1,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0,1
3,0,0,0,0,0,0,1,0,0,0,1,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,1
5,0,0,0,0,0,0,1,0,0,1,0,0,0


### Split data into train and test

In [258]:
X_train, X_test, y_train, y_test = train_test_split(df_dummy, df_labels, test_size=0.2)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(799, 13) (200, 13) (799, 1) (200, 1)


### Train Linear Regression Model

In [259]:
model = LinearRegression()

In [260]:
model.fit(X_train, y_train)
model.score(X_train, y_train)

0.9628077422639546

### Testing Linear Regression Model

In [261]:
score = model.score(X_test, y_test)
score

0.9593208142912873

In [262]:
# # code from https://stackoverflow.com/questions/52404857/how-do-i-plot-for-multiple-linear-regression-model-using-matplotlib

# fig, axes = plt.subplots(1,len(X_train.columns.values),sharey=True,constrained_layout=True,figsize=(30,15))

# for i,e in enumerate(X_train.columns):
#   model.fit(X_train[e].values[:,np.newaxis], y_train.values)
#   axes[i].set_title("Best fit line")
#   axes[i].set_xlabel(str(e))
#   axes[i].set_ylabel('Salary')
#   axes[i].scatter(X_train[e].values[:,np.newaxis], y_train,color='g')
#   axes[i].plot(X_train[e].values[:,np.newaxis], 
#   model.predict(X_train[e].values[:,np.newaxis]),color='k')

### Saving the Model

In [263]:
with open("trained_model.pkl", "wb") as f:
    pkl.dump(model,f)

### Taking in External Values to Predict Salary 

In [264]:
def predict_salary(job_role, work_location):
    df_input_data = df[['job_role','work_location']] # values used to make prediction

    df_input_data.loc[len(df_input_data.index)] = [job_role, work_location] # append new data to existing dataframe
    df_input_dummy = pd.get_dummies(df_input_data) # one hot encode all data

    model_prediction = model.predict(df_input_dummy.tail(1)) # make salary prediction on new input
    return np.array2string(model_prediction[0][0])

In [265]:
print("Enter in your job role (eg. Senior Software Engineer)")
job_role = input()
print("Enter in your job location (eg. Hartford, CT)")
work_location = input()

print('The salary for a {} in {} is predicted to be {}'.format(job_role, work_location, predict_salary(job_role, work_location)[:-1]))

Enter in your job role (eg. Senior Software Engineer)
Enter in your job location (eg. Hartford, CT)
The salary for a Senior Data Engineer in Hartford, CT is predicted to be 171867.6647718


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_input_data.loc[len(df_input_data.index)] = [job_role, work_location] # append new data to existing dataframe
