# Generate Linear Regression Model to predict Salary

In [1]:
# imports

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import pickle as pkl

### Preprocess the data

In [2]:
# read the data

df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,name,password,employee_id,phone_number,job_role,work_location,salary,manager_id
0,Alan Schnitzer,Password123!,10000,686-296-9780,CEO,"Hartford, CT",16778800.0,10000
1,William Lopez,Password123!,10001,861-357-0442,HR Professional,"Hunt Valley, MD",67344.95,10000
2,Noah Lettick,Password123!,10002,938-728-1518,HR Professional,"Hartford, CT",76182.8,10000
3,Charlotte Wilson,Password123!,10003,802-636-0595,Senior Data Engineer,"Hartford, CT",113577.14,10000
4,Liam Brown,Password123!,10004,825-370-8231,Junior Data Engineer,"Hartford, CT",85835.14,10000


In [3]:
df_data = df[['job_role','work_location']] # values used to make prediction
df_labels = df[['salary']] # value to predict

In [4]:
# one hot encode data to be able to run model

df_dummy = pd.get_dummies(df_data)
df_dummy.head()

Unnamed: 0,job_role_CEO,job_role_Cloud Engineer,job_role_HR Professional,job_role_Junior Data Engineer,job_role_Junior Software Engineer,job_role_Senior Data Engineer,job_role_Senior Software Engineer,"work_location_Hartford, CT","work_location_Hunt Valley, MD","work_location_New York, NY","work_location_Saint Paul, MN"
0,1,0,0,0,0,0,0,1,0,0,0
1,0,0,1,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,1,0,0,0
4,0,0,0,1,0,0,0,1,0,0,0


### Split data into train and test

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df_dummy, df_labels, test_size=0.2)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(800, 11) (200, 11) (800, 1) (200, 1)


### Train Linear Regression Model

In [6]:
model = LinearRegression()

In [7]:
model.fit(X_train, y_train)
model.score(X_train, y_train)

0.9999474922904207

### Testing Linear Regression Model

In [8]:
prediction = model.score(X_test, y_test)
prediction

0.9669632783078448

### Saving the Model

In [9]:
with open("trained_model.pkl", "wb") as f:
    pkl.dump(model,f)

### Taking in External Values to Predict Salary 

In [10]:
def predict_salary(job_role, work_location):
    df_input_data = df[['job_role','work_location']] # values used to make prediction

    df_input_data.loc[len(df_input_data.index)] = [job_role, work_location] # append new data to existing dataframe
    df_input_dummy = pd.get_dummies(df_input_data) # one hot encode all data

    model_prediction = model.predict(df_input_dummy.tail(1)) # make salary prediction on new input
    return np.array2string(model_prediction[0][0])

In [11]:
print("Enter in your job role (eg. Cloud Engineer)")
job_role = 'Cloud Engineer' #input()
print("Enter in your job location (eg. Hartford, CT)")
work_location = 'Hartford, CT' #input()

print(predict_salary(job_role, work_location)[:-1])

Enter in your job role (eg. Cloud Engineer)
Enter in your job location (eg. Hartford, CT)
85760


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_input_data.loc[len(df_input_data.index)] = [job_role, work_location] # append new data to existing dataframe
