## **Topic: Inferential statistics**
**Agenda:** Salary Prediction

**Description:** We have been hired by a recruiting agency to create a model that can predict an employee's salary based on their years of experience.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the salary dataset; the relative path is resolved against the notebook's working directory
data = pd.read_csv('Salary_Data.csv')

In [3]:
# Summarize the column names, non-null counts and dtypes of the loaded frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   YearsExperience  30 non-null     float64
 1   Salary           30 non-null     float64
dtypes: float64(2)
memory usage: 608.0 bytes


In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each numeric column
data.describe()

Unnamed: 0,YearsExperience,Salary
count,30.0,30.0
mean,5.313333,76003.0
std,2.837888,27414.429785
min,1.1,37731.0
25%,3.2,56720.75
50%,4.7,65237.0
75%,7.7,100544.75
max,10.5,122391.0


In [8]:
# Rules for implementing Regression with the scikit-learn package
# |---- 1. Feature and Label must be in the form of a numpy array (pd.DataFrame -> pd.DataFrame.values)
# |---- 2. Feature must be a 2d array
# |---- 3. Label is kept as a 2d array in this notebook (scikit-learn also accepts a 1d label)
#
# Regression basics:
# |---- {many:one}::{features:label}

In [9]:
# Separate `data` into `features` and `label`.
# Selecting with a list of positions (rather than a bare integer) keeps each
# result two-dimensional, i.e. one column with one row per record.
feature_columns, label_columns = [0], [1]
features = data.iloc[:, feature_columns].to_numpy()
label = data.iloc[:, label_columns].to_numpy()

In [10]:
# Display the feature array to confirm it is a 2d column of values
features

array([[ 1.1],
       [ 1.3],
       [ 1.5],
       [ 2. ],
       [ 2.2],
       [ 2.9],
       [ 3. ],
       [ 3.2],
       [ 3.2],
       [ 3.7],
       [ 3.9],
       [ 4. ],
       [ 4. ],
       [ 4.1],
       [ 4.5],
       [ 4.9],
       [ 5.1],
       [ 5.3],
       [ 5.9],
       [ 6. ],
       [ 6.8],
       [ 7.1],
       [ 7.9],
       [ 8.2],
       [ 8.7],
       [ 9. ],
       [ 9.5],
       [ 9.6],
       [10.3],
       [10.5]])

In [11]:
# Display the label array to confirm it is a 2d column of values
label

array([[ 39343.],
       [ 46205.],
       [ 37731.],
       [ 43525.],
       [ 39891.],
       [ 56642.],
       [ 60150.],
       [ 54445.],
       [ 64445.],
       [ 57189.],
       [ 63218.],
       [ 55794.],
       [ 56957.],
       [ 57081.],
       [ 61111.],
       [ 67938.],
       [ 66029.],
       [ 83088.],
       [ 81363.],
       [ 93940.],
       [ 91738.],
       [ 98273.],
       [101302.],
       [113812.],
       [109431.],
       [105582.],
       [116969.],
       [112635.],
       [122391.],
       [121872.]])

#### **ML Coding Begins from here...**

In [None]:
# Steps:
# |---- 1. Create Train-Test split
# |---- 2. Build the model using `train` split
# |---- 3. Check the quality of the model
# |---- 4. Deploy the model (optional stage in ML engineering as deployment is usually handled by app/dev team)

In [12]:
# 1. Create Train-Test Split
# |---- 80% of the rows go to the train split, 20% to the test split
# |---- `test_size` sets the fraction of rows held out for testing
# |---- `train_size` is therefore implied as (1 - test_size)
# |---- `random_state` fixes the shuffle so the same split is produced on every run

from sklearn.model_selection import train_test_split

split_options = {"test_size": 0.2, "random_state": 10}
X_train, X_test, y_train, y_test = train_test_split(features, label, **split_options)

In [13]:
# 2. Build (i.e. train) the model
from sklearn.linear_model import LinearRegression

# fit(featureArray, labelArray) accepts the training data and returns the
# trained estimator object; `model` itself holds the learned parameters afterwards.
model = LinearRegression()
model.fit(X_train, y_train)

In [14]:
# 3. Check the quality of the model
#
# Explanation:
# |---- SL(Significance Level) or Alpha Value: defines the error tolerance in the project (discussed with the business expert or the data scientist)
# |---- SL = 0.05
#
# Quality of the model:
# |---- As an ML engineer, our goal is to produce a `GENERALIZED` model
# |---- A `Generalized Model` means that the trained model must work best with both the known and the unknown data/features

In [15]:
# Guidelines by the instructor - A generalized model is something that satisfies the following logic:
# |---- testScore > trainScore and testScore >= CL
# |
# |---- testScore: the evaluation score calculated using the testing dataset
# |---- trainScore: the evaluation score calculated using the training dataset
# |---- CL is confidence level (1-SL)

In [17]:
# Obtain trainScore and testScore via score(feature, label), then apply the
# instructor's acceptance rule: the test score must beat the train score AND
# reach the confidence level CL.

trainScore = model.score(X_train, y_train)
testScore = model.score(X_test, y_test)
print("TestScore is {} and trainScore is {}".format(testScore, trainScore))

CL = 0.95
isGeneralized = (testScore > trainScore) and (testScore >= CL)
print("Approve the model" if isGeneralized else "Discard the model")

TestScore is 0.9816423482070253 and trainScore is 0.9494673013344646
Approve the model


In [18]:
# LOGIC:
# |---- A straight line: y = mx + c
# |
# |---- Here: salary = b0 + b1(yExp)
# |
# |---- b1 is read from model.coef_ and b0 from model.intercept_
# |---- salary = 26089.09663242 + (9356.86299354 * yExp)

# Slope first, intercept second, one per line
print(model.coef_, model.intercept_, sep="\n")

[[9356.86299354]]
[26089.09663242]


In [22]:
# 4. Deploy the model
#
# Model is acceptable and can be deployed
# Deployment: We will help the app dev team to understand how the model works with an app
#
# The typed text is validated before it reaches the model:
# |---- float() raises ValueError on non-numeric text, which would otherwise end the cell with a traceback
# |---- float() also accepts "nan" and "inf", which are not usable feature values
# |---- negative experience is rejected; the training data only contains positive values

rawYears = input("Enter years of Experience: ")
try:
    yExp = float(rawYears)
except ValueError:
    yExp = None

if yExp is None or not np.isfinite(yExp) or yExp < 0:
    print("Invalid input {!r}: years of experience must be a non-negative number".format(rawYears))
else:
    # The feature is passed as a 2d array (one row, one column), matching the training features;
    # the prediction is indexed with [0][0] to print the single value.
    predictedSalary = model.predict(np.array([[yExp]]))
    print("Predicted Salary is ${}".format(predictedSalary[0][0]))

Enter years of Experience: 5
Predicted Salary is $72873.41160011351


In [24]:
import pickle

# Persist the trained model to disk.
# The `with` block closes (and therefore flushes) the file handle even if
# pickling raises; the bare open(...) call left the handle open.
# NOTE: pickle files can execute arbitrary code when loaded, so only load
# "SalaryPredictor.mdl" from a trusted source.
with open("SalaryPredictor.mdl", "wb") as model_file:
    pickle.dump(model, model_file)