# Use case: An HR company has hired you as an AI engineer. Your goal is to create a model that predicts an employee's salary from their years of experience.

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the salary dataset; the relative path is resolved against the
# notebook's current working directory.
data = pd.read_csv('Salary_Data.csv')

In [4]:
# Preview the first rows to confirm the expected columns were loaded.
data.head()

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0


In [5]:
# Inspect row count, dtypes and non-null counts; a non-null count lower than
# the number of entries reveals missing values that must be handled.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   YearsExperience  30 non-null     float64
 1   Salary           30 non-null     float64
dtypes: float64(2)
memory usage: 624.0 bytes


In [7]:
# Remove rows that contain missing values (regression needs complete data)
# and rebind `data` to the cleaned frame.
data = data.dropna()

In [8]:
# Re-check after cleaning: every column's non-null count should now equal the
# number of entries.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 0 to 29
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   YearsExperience  30 non-null     float64
 1   Salary           30 non-null     float64
dtypes: float64(2)
memory usage: 720.0 bytes


In [9]:
# Rules for Regression
# 1. Data must be complete
# 2. Data must be strictly numeric
# 3. Feature column must be represented in the form of 2d np array
# 4. Label column must be represented in the form of 2d np array

In [10]:
# Create feature and label sets.
# Columns are selected by name rather than by position so the right data is
# used even if the CSV column order changes. The double brackets keep each
# selection a DataFrame, so both arrays are 2D with shape (n_rows, 1).

features = data[['YearsExperience']].to_numpy()
label = data[['Salary']].to_numpy()

In [11]:
# Train/test split
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=10
)

In [12]:
# Modelling

from sklearn.linear_model import LinearRegression

# Linear regression estimator with default settings.
model = LinearRegression()

# Train on the training split only. fit() is left as the last expression so
# the cell echoes the fitted estimator.
model.fit(X=X_train, y=y_train)

LinearRegression()

In [13]:
#Check the quality of the model
# 1. Check for Generalization
# 2. Compare accuracy with CL

In [14]:
# 1. Check for generalization
# Score the fitted model on the training split and on the held-out test split
# so the two values can be compared side by side.

train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("Training Score is {} and Testing score is {}".format(train_score, test_score))

Training Score is 0.9494673013344644 and Testing score is 0.9816423482070253


In [15]:
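# Since the testing score (0.98) is higher than the training score (0.95), the model is not overfitting, so it is a generalized model.
# Note: the test split holds only 6 rows (20% of 30), so the testing score is a rough estimate.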

In [16]:
# 2. Compare accuracy with CL
# SL = 0.05
# CL = 1 - SL = 0.95

# TestScore (0.98) >= CL (0.95) is True, therefore the model is a good-quality model.

In [23]:
# Deployment check for the app

# Get the user input; float() raises ValueError if the entry is not numeric.
yearsExperience = float(input("Enter Years of experience: "))

# Reject NaN, infinite and negative entries: they are not meaningful years of
# experience and would otherwise reach the model unchecked.
if not np.isfinite(yearsExperience) or yearsExperience < 0:
    raise ValueError("Years of experience must be a non-negative, finite number")

# Convert the input into a 2D NumPy array of shape (1, 1): one sample with one
# feature, matching the 2D layout used for the training features.
yearsExperienceNP = np.array([[yearsExperience]])

# Prediction: the label was 2D during training, so predict() returns a 2D
# array and [0][0] extracts the single predicted value as a plain float.
predictedSalary = float(model.predict(yearsExperienceNP)[0][0])

# Format as whole dollars. Calling round() on the NumPy value and printing it
# produced a trailing ".0" (e.g. "$54160.0"); the explicit format avoids that.
print("Salary Predicted by model is ${:.0f}".format(predictedSalary))

Enter Years of experience: 3
Salary Predicted by model is $54160.0


In [24]:
# Deploy all relevant objects
import pickle

# Serialize the trained model to disk. The context manager guarantees the
# file is flushed and closed even if pickling raises, instead of leaving an
# open handle behind.
with open('SalaryPredictor.mdl', 'wb') as model_file:
    pickle.dump(model, model_file)