# Use-case: An HR company has hired you as an AI engineer. Your goal is to create a model that can predict the salary of the employee based on employee's yearsOfExperience

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('Salary_Data.csv')

In [3]:
data.head()

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   YearsExperience  30 non-null     float64
 1   Salary           30 non-null     float64
dtypes: float64(2)
memory usage: 624.0 bytes


In [5]:
data.dropna(inplace=True)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 0 to 29
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   YearsExperience  30 non-null     float64
 1   Salary           30 non-null     float64
dtypes: float64(2)
memory usage: 720.0 bytes


In [7]:

# Rules for Regression ---- DEEP LEARNING
# =================================================================
# 1. Data must be complete
# 2. Data must be strictly numeric
# 3. Feature column must be represented in the form of 2d np array
# 4. Label column must be represented in the form of 2d np array
# 5. Normalize Features and label 

# Guidelines
#--------------------------------------------------------------------------------
# 1. Features must be normalized (Always use StandardScaler or RobustScaler)
# 2. Label must be normalized (Always use MinMaxScaler with range 0-1)


In [8]:
#Create Feature and label set

features = data.iloc[:,[0]].values
label = data.iloc[:,[1]].values

In [9]:
#label #39343.0.01904087

In [10]:
# Standardization/Normalization

#Features
from sklearn.preprocessing import StandardScaler
scFeatures = StandardScaler()
features = scFeatures.fit_transform(features)


#Label
from sklearn.preprocessing import MinMaxScaler
minmaxLabel = MinMaxScaler()
label = minmaxLabel.fit_transform(label)

In [11]:
minmaxLabel.inverse_transform(np.array([[0.01904087]]))

array([[39343.0000542]])

In [12]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(features,
                                                label,
                                                test_size=0.2,
                                                random_state=10)

## Modelling Starts ....

In [13]:
tf.__version__

NameError: ignored

In [14]:
#Step1: Archiect the model

#pip install tensorflow

import tensorflow as tf


#Step1:Create a Sequential Object

model = tf.keras.models.Sequential()

#Step2: Create Dense Layers
# units = No of Neurons in Hidden Layer
# activation = Which Activation Function to apply for Hidden Layer/Output Layer
# input_shape = Applicable for first layer to specify number of column inputs in training set


# Input Layer1 containing one neuron followed by Hidden layer1 
# containing 100 neurons with activation fn Sigmoid
model.add( tf.keras.layers.Dense( units=100 , activation='relu', input_shape=(1,)) )
model.add( tf.keras.layers.Dense( units=100 , activation='relu') )
model.add( tf.keras.layers.Dense( units=100 , activation='relu') )
model.add( tf.keras.layers.Dense( units=1 , activation='linear') )






In [15]:
#Create a custom r2 function

def r2_keras_custom(y_true,y_pred):
    SS_res = tf.keras.backend.sum(tf.keras.backend.square(y_true-y_pred))
    SS_total = tf.keras.backend.sum(tf.keras.backend.square(y_true - tf.keras.backend.mean(y_true)))
    
    return  (1 - (SS_res / SS_total + tf.keras.backend.epsilon()))

In [16]:
#Step2: Compile Model
# This allows you to initialize 
# 1. optimizer = BackPropogation Algorithm (Stochastic Gradient Descent)
# 2. loss = Specify the loss function
# 3. metrics = Other metrics that can help


model.compile(optimizer="sgd",
             loss="mean_squared_error",
             metrics=[r2_keras_custom])

In [17]:
# Create Custom EarlyStopping Condition --- Callbacks
# I want my model to be generalized and my testScore >= CL
# Assume for this use-case : SL = 0.05
# CL = 1 - SL = 0.95


class MyThresholdCallback(tf.keras.callbacks.Callback):
    def __init__(self,cl):
        super(MyThresholdCallback,self).__init__()
        self.cl=cl
        
    def on_epoch_end(self, epoch, logs=None):
        testScore = logs['val_r2_keras_custom']
        trainScore = logs['r2_keras_custom']
        
        if testScore > trainScore and testScore >= self.cl:
            self.model.stop_training = True

In [18]:
#Step3: Fit the model

CL = 0.95

model.fit(X_train,
         y_train,
         epochs= 1000,
         validation_data=(X_test,y_test),
         callbacks=[MyThresholdCallback(cl=CL)])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<tensorflow.python.keras.callbacks.History at 0x7f2107f8fb90>

In [None]:
#Check the quality of the model
# 1. Check for Generalization
# 2. Compare accuracy with CL

In [19]:
# 1. Check for Generalization

print("Training Score is {} and Testing score is {}".format(model.evaluate(X_train,y_train)[1],
                                                            model.evaluate(X_test,y_test)[1]))

Training Score is 0.9653823375701904 and Testing score is 0.96538907289505


In [20]:
#Since testing score > training score, therefore my model is a generalized model

In [21]:
# 2. Compare accuracy with CL
# SL = 0.05
# CL = 1 - SL = 0.95

# TestScore >= CL ---- TRue therefore model is a good quality model !!!

In [22]:
# Deployment Check for App

# Get the user input

yearsExperience= float(input("Enter Years of experience: "))

# Convert input into numpy 2d array

yearsExperienceNP = np.array([[yearsExperience]])

# Standardize it

yExpStd = scFeatures.transform(yearsExperienceNP)

# Prediction

output = model.predict(yExpStd)
actualOutput = minmaxLabel.inverse_transform(output)

print("Salary Predicted by model is ${}".format(actualOutput[0][0]))

Enter Years of experience: 3
Salary Predicted by model is $53318.09375


In [23]:
# Deploy all relevant objects

model.save('SalPredDL')

INFO:tensorflow:Assets written to: SalPredDL/assets


In [30]:
tf.keras.models.save_model(model, 'sal_pred')

INFO:tensorflow:Assets written to: sal_pred/assets


In [24]:
import pickle
pickle.dump(scFeatures, open('FeatureNormalizer.nm','wb'))
pickle.dump(minmaxLabel, open('LabelNormalizer.nm','wb'))

In [25]:
pickle.dump(r2_keras_custom, open('r2_keras_custom.fn','wb'))

In [29]:
! zip sal.zip SalPredDL/*

  adding: SalPredDL/assets/ (stored 0%)
  adding: SalPredDL/keras_metadata.pb (deflated 89%)
  adding: SalPredDL/saved_model.pb (deflated 88%)
  adding: SalPredDL/variables/ (stored 0%)


In [31]:
model.save('my_tf_model')

INFO:tensorflow:Assets written to: my_tf_model/assets


In [33]:
model.save('test.h5')