In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
import pickle
# pickle is used to save the model created by us

In [3]:
df = pd.read_csv("hiring.csv")
df

Unnamed: 0,experience,test_score,interview_score,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [4]:
df.isna().sum()

experience         2
test_score         1
interview_score    0
salary             0
dtype: int64

In [5]:
# experience: replace missing values with 0.
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df['experience'] selection: with pandas Copy-on-Write that selection is a
# temporary object, so an in-place fill on it would leave df unchanged.
df['experience'] = df['experience'].fillna(0)
df.isna().sum()

experience         0
test_score         1
interview_score    0
salary             0
dtype: int64

In [6]:
df['test_score'].mean()

7.857142857142857

In [7]:
df['test_score'] = df['test_score'].fillna(df['test_score'].mean())
df.isna().sum()

experience         0
test_score         0
interview_score    0
salary             0
dtype: int64

#### Missing values are handled now (`experience` still needs converting from words to numbers).

In [8]:
df

Unnamed: 0,experience,test_score,interview_score,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,7.857143,7,72000
7,eleven,7.0,8,80000


In [9]:
# Feature matrix: every column except the last one (salary, the target).
# .copy() gives X its own data, so replacing a column on X later is an
# unambiguous write to X and not to a slice of df.
X = df.iloc[:, :-1].copy()
X

Unnamed: 0,experience,test_score,interview_score
0,0,8.0,9
1,0,8.0,6
2,five,6.0,7
3,two,10.0,10
4,seven,9.0,6
5,three,7.0,10
6,ten,7.857143,7
7,eleven,7.0,8


In [10]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   experience       8 non-null      object 
 1   test_score       8 non-null      float64
 2   interview_score  8 non-null      int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 320.0+ bytes


In [11]:
df.experience.unique()

array([0, 'five', 'two', 'seven', 'three', 'ten', 'eleven'], dtype=object)

In [12]:
def conv(x):
    """Convert an experience value to its integer equivalent.

    Parameters
    ----------
    x : str or number
        Either a spelled-out number from 'zero' to 'twelve' (letter case and
        surrounding whitespace are ignored) or a value that is already a
        whole number, such as the 0 used to fill missing entries.

    Returns
    -------
    int
        The numeric value of ``x``.

    Raises
    ------
    KeyError
        If ``x`` is neither a recognised number word nor a whole number.
    """
    # Named word_to_number (not `dict`) so the built-in type is not shadowed.
    word_to_number = {
        'zero': 0,
        'one': 1,
        'two': 2,
        'three': 3,
        'four': 4,
        'five': 5,
        'six': 6,
        'seven': 7,
        'eight': 8,
        'nine': 9,
        'ten': 10,
        'eleven': 11,
        'twelve': 12,
    }

    if isinstance(x, str):
        word = x.strip().lower()
        if word in word_to_number:
            return word_to_number[word]
        raise KeyError(f"unrecognised experience value: {x!r}")

    # Non-string input: pass whole numbers through unchanged, so values that
    # are already numeric (including a second run over converted data) work.
    try:
        number = int(x)
    except (TypeError, ValueError, OverflowError):
        # None, NaN, infinities and other non-numeric objects end up here.
        raise KeyError(f"unrecognised experience value: {x!r}") from None
    if number != x:
        # Fractional values such as 2.5 would be silently truncated; reject them.
        raise KeyError(f"unrecognised experience value: {x!r}")
    return number

In [13]:
# Replace every experience entry with the integer returned by conv().
X['experience'] = X['experience'].map(conv)
X

Unnamed: 0,experience,test_score,interview_score
0,0,8.0,9
1,0,8.0,6
2,5,6.0,7
3,2,10.0,10
4,7,9.0,6
5,3,7.0,10
6,10,7.857143,7
7,11,7.0,8


In [14]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   experience       8 non-null      int64  
 1   test_score       8 non-null      float64
 2   interview_score  8 non-null      int64  
dtypes: float64(1), int64(2)
memory usage: 320.0 bytes


In [15]:
y = df.iloc[:,-1]
y

0    50000
1    45000
2    60000
3    65000
4    70000
5    62000
6    72000
7    80000
Name: salary, dtype: int64

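Here, the dataset is very small, so I am not going to do a train_test_split. However, it is strongly recommended to do one in an actual problem, because a score computed on the same rows the model was trained on overstates how well it will perform on unseen data.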

You can try the same on your boombikes assignment later on as a part of practice.

In [16]:
# Modeling

from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [17]:
# Fit the model
lr.fit(X, y)

LinearRegression()

In [18]:
lr

LinearRegression()

In [19]:
# Prediction Phase

y_pred = lr.predict(X)
y_pred

array([52313.61238494, 45722.68644263, 58231.95591138, 63991.7318464 ,
       67429.06277517, 61080.55179794, 75922.72532666, 79307.67351488])

In [20]:
y

0    50000
1    45000
2    60000
3    65000
4    70000
5    62000
6    72000
7    80000
Name: salary, dtype: int64

In [22]:
list(zip(y, y_pred))

[(50000, 52313.61238494102),
 (45000, 45722.68644262674),
 (60000, 58231.9559113813),
 (65000, 63991.73184640055),
 (70000, 67429.06277516519),
 (62000, 61080.55179794243),
 (72000, 75922.72532666176),
 (80000, 79307.673514881)]

In [23]:
from sklearn.metrics import r2_score
r2_score(y, y_pred)

0.9639958361860579

In [24]:
X

Unnamed: 0,experience,test_score,interview_score
0,0,8.0,9
1,0,8.0,6
2,5,6.0,7
3,2,10.0,10
4,7,9.0,6
5,3,7.0,10
6,10,7.857143,7
7,11,7.0,8


In [25]:
lr.predict([[3,9,7]])

array([58315.5019167])

In [26]:
lr.predict([[10,10,10]])

array([86612.80419155])

In [28]:
lr.predict([[10,2,3]])

array([55930.47274854])

# Model Deployment

In [29]:
# Here, we will save the 'lr' model to disk as 'model.pkl'

import pickle

# Open the file in 'wb' (write bytes) mode because pickle produces binary data.
# The with-block closes the file as soon as the dump finishes, so the file
# handle is not left open after the model has been written.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(lr, model_file)
print("Model saved successfully!")

Model saved successfully!


# Client side

In [30]:
import pickle

# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# model files that come from a trusted source.
# 'rb' = read bytes; the with-block closes the file once the model is loaded.
with open('model.pkl', 'rb') as model_file:
    srikant = pickle.load(model_file)
srikant

LinearRegression()

In [31]:
srikant.predict([[3,9,7]])

array([58315.5019167])

In [32]:
srikant.predict([[10,10,10]])

array([86612.80419155])

In [33]:
srikant.predict([[10,2,3]])

array([55930.47274854])

# Happy Learning