# Project: Recommend a candidate's salary during the hiring process - [Hiring]

In [1]:
# Import Libraries

# Data science
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Render inline figures in 'retina' format so they do not look blurry
%config InlineBackend.figure_format = 'retina'

# Use seaborn's 'notebook' plotting context for the scaling of plot elements
# NOTE(review): 'notebook' is seaborn's default context, so this likely does not enlarge anything — confirm intent
sns.set_context('notebook')

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Word to number
from word2number import w2n


In [2]:
# Load the hiring dataset from 'hiring.csv' (path is relative to the working directory)

df = pd.read_csv('hiring.csv')
# Preview the first rows
df.head()

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000


### Some basic data exploration 

In [3]:


# Check the shape as (rows, columns)
df.shape

(8, 4)

The data contains 8 samples (rows) and 4 columns.

In [4]:
# Check each column's data type and its count of non-null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   experience                  6 non-null      object 
 1   test_score(out of 10)       7 non-null      float64
 2   interview_score(out of 10)  8 non-null      int64  
 3   salary($)                   8 non-null      int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 384.0+ bytes


In [5]:
# Check the statistical summary of the numeric columns
# (the text-valued 'experience' column is not included in this summary)
df.describe()

Unnamed: 0,test_score(out of 10),interview_score(out of 10),salary($)
count,7.0,8.0,8.0
mean,7.857143,7.875,63000.0
std,1.345185,1.642081,11501.55269
min,6.0,6.0,45000.0
25%,7.0,6.75,57500.0
50%,8.0,7.5,63500.0
75%,8.5,9.25,70500.0
max,10.0,10.0,80000.0


In [6]:
# Count the missing values in each column
df.isna().sum()

experience                    2
test_score(out of 10)         1
interview_score(out of 10)    0
salary($)                     0
dtype: int64

In [7]:
# List the column names

df.columns

Index(['experience', 'test_score(out of 10)', 'interview_score(out of 10)',
       'salary($)'],
      dtype='object')

### Data Wrangling

In [8]:
# Rename test_score(out of 10), interview_score(out of 10) and salary($) to simpler names.
# The result is reassigned rather than renamed with inplace=True, so the transform is explicit.

df = df.rename(columns={
    'test_score(out of 10)': 'test_score',
    'interview_score(out of 10)': 'interview_score',
    'salary($)': 'salary',
})

In [9]:
# Impute the missing test_score value with the column median

# Median of the observed test scores
test_median = df['test_score'].median()
# Fill NaN entries in the test_score column only, leaving other columns untouched
df = df.fillna({'test_score': test_median})


In [10]:
# Fill the missing values in the experience column with the word 'Zero'
# (kept as a word so it matches the other number-word entries in this column)

df = df.fillna({'experience': 'Zero'})

In [11]:
# Convert number words (e.g. 'five') to numeric digits.
# w2n.word_to_num only accepts strings, so values that are not strings
# (for example numbers left by an earlier run of this cell) are kept as they are;
# this makes the cell safe to re-run.
df['experience'] = df['experience'].apply(
    lambda value: w2n.word_to_num(value) if isinstance(value, str) else value
)

In [12]:
# Preview the DataFrame to confirm the cleaning steps took effect
df.head()

Unnamed: 0,experience,test_score,interview_score,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000


### Model Building

In [13]:
# Separate the dependent variable (y) from the independent variables (X)

# Target: the salary column
y = df['salary']
# Features: every column except salary
X = df.drop(columns='salary')

In [14]:
# View the feature DataFrame X
X

Unnamed: 0,experience,test_score,interview_score
0,0,8.0,9
1,0,8.0,6
2,5,6.0,7
3,2,10.0,10
4,7,9.0,6
5,3,7.0,10
6,10,8.0,7
7,11,7.0,8


In [15]:
# View the target y (a Series holding the salary column)
y

0    50000
1    45000
2    60000
3    65000
4    70000
5    62000
6    72000
7    80000
Name: salary, dtype: int64

In [16]:
# Hold out 20% of the rows as a test set; random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [17]:
# Create the linear regression model with its default settings
lmreg = LinearRegression()

In [18]:
# Fit the linear regression on the training data: learn one coefficient per
# feature (plus an intercept) so that the error on the training rows is smallest

lmreg.fit(X_train, y_train)

LinearRegression()

In [19]:
# Score (R^2) of the model on the training data it was fitted on.
# NOTE(review): this is not a measure of generalisation; the held-out X_test / y_test are not scored here
lmreg.score(X_train, y_train)

0.9926622491065888

In [20]:
# Predict the salary for each held-out test row
pred = lmreg.predict(X_test)
pred

array([78186.17460755, 57496.5574222 ])

In [21]:
# Predict the salary for a new candidate with experience = 10, test_score = 5
# and interview_score = 7.
# The model was fitted on a DataFrame, so the new sample is passed as a DataFrame
# with the same columns as X. This makes the feature order explicit and avoids
# scikit-learn's "X does not have valid feature names" warning for bare lists.
new_candidate = pd.DataFrame([[10, 5, 7]], columns=X.columns)
pred_y = lmreg.predict(new_candidate)
pred_y

array([70459.37758193])