# Machine Learning With Python: Linear Regression Multiple Variables

In [19]:
import pandas as pd
import numpy as np
from sklearn import linear_model

In [20]:
df = pd.read_csv('hiring.csv')
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


### Data Preprocessing: Fill NA values with median value of a column

In [21]:
df.describe()

Unnamed: 0,test_score(out of 10),interview_score(out of 10),salary($)
count,7.0,8.0,8.0
mean,7.857143,7.875,63000.0
std,1.345185,1.642081,11501.55269
min,6.0,6.0,45000.0
25%,7.0,6.75,57500.0
50%,8.0,7.5,63500.0
75%,8.5,9.25,70500.0
max,10.0,10.0,80000.0


In [22]:
df["experience"]

0       NaN
1       NaN
2      five
3       two
4     seven
5     three
6       ten
7    eleven
Name: experience, dtype: object

In [23]:
df.columns = ['experience', 'written', 'interview','salary']

In [24]:
df

Unnamed: 0,experience,written,interview,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [25]:
new_df= df.fillna({"experience":"zero",
                   "written":df.written.median()})


In [26]:
new_df

Unnamed: 0,experience,written,interview,salary
0,zero,8.0,9,50000
1,zero,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,8.0,7,72000
7,eleven,7.0,8,80000


In [27]:
def wordtonum(x):
    if(x=="zero"):
        return 0
    elif(x=="one"):
        return 1
    elif(x=="two"):
        return 2
    elif(x=="three"):
        return 3
    elif(x=="four"):
        return 4
    elif(x=="five"):
        return 5
    elif(x=="six"):
        return 6
    elif(x=="seven"):
        return 7
    elif(x=="eight"):
        return 8
    elif(x=="nine"):
        return 9
    elif(x=="ten"):
        return 10
    elif(x=="eleven"):
        return 11
    

new_df["experience"]=new_df["experience"].apply(wordtonum)
non_string_df= new_df
non_string_df

Unnamed: 0,experience,written,interview,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,8.0,7,72000
7,11,7.0,8,80000


In [28]:
non_string_df.describe()

Unnamed: 0,experience,written,interview,salary
count,8.0,8.0,8.0,8.0
mean,4.75,7.875,7.875,63000.0
std,4.26782,1.246423,1.642081,11501.55269
min,0.0,6.0,6.0,45000.0
25%,1.5,7.0,6.75,57500.0
50%,4.0,8.0,7.5,63500.0
75%,7.75,8.25,9.25,70500.0
max,11.0,10.0,10.0,80000.0


In [29]:
non_string_df = non_string_df.replace({"experience":0
},np.NaN)
non_string_df

Unnamed: 0,experience,written,interview,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,5.0,6.0,7,60000
3,2.0,10.0,10,65000
4,7.0,9.0,6,70000
5,3.0,7.0,10,62000
6,10.0,8.0,7,72000
7,11.0,7.0,8,80000


In [30]:
non_string_df= non_string_df.fillna({"experience":non_string_df.written.median()
                   })
non_string_df

Unnamed: 0,experience,written,interview,salary
0,8.0,8.0,9,50000
1,8.0,8.0,6,45000
2,5.0,6.0,7,60000
3,2.0,10.0,10,65000
4,7.0,9.0,6,70000
5,3.0,7.0,10,62000
6,10.0,8.0,7,72000
7,11.0,7.0,8,80000


In [31]:
reg = linear_model.LinearRegression()
reg.fit(non_string_df.drop('salary',axis='columns'),non_string_df.salary)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [32]:
reg.coef_

array([1446.40832732,  620.63279398, 1755.02421931])

In [33]:
reg.intercept_

34528.4448108832

In [34]:
reg.predict([[2, 9, 6]])

array([53537.10192724])

In [35]:
reg.predict([[12, 10, 10]])

array([75641.91487169])