# Linear Regression With Multiple Variable

<img src='general_equation_m_variable.jpg'>

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model

In [33]:
df = pd.read_csv('./data/homeprices2.csv')
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [7]:
# Here, x1=area, x2=bedrooms, x3=age

In [13]:
df.isnull().sum()

area        0
bedrooms    1
age         0
price       0
dtype: int64

#### Data Preprocessing: *Fill NA values with median value of bedrooms column*

In [4]:
df.bedrooms.median()

4.0

In [34]:
df.bedrooms = df.bedrooms.fillna(df.bedrooms.median())
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,4.0,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [35]:
X = df.drop('price',axis='columns')
Y = df['price']
reg = linear_model.LinearRegression()
reg.fit(X,Y)

In [36]:
reg.coef_

array([  112.06244194, 23388.88007794, -3231.71790863])

In [37]:
reg.intercept_

221323.00186540408

#### Predicting

In [38]:
reg.predict([[3000, 3, 40]])   # Price of home with 3000 sqr ft area, 3 bedrooms, 40 year old



array([498408.25158031])

In [39]:
112.06244194*3000 + 23388.88007794*3 + -3231.71790863*40 + 221323.00186540384 # Verifying with eqn

498408.25157402386

In [40]:
reg.predict([[2500, 4, 5]])     # Price of home with 2500 sqr ft area, 4 bedrooms, 5 year old



array([578876.03748933])

## *Another Exercise*

#### *Building a machine learning model for HR department that can help them decide salaries for future candidates.

In [42]:
d = pd.read_csv("./data/hiring.csv")
d

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


### Data Preprocessing

In [43]:
d['experience'] = d['experience'].fillna("zero")  #fixing na values of experience column
d

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,zero,8.0,9,50000
1,zero,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [22]:
from word2number import w2n


d.experience = d.experience.apply(w2n.word_to_num)  # word to number
d

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,,7,72000
7,11,7.0,8,80000


In [23]:
import math

median_test_score = math.floor(d['test_score(out of 10)'].mean())
median_test_score

7

In [24]:
d['test_score(out of 10)'] = d['test_score(out of 10)'].fillna(median_test_score)
d

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,7.0,7,72000
7,11,7.0,8,80000


In [25]:
reg = linear_model.LinearRegression()
reg.fit(d.drop('salary($)',axis='columns'),d['salary($)'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [26]:
reg.predict([[2,9,6]])       # predicting for 2 yr experience, 9 test score, 6 interview score

array([53713.86677124])

In [27]:
reg.predict([[12,10,10]])    # Predicting for 12 yr experience, 10 test score, 10 interview score

array([93747.79628651])