In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder

# Import data

In [2]:
# Read the raw listings from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer="house_price.csv")

In [3]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Location,BHK,Furnishing,Sq.ft,Old(years),Floor,Price
0,37,Bommanahalli,3,1,3000,1,3,28000
1,43,Bommanahalli,3,1,1650,10,0,18000
2,12,Whitefield,2,0,1000,5,3,16400
3,8,Whitefield,3,0,1600,1,9,27000
4,9,Whitefield,2,1,1200,5,1,20000


In [4]:
# Drop the leading index-like column that read_csv labelled "Unnamed: ...".
# The label check keeps this cell safe to re-run: a bare `data.iloc[:, 1:]`
# strips one more (real) column every time the cell is executed.
if str(data.columns[0]).startswith("Unnamed"):
    data = data.drop(columns=data.columns[0])
data.head()

Unnamed: 0,Location,BHK,Furnishing,Sq.ft,Old(years),Floor,Price
0,Bommanahalli,3,1,3000,1,3,28000
1,Bommanahalli,3,1,1650,10,0,18000
2,Whitefield,2,0,1000,5,3,16400
3,Whitefield,3,0,1600,1,9,27000
4,Whitefield,2,1,1200,5,1,20000


In [5]:
# Label-encode the first column (the location names) as integer codes.
# Assigning through the column label replaces the whole column, so it takes
# the encoder's integer dtype. `data.iloc[:, 0] = ...` writes in place on
# pandas >= 2.0 and leaves the column object-typed, which would make the
# later pd.get_dummies call one-hot encode it instead of passing it through.
# NOTE(review): integer codes impose an ordering on locations that a linear
# model treats as numeric; consider one-hot encoding instead.
enc = LabelEncoder()
data[data.columns[0]] = enc.fit_transform(data.iloc[:, 0])

In [6]:
# Check the first five rows after encoding.
data.head(n=5)

Unnamed: 0,Location,BHK,Furnishing,Sq.ft,Old(years),Floor,Price
0,0,3,1,3000,1,3,28000
1,0,3,1,1650,10,0,18000
2,1,2,0,1000,5,3,16400
3,1,3,0,1600,1,9,27000
4,1,2,1,1200,5,1,20000


In [7]:
# Feature matrix: every row, first six columns of the frame.
X = data.iloc[:, 0:6]

In [8]:
# Preview the first five feature rows.
X.head(n=5)

Unnamed: 0,Location,BHK,Furnishing,Sq.ft,Old(years),Floor
0,0,3,1,3000,1,3
1,0,3,1,1650,10,0
2,1,2,0,1000,5,3
3,1,3,0,1600,1,9
4,1,2,1,1200,5,1


In [9]:
# Target vector: the Price column.
Y = data["Price"]

In [10]:
# Preview the first five target values.
Y.head(n=5)

0    28000
1    18000
2    16400
3    27000
4    20000
Name: Price, dtype: int64

In [11]:
# One-hot encode any object/categorical feature columns, dropping the first
# level of each; purely numeric columns pass through unchanged.
X = pd.get_dummies(data=X, drop_first=True)

In [12]:
# Hold out 33% of the rows for testing. The fixed random_state makes the
# split, and therefore the fitted model and its score, reproducible across
# kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)

In [13]:
# Show a bounded sample of the training features rather than the whole frame.
X_train.head(10)

Unnamed: 0,Location,BHK,Furnishing,Sq.ft,Old(years),Floor
21,0,2,0,1033,5,0
381,0,3,1,1460,1,2
873,0,2,0,1135,10,2
632,0,3,0,1250,1,1
597,0,2,0,1135,10,2
668,0,3,0,1500,10,1
854,0,3,1,1650,10,0
921,1,3,1,1870,10,3
559,0,3,0,1250,1,1
143,0,2,1,1089,5,2


In [14]:
# Fit an ordinary least-squares regressor on the training split.
# The fit call stays as the last expression so the cell still displays the
# fitted estimator.
linear = LinearRegression()
linear.fit(X=X_train, y=Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [15]:
# Predict prices for the held-out test features and keep them in `pred`.
pred = linear.predict(X=X_test)

In [16]:
# R² (coefficient of determination) of the predictions against the true
# test prices.
r2_score(y_true=Y_test, y_pred=pred)

0.8154355822956858

In [17]:
# Persist the fitted model so it can be reloaded later to predict the price
# of a house from its features.
# `sklearn.externals.joblib` was removed in scikit-learn 0.23; import the
# standalone joblib package and fall back to the old path only on
# installations that predate the change.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(linear, "hp_model.ml")

['hp_model.ml']