# Business Problem - Predict the Price of Bangalore House
Using Linear Regression - Supervised Machine Learning Algorithm

### Load Libraries

In [1]:
import pandas as pd

### Load Data

In [2]:
df = pd.read_csv("house.csv")

In [3]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
df1=df.copy()
df1.drop(['area_type','society','availability','balcony'],axis=1,inplace=True)


In [5]:
df1

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.00
2,Uttarahalli,3 BHK,1440,2.0,62.00
3,Lingadheeranahalli,3 BHK,1521,3.0,95.00
4,Kothanur,2 BHK,1200,2.0,51.00
...,...,...,...,...,...
13315,Whitefield,5 Bedroom,3453,4.0,231.00
13316,Richards Town,4 BHK,3600,5.0,400.00
13317,Raja Rajeshwari Nagar,2 BHK,1141,2.0,60.00
13318,Padmanabhanagar,4 BHK,4689,4.0,488.00


In [6]:
df1.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [7]:
df2=df1.dropna()

In [8]:
df3=df2.copy()
df3['BHK']=df2['size'].apply(lambda x:int(x.split()[0]))
df3.head()

Unnamed: 0,location,size,total_sqft,bath,price,BHK
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [9]:
df3.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
BHK           0
dtype: int64

In [10]:
df3[df3.BHK <20]

Unnamed: 0,location,size,total_sqft,bath,price,BHK
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.00,4
2,Uttarahalli,3 BHK,1440,2.0,62.00,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.00,3
4,Kothanur,2 BHK,1200,2.0,51.00,2
...,...,...,...,...,...,...
13315,Whitefield,5 Bedroom,3453,4.0,231.00,5
13316,Richards Town,4 BHK,3600,5.0,400.00,4
13317,Raja Rajeshwari Nagar,2 BHK,1141,2.0,60.00,2
13318,Padmanabhanagar,4 BHK,4689,4.0,488.00,4


In [12]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [14]:
df3[df3['total_sqft'].apply(is_float)].head()

Unnamed: 0,location,size,total_sqft,bath,price,BHK
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [17]:
df4=df3.copy() 
df4['price_per_sqft']=df4['price']*100 / df4['total_sqft']
df4.head()

TypeError: unsupported operand type(s) for /: 'float' and 'str'

In [18]:
df4.location=df4.location.apply(lambda x:x.strip())
location_stats=df4['location'].value_counts(ascending=False)
location_stats

Whitefield         535
Sarjapur  Road     392
Electronic City    304
Kanakpura Road     266
Thanisandra        236
                  ... 
Jagadish Nagar       1
Haralur Road,        1
Anantapuram          1
BAGUR                1
Gollarahatti         1
Name: location, Length: 1293, dtype: int64

In [30]:
location_stats_less_than_10=location_stats[location_stats<=20]

location_stats_less_than_10

Yelachenahalli          20
Binny Pete              20
Poorna Pragna Layout    20
HBR Layout              20
Sanjay nagar            20
                        ..
Jagadish Nagar           1
Haralur Road,            1
Anantapuram              1
BAGUR                    1
Gollarahatti             1
Name: location, Length: 1150, dtype: int64

In [31]:
df4.location=df4.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x )
len(df4.location.unique())

97

In [32]:
df5=df4.copy()
dummies=pd.get_dummies(df5['location'])

In [33]:
df6=df5.copy()
df6=pd.concat([df5,dummies.drop('other',axis=1)],axis=1)

In [35]:
df6.isnull().sum()
df6=df6.dropna()

In [42]:
X=df6.drop(['price','location','BHK','size'],axis=1)
X

Unnamed: 0,total_sqft,bath,5th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,Akshaya Nagar,Ambedkar Nagar,Anekal,Attibele,...,Tumkur Road,Uttarahalli,Varthur,Vidyaranyapura,Vijayanagar,Vittasandra,Whitefield,Yelahanka,Yelahanka New Town,Yeshwanthpur
0,1056,2.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2600,5.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1440,2.0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,1521,3.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1200,2.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13315,3453,4.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
13316,3600,5.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13317,1141,2.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13318,4689,4.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:

y = df6['price']



In [44]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

print('Shape of X_train = ', X_train.shape)
print('Shape of y_train = ', y_train.shape)
print('Shape of X_test = ', X_test.shape)
print('Shape of y_test = ', y_test.shape)

Shape of X_train =  (10596, 98)
Shape of y_train =  (10596,)
Shape of X_test =  (2650, 98)
Shape of y_test =  (2650,)


In [45]:
from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(X_train,y_train)

ValueError: could not convert string to float: '1210 - 1477'

### Split Data

In [None]:
X = df.drop('price', axis=1)
y = df['price']

print('Shape of X = ', X.shape)
print('Shape of y = ', y.shape)

In [None]:
df['availability']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

print('Shape of X_train = ', X_train.shape)
print('Shape of y_train = ', y_train.shape)
print('Shape of X_test = ', X_test.shape)
print('Shape of y_test = ', y_test.shape)

### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Linear Regression - ML Model Training

In [None]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

lr.fit(X_train, y_train)

In [None]:
lr.coef_

In [None]:
lr.intercept_

## Predict the value of Home and Test

In [None]:
X_test[0, :]

In [None]:
lr.predict([X_test[0, :]])

In [None]:
lr.predict(X_test)

In [None]:
y_test

In [None]:
lr.score(X_test, y_test)

In [None]:
net = gcv.model_zoo.get_model('yolo3_darknet53_voc', pretrained=True)

In [None]:
net = model_zoo.get_model('yolo3_darknet53_voc', pretrained=True)