## Task-1 : House Price Prediction

> Description: Implement a linear regression model to predict the price of the house

### Collecting and unzipping data

In [1]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d yasserh/housing-prices-dataset

mkdir: cannot create directory ‘/root/.kaggle’: File exists
housing-prices-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [2]:
import zipfile
data=zipfile.ZipFile('/content/housing-prices-dataset.zip')
data.extractall()
data.close()

### Importing the dependencies

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Loading and gathering the data

In [4]:
data=pd.read_csv('Housing.csv')
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [6]:
data.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

### Preprocessing data

In [7]:
data.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [8]:
for col in data.columns:
  if data[col].dtype==object:
    print(col)

mainroad
guestroom
basement
hotwaterheating
airconditioning
prefarea
furnishingstatus


In [9]:
# Handling mainroad columm
data['mainroad']

0      yes
1      yes
2      yes
3      yes
4      yes
      ... 
540    yes
541     no
542    yes
543     no
544    yes
Name: mainroad, Length: 545, dtype: object

In [10]:
mainroad_dummies=pd.get_dummies(data['mainroad'],drop_first=True)
mainroad_dummies.head()

Unnamed: 0,yes
0,1
1,1
2,1
3,1
4,1


In [11]:
mainroad_dummies.rename(columns = {'yes':'mainroad'}, inplace = True)

In [12]:
mainroad_dummies.head()

Unnamed: 0,mainroad
0,1
1,1
2,1
3,1
4,1


In [13]:
data.drop(columns=['mainroad'],inplace=True)
data=pd.concat([data,mainroad_dummies],axis='columns')
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,mainroad
0,13300000,7420,4,2,3,no,no,no,yes,2,yes,furnished,1
1,12250000,8960,4,4,4,no,no,no,yes,3,no,furnished,1
2,12250000,9960,3,2,2,no,yes,no,no,2,yes,semi-furnished,1
3,12215000,7500,4,2,2,no,yes,no,yes,3,yes,furnished,1
4,11410000,7420,4,1,2,yes,yes,no,yes,2,no,furnished,1


In [14]:
# Handling guestroom column
guestroom_dummies=pd.get_dummies(data['guestroom'],drop_first=True)
guestroom_dummies.head()

Unnamed: 0,yes
0,0
1,0
2,0
3,0
4,1


In [15]:
guestroom_dummies.rename(columns = {'yes':'guestroom'}, inplace = True)
data.drop(columns=['guestroom'],inplace=True)
data=pd.concat([data,guestroom_dummies],axis='columns')
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,mainroad,guestroom
0,13300000,7420,4,2,3,no,no,yes,2,yes,furnished,1,0
1,12250000,8960,4,4,4,no,no,yes,3,no,furnished,1,0
2,12250000,9960,3,2,2,yes,no,no,2,yes,semi-furnished,1,0
3,12215000,7500,4,2,2,yes,no,yes,3,yes,furnished,1,0
4,11410000,7420,4,1,2,yes,no,yes,2,no,furnished,1,1


In [16]:
# Handling basement column
basement_dummies=pd.get_dummies(data['basement'],drop_first=True)
basement_dummies.head()

Unnamed: 0,yes
0,0
1,0
2,1
3,1
4,1


In [17]:
basement_dummies.rename(columns = {'yes':'basement'}, inplace = True)
data.drop(columns=['basement'],inplace=True)
data=pd.concat([data,basement_dummies],axis='columns')
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,mainroad,guestroom,basement
0,13300000,7420,4,2,3,no,yes,2,yes,furnished,1,0,0
1,12250000,8960,4,4,4,no,yes,3,no,furnished,1,0,0
2,12250000,9960,3,2,2,no,no,2,yes,semi-furnished,1,0,1
3,12215000,7500,4,2,2,no,yes,3,yes,furnished,1,0,1
4,11410000,7420,4,1,2,no,yes,2,no,furnished,1,1,1


In [18]:
# Handling hotwaterheating
hotwaterheating_dummies=pd.get_dummies(data['hotwaterheating'],drop_first=True)
hotwaterheating_dummies.head()

Unnamed: 0,yes
0,0
1,0
2,0
3,0
4,0


In [19]:
hotwaterheating_dummies.rename(columns = {'yes':'hotwaterheating'}, inplace = True)
data.drop(columns=['hotwaterheating'],inplace=True)
data=pd.concat([data,hotwaterheating_dummies],axis='columns')
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,airconditioning,parking,prefarea,furnishingstatus,mainroad,guestroom,basement,hotwaterheating
0,13300000,7420,4,2,3,yes,2,yes,furnished,1,0,0,0
1,12250000,8960,4,4,4,yes,3,no,furnished,1,0,0,0
2,12250000,9960,3,2,2,no,2,yes,semi-furnished,1,0,1,0
3,12215000,7500,4,2,2,yes,3,yes,furnished,1,0,1,0
4,11410000,7420,4,1,2,yes,2,no,furnished,1,1,1,0


In [20]:
# Handling airconditioning
airconditioning_dummies=pd.get_dummies(data['airconditioning'],drop_first=True)
airconditioning_dummies.head()

Unnamed: 0,yes
0,1
1,1
2,0
3,1
4,1


In [21]:
airconditioning_dummies.rename(columns = {'yes':'airconditioning'}, inplace = True)
data.drop(columns=['airconditioning'],inplace=True)
data=pd.concat([data,airconditioning_dummies],axis='columns')
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,prefarea,furnishingstatus,mainroad,guestroom,basement,hotwaterheating,airconditioning
0,13300000,7420,4,2,3,2,yes,furnished,1,0,0,0,1
1,12250000,8960,4,4,4,3,no,furnished,1,0,0,0,1
2,12250000,9960,3,2,2,2,yes,semi-furnished,1,0,1,0,0
3,12215000,7500,4,2,2,3,yes,furnished,1,0,1,0,1
4,11410000,7420,4,1,2,2,no,furnished,1,1,1,0,1


In [22]:
# Handling prefarea
prefarea_dummies=pd.get_dummies(data['prefarea'],drop_first=True)
prefarea_dummies.head()

Unnamed: 0,yes
0,1
1,0
2,1
3,1
4,0


In [23]:
prefarea_dummies.rename(columns = {'yes':'prefarea'}, inplace = True)
data.drop(columns=['prefarea'],inplace=True)
data=pd.concat([data,prefarea_dummies],axis='columns')
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,furnishingstatus,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea
0,13300000,7420,4,2,3,2,furnished,1,0,0,0,1,1
1,12250000,8960,4,4,4,3,furnished,1,0,0,0,1,0
2,12250000,9960,3,2,2,2,semi-furnished,1,0,1,0,0,1
3,12215000,7500,4,2,2,3,furnished,1,0,1,0,1,1
4,11410000,7420,4,1,2,2,furnished,1,1,1,0,1,0


In [24]:
# Handling furnishingstatus
furnishingstatus_dummies=pd.get_dummies(data['furnishingstatus'],drop_first=True)
furnishingstatus_dummies.head()

Unnamed: 0,semi-furnished,unfurnished
0,0,0
1,0,0
2,1,0
3,0,0
4,0,0


In [25]:
data.drop(columns=['furnishingstatus'],inplace=True)
data=pd.concat([data,furnishingstatus_dummies],axis='columns')
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea,semi-furnished,unfurnished
0,13300000,7420,4,2,3,2,1,0,0,0,1,1,0,0
1,12250000,8960,4,4,4,3,1,0,0,0,1,0,0,0
2,12250000,9960,3,2,2,2,1,0,1,0,0,1,1,0
3,12215000,7500,4,2,2,3,1,0,1,0,1,1,0,0
4,11410000,7420,4,1,2,2,1,1,1,0,1,0,0,0


In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   price            545 non-null    int64
 1   area             545 non-null    int64
 2   bedrooms         545 non-null    int64
 3   bathrooms        545 non-null    int64
 4   stories          545 non-null    int64
 5   parking          545 non-null    int64
 6   mainroad         545 non-null    uint8
 7   guestroom        545 non-null    uint8
 8   basement         545 non-null    uint8
 9   hotwaterheating  545 non-null    uint8
 10  airconditioning  545 non-null    uint8
 11  prefarea         545 non-null    uint8
 12  semi-furnished   545 non-null    uint8
 13  unfurnished      545 non-null    uint8
dtypes: int64(6), uint8(8)
memory usage: 29.9 KB


In [27]:
data.to_csv('cleaned_data.csv')

### Splitting the data

In [28]:
x=data.drop(columns=['price'])
y=data['price']

In [29]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=34)
x_train.shape,y_train.shape,x_test.shape,y_test.shape

((436, 13), (436,), (109, 13), (109,))

In [30]:
# Scaling the data
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
x_train_scaled=sc.fit_transform(x_train)
x_test_scaled=sc.transform(x_test)

## Bulilding the model

In [31]:
from sklearn.linear_model import LinearRegression
lr=LinearRegression()
lr.fit(x_train_scaled,y_train)

In [32]:
y_pred=lr.predict(x_test_scaled)

### Evaluating the model

In [33]:
from sklearn.metrics import r2_score
r2_score(y_pred,y_test)

0.4402540459254831