# Problem Statement: 
- Predict the house price (`median_house_value`) from features such as housing age, room and bedroom counts, population, income and ocean proximity.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the raw table from the CSV (relative path, resolved against the
# working directory) and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="Airbnbdataset.csv")
data.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY


In [3]:
# (rows, columns) of the freshly loaded frame.
data.shape

(20640, 10)

# Observation:
- The dataset has 20640 rows and 10 columns
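- Each row represents one record of aggregated housing data (median age, total rooms, population, households, median value), i.e. a group of houses rather than a single house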

In [4]:
# Per-column count of missing values.
data.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [6]:
from sklearn.impute import SimpleImputer
# Imputer configured with the "median" strategy; it is only constructed here,
# not yet fitted.
si = SimpleImputer(strategy = "median")
# Echo the configured estimator as the cell output.
si

In [8]:
# Fill the gaps in total_bedrooms using the imputer `si`. The imputer is
# fitted on the whole frame here, i.e. before the train/test split performed
# further down the notebook.
data[['total_bedrooms']] = si.fit_transform(data[['total_bedrooms']])
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY


In [9]:
# Re-count missing values per column to confirm the imputation left none.
data.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [10]:
# Discard the two geographic coordinate columns; every other column is kept.
data = data.drop(columns=['longitude', 'latitude'])
data.head()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,52,1627,280.0,565,259,3.8462,342200,NEAR BAY


In [12]:
# Feature Scaling
# NOTE(review): `ss` is only instantiated here; none of the cells shown in this
# notebook fit it or apply it to the data - confirm whether scaling was intended.

from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss

In [14]:
# Column names, non-null counts and dtypes of the frame at this point
# (after the imputation and the removal of the coordinate columns).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   housing_median_age  20640 non-null  int64  
 1   total_rooms         20640 non-null  int64  
 2   total_bedrooms      20640 non-null  float64
 3   population          20640 non-null  int64  
 4   households          20640 non-null  int64  
 5   median_income       20640 non-null  float64
 6   median_house_value  20640 non-null  int64  
 7   ocean_proximity     20640 non-null  object 
dtypes: float64(2), int64(5), object(1)
memory usage: 1.3+ MB


In [17]:
# Numeric columns that are candidates for scaling. 'number' is the documented
# selector for every numeric dtype, whereas 'int'/'float' only match specific
# widths (e.g. a float32 column would be skipped).
# NOTE(review): the selection still contains the target `median_house_value`
# and is not used by any later cell shown here.
cols_to_be_scaled = data.select_dtypes(include='number')
cols_to_be_scaled.columns.to_list()

['housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value']

In [21]:
# Distinct categories of the only non-numeric column, in order of first appearance.
pd.unique(data['ocean_proximity'])

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [22]:
# One-hot encode the categorical column: one indicator column per category.
# dtype=int pins the indicators to 0/1 integers; pandas >= 2.0 would otherwise
# emit booleans (True/False), which would not match the 0/1 values recorded
# in this notebook's outputs.
data_ohe = pd.get_dummies(data['ocean_proximity'], dtype=int)
data_ohe

Unnamed: 0,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,0,1,0
...,...,...,...,...,...
20635,0,1,0,0,0
20636,0,1,0,0,0
20637,0,1,0,0,0
20638,0,1,0,0,0


In [23]:
# Append the indicator columns to the right of the existing frame
# (rows are aligned on the shared index).
data = pd.concat(objs=[data, data_ohe], axis="columns")
data.head()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,41,880,129.0,322,126,8.3252,452600,NEAR BAY,0,0,0,1,0
1,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY,0,0,0,1,0
2,52,1467,190.0,496,177,7.2574,352100,NEAR BAY,0,0,0,1,0
3,52,1274,235.0,558,219,5.6431,341300,NEAR BAY,0,0,0,1,0
4,52,1627,280.0,565,259,3.8462,342200,NEAR BAY,0,0,0,1,0


In [24]:
# Remove the original text column so that only numeric columns remain.
data = data.drop(columns='ocean_proximity')
data.head()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,41,880,129.0,322,126,8.3252,452600,0,0,0,1,0
1,21,7099,1106.0,2401,1138,8.3014,358500,0,0,0,1,0
2,52,1467,190.0,496,177,7.2574,352100,0,0,0,1,0
3,52,1274,235.0,558,219,5.6431,341300,0,0,0,1,0
4,52,1627,280.0,565,259,3.8462,342200,0,0,0,1,0


# Separate X and y

In [25]:
# Target vector first, then the feature matrix (every column except the target).
y = data['median_house_value']
X = data.drop(columns="median_house_value")

In [26]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state is fixed so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [27]:
from sklearn.linear_model import LinearRegression
# Linear regression estimator with its default settings; not fitted yet.
lr = LinearRegression()
# Echo the estimator as the cell output.
lr

In [28]:
# Fit the model on the training split only; the test split stays unseen.
lr.fit(X_train, y_train)

In [29]:
# Predict the target for the held-out test rows and show the resulting array.
y_pred = lr.predict(X_test)
y_pred

array([222053.01794996, 296876.0758888 , 174381.47911642, ...,
       110268.26957402, 229329.43340407, 221774.05601418])

In [30]:
# Intercept term of the fitted model.
lr.intercept_

53818.43664629353

In [31]:
# Coefficients of the fitted model. The recorded output has 11 values,
# matching the 11 feature columns of X.
lr.coef_

array([ 1.18818834e+03, -5.83271615e+00,  5.31562949e+01, -3.75192943e+01,
        1.00725059e+02,  3.99980828e+04, -2.65978405e+04, -9.39737174e+04,
        1.55217573e+05, -2.25450172e+04, -1.21009985e+04])

In [32]:
from sklearn.metrics import r2_score, mean_squared_error

# R^2 of the predictions against the held-out targets.
r2_score(y_true=y_test, y_pred=y_pred)

0.6276733193474345

In [33]:
# Root of the mean squared error on the test split, i.e. the error expressed
# in the same units as the target.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

69677.62915739177