### Import required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv('house_price.csv')
df.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


### Data Preprocessing

In [3]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64

In [4]:
df.shape

(20640, 10)

In [5]:
df.dropna(inplace=True)

In [6]:
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
median_house_value    0
dtype: int64

In [7]:
df['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [8]:
le = LabelEncoder()
le.fit_transform(df['ocean_proximity'])

array([3, 3, 3, ..., 1, 1, 1], shape=(20433,))

In [9]:
df['ocean_proximity'] = le.fit_transform(df['ocean_proximity'])
df.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,3,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,3,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,3,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,3,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,3,342200


In [10]:
df['ocean_proximity'].unique()

array([3, 0, 1, 4, 2])

### Data Visualization

In [11]:
# sns.heatmap(data=df, annot=True)
# plt.show()

In [12]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# plt.figure(figsize=(8,5))
# sns.histplot(df['median_house_value'], bins=50, kde=True)
# plt.title('Distribution of House Prices')
# plt.xlabel('House Price')
# plt.ylabel('Frequency')
# plt.show()

### Spliting Data

In [13]:
X = df.drop(columns=['median_house_value'], axis=1)
Y = df['median_house_value']

X.ndim, Y.ndim

(2, 1)

In [14]:
ss = StandardScaler()
ss.fit(X)
X = ss.transform(X)

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=4)

### Model Training

#### Linear regression

In [16]:
lr = LinearRegression()
lr.fit(X_train, Y_train)

In [17]:
lr.score(X_test, Y_test), lr.score(X_train, Y_train)

(0.6128264137725626, 0.6418344588704121)

In [18]:
print(r2_score(Y_test, lr.predict(X_test)))

print(mean_absolute_error(Y_test, lr.predict(X_test)))

print(mean_squared_error(Y_test, lr.predict(X_test)))

0.6128264137725626
51892.0756501476
5091181653.668236


#### Random Forest

In [19]:
rfr = RandomForestRegressor(max_depth=9, criterion='friedman_mse')
rfr.fit(X_train, Y_train)

In [20]:
rfr.score(X_test, Y_test), rfr.score(X_train, Y_train)

(0.7676382237505259, 0.8394016727826735)

In [21]:
print(r2_score(Y_test, lr.predict(X_test)))

print(mean_absolute_error(Y_test, lr.predict(X_test)))

print(mean_squared_error(Y_test, lr.predict(X_test)))

0.6128264137725626
51892.0756501476
5091181653.668236


---

## Export

In [22]:
import pickle
pickle.dump(rfr, open('model.pkl', 'wb'))

---