In [2]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np


df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129,322.0,126.0,8.3252,452600.0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106,2401.0,1138.0,8.3014,358500.0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190,496.0,177.0,7.2574,352100.0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235,558.0,219.0,5.6431,341300.0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280,565.0,259.0,3.8462,342200.0,0,0,1,0


In [3]:
df.shape

(19675, 13)

### Train and test data preparation

In [4]:
from sklearn.model_selection import train_test_split

X = df.drop('median_house_value', axis=1).values
y = df['median_house_value'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)

### Normalising data

In [5]:
from sklearn.preprocessing import MinMaxScaler

ms=MinMaxScaler()
X_train = ms.fit_transform(X_train)
X_test = ms.transform(X_test)

In [6]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(13772, 12)
(13772,)
(5903, 12)
(5903,)


### Creating models
In this project, I will compare the performances of multiple linear regression and random forest regression

In [15]:
key = ['MultipleLinearRegression', 'RandomForest']
value = [LinearRegression(), RandomForestRegressor()]
reg = LinearRegression().fit(X_train, y_train)

models = dict(zip(key, value))

In [16]:
models

{'MultipleLinearRegression': LinearRegression(),
 'RandomForest': RandomForestRegressor()}

### Training the models

In [17]:
for key, value in models.items():
    value.fit(X_train,y_train)
    train_pred = value.predict(X_train)
    test_pred = value.predict(X_test)
    RMSE = np.sqrt(mean_squared_error(y_test, test_pred))
    r2score = r2_score(y_test,test_pred)
    print(f"{key}: \n ")
    print(f"RMSE Actual: {int(RMSE)}")
    print(f'r2_score: {round(r2score, 2)}')
    print('\n')

MultipleLinearRegression: 
 
RMSE Actual: 61100
r2_score: 0.61


RandomForest: 
 
RMSE Actual: 44729
r2_score: 0.79




From the results above, we see that random forest regression performs better on this task than multiple linear regression as indicated by the RMSE scores. Therefore we should then optimize the model for random forest.