In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
sample_sub = pd.read_csv('sample_submission.csv')

In [4]:
train_df['LatitudeBin'] = pd.cut(train_df['Latitude'], bins=10)
train_df['LongitudeBin'] = pd.cut(train_df['Longitude'], bins=10)
train_df['HouseAgeBin'] = pd.cut(train_df['HouseAge'], bins=10)

In [5]:
train_df.head()

Unnamed: 0,id,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,LatitudeBin,LongitudeBin,HouseAgeBin
0,0,2.3859,15.0,3.82716,1.1121,1280.0,2.486989,34.6,-120.12,0.98,"(34.43, 35.37]","(-120.43, -119.45]","(12.0, 17.0]"
1,1,3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,0.946,"(38.19, 39.13]","(-121.41, -120.43]","(12.0, 17.0]"
2,2,4.775,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,1.576,"(34.43, 35.37]","(-121.41, -120.43]","(22.0, 27.0]"
3,3,2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,1.336,"(32.541, 33.49]","(-117.49, -116.51]","(12.0, 17.0]"
4,4,3.75,52.0,4.284404,1.069246,1793.0,1.60479,37.8,-122.41,4.5,"(37.25, 38.19]","(-123.37, -122.39]","(47.0, 52.0]"


In [6]:
# train test split
from sklearn.model_selection import train_test_split

X = train_df.drop(['MedHouseVal'], axis=1)
y = train_df['MedHouseVal']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4242)

X_train.shape, X_test.shape

((29709, 12), (7428, 12))

In [8]:
# mkae_pipline for preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# make pipeline for numerical features
numerical_features = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']

numerical_transformer = make_pipeline(StandardScaler())

# make pipeline for categorical features
categorical_features = ['HouseAgeBin', 'LatitudeBin', 'LongitudeBin']

categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

In [9]:
# make preprocessor
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [10]:
# make pipeline for models random forest, liinear regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_lr = LinearRegression()

model_rf_pipeline = make_pipeline(preprocessor, model_rf)
model_lr_pipeline = make_pipeline(preprocessor, model_lr)

In [11]:
#fit model
model_rf_pipeline.fit(X_train, y_train)

In [12]:
# predict
y_pred = model_rf_pipeline.predict(X_test)

In [13]:
# evaluate RMSE
from sklearn.metrics import mean_squared_error

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')

RMSE: 0.5870371583742865


In [14]:
def add_bins(df):
    df['LatitudeBin'] = pd.cut(df['Latitude'], bins=10)
    df['LongitudeBin'] = pd.cut(df['Longitude'], bins=10)
    df['HouseAgeBin'] = pd.cut(df['HouseAge'], bins=10)
    return df

In [15]:
test_df = add_bins(test_df)

In [16]:
# rpredict test_df  
y_pred_test = model_rf_pipeline.predict(test_df)

In [17]:
# evaluate RMSE
from sklearn.metrics import mean_squared_error

rmse = np.sqrt(mean_squared_error(sample_sub['MedHouseVal'], y_pred_test))
print(f'RMSE: {rmse}')

RMSE: 0.9786110596159873


In [18]:
# fit model_lr
model_lr_pipeline.fit(X_train, y_train)

In [19]:
y_pred = model_lr_pipeline.predict(X_test)

In [20]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')

RMSE: 1.5263007312090024


In [21]:
y_pred_test = model_lr_pipeline.predict(test_df)

In [22]:
from sklearn.metrics import mean_squared_error

rmse = np.sqrt(mean_squared_error(sample_sub['MedHouseVal'], y_pred_test))
print(f'RMSE: {rmse}')

RMSE: 1.0428249641841212
