In [1]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [2]:
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv"
df = pd.read_csv(url)
df_back = df

In [3]:
df = df[(df['ocean_proximity'] == '<1H OCEAN') | (df['ocean_proximity'] == 'INLAND')]

df.reset_index(drop=True, inplace=True)

In [4]:
df.total_bedrooms = df.total_bedrooms.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.total_bedrooms = df.total_bedrooms.fillna(0)


In [5]:
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [6]:
df['median_house_value'] = np.log1p(df['median_house_value'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['median_house_value'] = np.log1p(df['median_house_value'])


In [7]:
seed = 1
np.random.seed(seed)

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=seed)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=seed)

df_train = df_train.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

y_train = df_train['median_house_value'].values
y_val = df_val['median_house_value'].values
y_test = df_test['median_house_value'].values


del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [8]:
df_train.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity'],
      dtype='object')

In [9]:
df_back.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [10]:
train_dicts = df_train.to_dict(orient = 'records')
val_dicts = df_val.to_dict(orient = 'records')

In [11]:
dv = DictVectorizer(sparse = False)

X_train = dv.fit_transform(train_dicts)
X_val = dv.fit_transform(val_dicts)

In [12]:
dt_regres = DecisionTreeRegressor(max_depth=1)
dt_regres.fit(X_train,y_train)

In [13]:
dv.feature_names_

['households',
 'housing_median_age',
 'latitude',
 'longitude',
 'median_income',
 'ocean_proximity=<1H OCEAN',
 'ocean_proximity=INLAND',
 'population',
 'total_bedrooms',
 'total_rooms']

In [14]:
tree_text = export_text(dt_regres, feature_names = dv.feature_names_)

In [15]:
print(tree_text)

|--- ocean_proximity=<1H OCEAN <= 0.50
|   |--- value: [11.61]
|--- ocean_proximity=<1H OCEAN >  0.50
|   |--- value: [12.30]



## Answer1: Ocean proximity

In [16]:
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)

In [17]:
rf.fit(X_train, y_train)

In [18]:
y_pred = rf.predict(X_val)
np.sqrt(mean_squared_error(y_val, y_pred))

0.24472888684076874

## Answer2: 0.245

In [19]:
scores = []

for n in tqdm(range(10, 201, 10)):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_val)
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    
    scores.append((n, score))

df_scores = pd.DataFrame(scores, columns=['n_estimators', 'rmse'])

  0%|          | 0/20 [00:00<?, ?it/s]

In [20]:

plateau_index = np.where(df_scores['rmse'] == df_scores['rmse'].min())[0][0]

n_estimators_plateau = df_scores.loc[plateau_index, 'n_estimators']

print(f"Answer = {n_estimators_plateau:.3f}")


Answer = 170.000


## Answer3 : 160

In [21]:
scores = []

for d in tqdm([10, 15, 20, 25]):
    rf = RandomForestRegressor(n_estimators=0, max_depth=d, random_state=1, n_jobs=-1, warm_start=True)

    for n in tqdm(range(10, 201, 10)):
        rf.n_estimators = n
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_val)
        score = np.sqrt(mean_squared_error(y_val, y_pred))

        scores.append((d, n, score))

columns = ['max_depth', 'n_estimators', 'rmse']
df_scores = pd.DataFrame(scores, columns=columns)

mean_rmse_by_depth = df_scores.groupby('max_depth')['rmse'].mean()


best_max_depth = mean_rmse_by_depth.idxmin()

print(f"The best max_depth is {best_max_depth}")

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

The best max_depth is 25


## Answer4: 25

In [22]:
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)

rf.fit(X_train, y_train)

In [23]:
feature_importances = rf.feature_importances_


In [24]:
df_importances = pd.DataFrame()
df_importances['feature'] = dv.get_feature_names_out()
df_importances['importance'] = rf.feature_importances_
df_importances

Unnamed: 0,feature,importance
0,households,0.014905
1,housing_median_age,0.030043
2,latitude,0.102717
3,longitude,0.085791
4,median_income,0.335507
5,ocean_proximity=<1H OCEAN,0.21882
6,ocean_proximity=INLAND,0.14746
7,population,0.028212
8,total_bedrooms,0.015228
9,total_rooms,0.021319


## Answer 5: median_income

In [25]:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

In [26]:
import xgboost as xgb

In [28]:
features = dv.get_feature_names_out()


In [30]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

In [31]:
watchlist = [(dtrain, 'train'), (dval, 'eval')]

In [33]:
etas = [0.3, 0.1]
rmse_scores = {}

In [35]:
for eta in etas:

    xgb_params = {
        'eta': eta, 
        'max_depth': 6,
        'min_child_weight': 1,

        'objective': 'reg:squarederror',
        'nthread': 8,

        'seed': 1,
        'verbosity': 1,
    }

    num_round = 100
    model = xgb.train(xgb_params, dtrain, num_round, evals=watchlist, early_stopping_rounds=10, verbose_eval=False)


    y_pred = model.predict(dval)

    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    
    rmse_scores[eta] = rmse

for eta, rmse in rmse_scores.items():
    print(f"eta = {eta}: RMSE = {rmse:.4f}")

best_eta = min(rmse_scores, key=rmse_scores.get)
print(f"The best eta is {best_eta} with RMSE = {rmse_scores[best_eta]:.4f}")

eta = 0.3: RMSE = 0.2290
eta = 0.1: RMSE = 0.2323
The best eta is 0.3 with RMSE = 0.2290


## Answer 6: 0.3