In [22]:
import pickle
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

In [8]:
# Fetch the California housing dataset bundled with scikit-learn.
data = fetch_california_housing()

# Build a single frame: one column per feature name, with the target
# values appended as the 'MedHouseVal' column.
df = (
    pd.DataFrame(data.data, columns=data.feature_names)
    .assign(MedHouseVal=data.target)
)

# Print the column/dtype summary, then display the first rows as cell output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [9]:
# Column roles used by the cells below: the single regression target and
# the predictor columns (taken from the dataset's own feature names).
target_col = 'MedHouseVal'
feature_cols = data.feature_names

In [10]:
# Hold out 30% of the rows; random_state is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df[feature_cols],
    df[target_col],
    test_size=0.3,
    random_state=0,
)

# Display the shapes of the four pieces as a sanity check.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((14448, 8), (6192, 8), (14448,), (6192,))

In [19]:
# Keyword arguments for LGBMRegressor.fit().
#
# Early stopping and per-iteration logging are configured through callbacks:
# the `early_stopping_rounds` and `verbose` keyword arguments of fit() were
# deprecated in LightGBM 3.3 and removed in 4.0, so passing them raises a
# TypeError on current releases.
#
# NOTE(review): early stopping is driven by the held-out test split, so the
# chosen number of trees is tuned on the same data later used for evaluation.
# Consider carving out a separate validation split — confirm intent.
params = {
    'X': x_train,
    'y': y_train,
    # fit() documents eval_set as a list of (X, y) pairs.
    'eval_set': [(x_test, y_test)],
    'eval_metric': 'l2',
    'callbacks': [
        # Stop once the validation metric has not improved for 50 rounds.
        lgb.early_stopping(stopping_rounds=50),
        # Log the validation metric on every boosting iteration.
        lgb.log_evaluation(period=1),
    ],
}

# Gradient-boosted regressor; n_estimators is an upper bound because early
# stopping may end training sooner. importance_type='gain' makes
# `feature_importances_` report total split gain instead of split counts.
model = lgb.LGBMRegressor(
    objective='regression',
    n_estimators=1000,
    importance_type='gain',
    random_state=0
).fit(**params)



[1]	valid_0's l2: 1.17316
[2]	valid_0's l2: 1.04071
[3]	valid_0's l2: 0.930806
[4]	valid_0's l2: 0.839008
[5]	valid_0's l2: 0.75902
[6]	valid_0's l2: 0.691133
[7]	valid_0's l2: 0.634813
[8]	valid_0's l2: 0.586905
[9]	valid_0's l2: 0.545603
[10]	valid_0's l2: 0.513092
[11]	valid_0's l2: 0.483271
[12]	valid_0's l2: 0.458585
[13]	valid_0's l2: 0.43725
[14]	valid_0's l2: 0.419599
[15]	valid_0's l2: 0.402946
[16]	valid_0's l2: 0.389476
[17]	valid_0's l2: 0.377808
[18]	valid_0's l2: 0.363984
[19]	valid_0's l2: 0.355093
[20]	valid_0's l2: 0.348315
[21]	valid_0's l2: 0.337366
[22]	valid_0's l2: 0.328551
[23]	valid_0's l2: 0.322994
[24]	valid_0's l2: 0.315415
[25]	valid_0's l2: 0.309296
[26]	valid_0's l2: 0.302904
[27]	valid_0's l2: 0.295839
[28]	valid_0's l2: 0.28996
[29]	valid_0's l2: 0.286476
[30]	valid_0's l2: 0.282586
[31]	valid_0's l2: 0.278779
[32]	valid_0's l2: 0.276506
[33]	valid_0's l2: 0.273593
[34]	valid_0's l2: 0.270231
[35]	valid_0's l2: 0.268745
[36]	valid_0's l2: 0.266182
[37]	v

In [21]:
# Pair every feature name with the importance reported by the fitted model
# and rank the rows from most to least important.
df_importance = (
    pd.DataFrame({'feature': feature_cols, 'gain': model.feature_importances_})
    .sort_values(by='gain', ascending=False)
)

df_importance

Unnamed: 0,feature,gain
0,MedInc,51570.542924
5,AveOccup,12306.917106
7,Longitude,11006.996057
6,Latitude,10759.996946
1,HouseAge,4437.360295
2,AveRooms,3125.337069
3,AveBedrms,1569.160782
4,Population,1399.797162


In [24]:
# Persist the fitted estimator with pickle.
# The target directory is created first so this cell also works on a fresh
# checkout where 'model/' does not exist yet (open() would otherwise raise
# FileNotFoundError).
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# this artifact from a trusted location.
model_path = Path('model/model.pickle')
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open('wb') as f:
    pickle.dump(model, f)

In [26]:
# Write the four train/test pieces to CSV (without the index column).
# The output directory is created first so the cell does not fail when
# 'data/' is missing.
data_dir = Path('data')
data_dir.mkdir(parents=True, exist_ok=True)

splits = {
    'x_train': x_train,
    'x_test': x_test,
    'y_train': y_train,
    'y_test': y_test,
}
for name, part in splits.items():
    # Produces data/x_train.csv, data/x_test.csv, data/y_train.csv, data/y_test.csv.
    part.to_csv(data_dir / f'{name}.csv', index=False)