In [1]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import lightgbm as lgb

In [38]:
train_data = pd.read_csv('./train_noclean.csv')
test_data = pd.read_csv('./test_noclean.csv')

y_train = train_data.loc[:,'price']
train_data = train_data.drop(columns='price')
num_features = len(train_data.columns)
features = list(train_data.columns[2:num_features])
X_train = train_data.loc[:,features]

features = list(test_data.columns[2:num_features])

X_test = test_data.loc[:,features]
test_labels = test_data.loc[:,'id']

test_labels = test_labels.to_frame()

In [3]:
def run_model(X_train, y_train, learn, max_depth, num_class, num_leaves, min_data_in_leaf, rounds):
    params={}
    params['learning_rate']=0.03
    params['boosting_type']='gbdt' #GradientBoostingDecisionTree
    params['objective']='multiclass' #Multi-class target feature
    params['metric']='multi_error' #metric for multi-class
    params['max_depth']=max_depth
    params['num_class']=num_class
    params['num_leaves']=num_leaves
    clf=lgb.train(params,d_train,rounds)

In [21]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

kf = KFold(n_splits = 10, shuffle=True)


X = np.concatenate([X_train], axis= 0)
y = np.concatenate([y_train], axis= 0)
param_grid = {
    'n_estimators': [100, 400, 700, 1000],
    'colsample_bytree': [0.7, 0.8],
    'max_depth': [15,20,25],
    'num_leaves': [30, 50, 100, 200],
    'reg_alpha': [1.1, 1.2, 1.3],
    'reg_lambda': [1.1, 1.2, 1.3],
    'min_split_gain': [0.3, 0.4],
    'subsample': [0.7, 0.8, 0.9],
    'subsample_freq': [20]
}
lgb_estimator = lgb.LGBMClassifier(boosting_type='gbdt',  objective='multiclass', learning_rate=0.01, metric='multi_error')
rsearch = RandomizedSearchCV(estimator=lgb_estimator, param_distributions=param_grid, cv=kf, n_iter = 20, n_jobs = -1, verbose=10)
lgb_model = rsearch.fit(X=X, y=y)


print(lgb_model.best_params_, lgb_model.best_score_)


Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:   20.5s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:   43.1s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   46.7s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 190 out of 200 | elapsed:  2.8min remaining:    8.7s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  2.9min finished


{'subsample_freq': 20, 'subsample': 0.9, 'reg_lambda': 1.3, 'reg_alpha': 1.1, 'num_leaves': 50, 'n_estimators': 1000, 'min_split_gain': 0.4, 'max_depth': 20, 'colsample_bytree': 0.7} 0.5587244880553351


In [23]:
best_param = {'subsample_freq': 20, 'subsample': 0.9, 'reg_lambda': 1.3, 'reg_alpha': 1.1, 'num_leaves': 50, 'n_estimators': 1000, 'min_split_gain': 0.4, 'max_depth': 20, 'colsample_bytree': 0.7}
d_train=lgb.Dataset(X, y)
clf=lgb.train(best_param,d_train)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1523
[LightGBM] [Info] Number of data points in the train set: 9681, number of used features: 52
[LightGBM] [Info] Start training from score 2.495920


In [39]:
y_pred=clf.predict(X_test)

In [40]:
y_pred = [np.round(line) for line in y_pred]

In [41]:
print(y_pred)

[2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 4.0, 2.0, 3.0, 2.0, 2.0, 2.0, 1.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 2.0, 2.0, 3.0, 2.0, 4.0, 4.0, 3.0, 1.0, 4.0, 3.0, 3.0, 2.0, 2.0, 3.0, 3.0, 4.0, 1.0, 1.0, 1.0, 4.0, 3.0, 3.0, 1.0, 4.0, 4.0, 4.0, 2.0, 2.0, 2.0, 2.0, 3.0, 1.0, 1.0, 1.0, 3.0, 2.0, 4.0, 3.0, 2.0, 2.0, 3.0, 2.0, 3.0, 1.0, 3.0, 2.0, 4.0, 3.0, 2.0, 3.0, 3.0, 2.0, 3.0, 3.0, 2.0, 3.0, 3.0, 1.0, 2.0, 2.0, 2.0, 1.0, 3.0, 3.0, 3.0, 1.0, 3.0, 1.0, 1.0, 2.0, 3.0, 3.0, 2.0, 1.0, 4.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0, 2.0, 1.0, 3.0, 2.0, 1.0, 2.0, 3.0, 4.0, 3.0, 3.0, 2.0, 3.0, 4.0, 2.0, 1.0, 2.0, 4.0, 2.0, 2.0, 2.0, 3.0, 3.0, 2.0, 2.0, 4.0, 3.0, 2.0, 2.0, 3.0, 2.0, 4.0, 3.0, 4.0, 3.0, 2.0, 3.0, 3.0, 4.0, 4.0, 3.0, 3.0, 4.0, 4.0, 2.0, 4.0, 2.0, 2.0, 4.0, 2.0, 2.0, 2.0, 3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 3.0, 3.0, 3.0, 2.0, 2.0, 4.0, 1.0, 2.0, 3.0, 1.0, 1.0, 4.0, 1.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 2.0, 3.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0,

In [42]:
df = pd.DataFrame
test_labels['price'] = y_pred

test_labels['price'] = test_labels['price'].map(lambda x: int(x))

In [43]:
test_labels

Unnamed: 0,id,price
0,7715,2
1,13196,2
2,13194,2
3,4673,2
4,11325,2
...,...,...
4144,12921,3
4145,7174,1
4146,9240,3
4147,11663,1


In [14]:
from sklearn.model_selection import GridSearchCV

kf = KFold(n_splits = 10, shuffle=True)


X = np.concatenate([X_train], axis= 0)
y = np.concatenate([y_train], axis= 0)
param_grid = {
    'n_estimators': [100, 400, 700, 1000],
    'colsample_bytree': [0.7, 0.8],
    'max_depth': [15,20,25],
    'num_leaves': [30, 50, 100, 200],
    'reg_alpha': [1.1, 1.2, 1.3],
    'reg_lambda': [1.1, 1.2, 1.3],
    'min_split_gain': [0.3, 0.4],
    'subsample': [0.7, 0.8, 0.9],
    'subsample_freq': [20]
}
lgb_estimator = lgb.LGBMClassifier(boosting_type='gbdt',  objective='multiclass', learning_rate=0.01, metric='multi_error')
gsearch = GridSearchCV(estimator=lgb_estimator, param_grid=param_grid, cv=kf, verbose=2)
lgb_model = gsearch.fit(X=X, y=y)


print(lgb_model.best_params_, lgb_model.best_score_)


Fitting 10 folds for each of 5184 candidates, totalling 51840 fits
[CV] colsample_bytree=0.7, max_depth=15, min_split_gain=0.3, n_estimators=100, num_leaves=30, reg_alpha=1.1, reg_lambda=1.1, subsample=0.7, subsample_freq=20 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.7, max_depth=15, min_split_gain=0.3, n_estimators=100, num_leaves=30, reg_alpha=1.1, reg_lambda=1.1, subsample=0.7, subsample_freq=20, total=   0.6s
[CV] colsample_bytree=0.7, max_depth=15, min_split_gain=0.3, n_estimators=100, num_leaves=30, reg_alpha=1.1, reg_lambda=1.1, subsample=0.7, subsample_freq=20 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV]  colsample_bytree=0.7, max_depth=15, min_split_gain=0.3, n_estimators=100, num_leaves=30, reg_alpha=1.1, reg_lambda=1.1, subsample=0.7, subsample_freq=20, total=   0.7s
[CV] colsample_bytree=0.7, max_depth=15, min_split_gain=0.3, n_estimators=100, num_leaves=30, reg_alpha=1.1, reg_lambda=1.1, subsample=0.7, subsample_freq=20 
[CV]  colsample_bytree=0.7, max_depth=15, min_split_gain=0.3, n_estimators=100, num_leaves=30, reg_alpha=1.1, reg_lambda=1.1, subsample=0.7, subsample_freq=20, total=   0.7s
[CV] colsample_bytree=0.7, max_depth=15, min_split_gain=0.3, n_estimators=100, num_leaves=30, reg_alpha=1.1, reg_lambda=1.1, subsample=0.7, subsample_freq=20 
[CV]  colsample_bytree=0.7, max_depth=15, min_split_gain=0.3, n_estimators=100, num_leaves=30, reg_alpha=1.1, reg_lambda=1.1, subsample=0.7, subsample_freq=20, total=   0.6s
[CV] colsample_bytree=0.7, max_depth=15, min_split_gain=0.3, n_estimators=100, num_leaves=30, reg_alpha=1.1, reg_lambda=1.1, subsample=0.7, subsample_freq=20 
[

KeyboardInterrupt: 