In [1]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import lightgbm as lgb

In [2]:
train_data = pd.read_csv('./train_noclean.csv')
test_data = pd.read_csv('./test_noclean.csv')

y_train = train_data.loc[:,'price']
train_data = train_data.drop(columns='price')
num_features = len(train_data.columns)
features = list(train_data.columns[2:num_features])
X_train = train_data.loc[:,features]

features = list(test_data.columns[2:num_features])

X_test = test_data.loc[:,features]
test_labels = test_data.loc[:,'id']

test_labels = test_labels.to_frame()

In [3]:
def run_model(X_train, y_train, learn, max_depth, num_class, num_leaves, min_data_in_leaf, rounds):
    params={}
    params['learning_rate']=0.03
    params['boosting_type']='gbdt' #GradientBoostingDecisionTree
    params['objective']='multiclass' #Multi-class target feature
    params['metric']='multi_error' #metric for multi-class
    params['max_depth']=max_depth
    params['num_class']=num_class
    params['num_leaves']=num_leaves
    clf=lgb.train(params,d_train,rounds)

In [18]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

kf = KFold(n_splits = 10, shuffle=True)


X = np.concatenate([X_train], axis= 0)
y = np.concatenate([y_train], axis= 0)
param_grid = {
    'n_estimators': [100, 400, 700, 1000, 1500],
    'colsample_bytree': [0.7, 0.8],
    'max_depth': [15,20,25],
    'num_leaves': [10, 20, 30, 40, 50],
    'reg_lambda': [1.1, 1.2, 1.3],
    'min_split_gain': [0.3, 0.4],
    'subsample': [0.7, 0.8, 0.9],
    'subsample_freq': [20], 
    'min_data_in_leaf': [15, 20, 25]
}
lgb_estimator = lgb.LGBMClassifier(boosting_type='gbdt',  objective='multiclass', learning_rate=0.01, metric='multi_error')
rsearch = RandomizedSearchCV(estimator=lgb_estimator, param_distributions=param_grid, cv=kf, n_iter = 100, n_jobs = -1, verbose=10)
lgb_model = rsearch.fit(X=X, y=y)


print(lgb_model.best_params_, lgb_model.best_score_)


Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   15.3s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:   24.3s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   30.9s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:   36.8s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   39.9s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   46.9s
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:   52.3s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 233 tasks      | elapsed:  

{'subsample_freq': 20, 'subsample': 0.8, 'reg_lambda': 1.2, 'num_leaves': 40, 'n_estimators': 1000, 'min_split_gain': 0.4, 'min_data_in_leaf': 25, 'max_depth': 20, 'colsample_bytree': 0.7} 0.5584111591570078


In [9]:
X = np.concatenate([X_train], axis= 0)
y = np.concatenate([y_train], axis= 0)

best_param = {'boosting_type':'gbdt',  'objective':'multiclass', 'learning_rate':0.01, 'num_class':5, 'metric':'multi_error', 'subsample_freq': 20, 'subsample': 0.9, 'reg_lambda': 1.3, 'reg_alpha': 1.1, 'num_leaves': 50, 'n_estimators': 1000, 'min_split_gain': 0.4, 'max_depth': 20, 'colsample_bytree': 0.7}
d_train=lgb.Dataset(X, y)
clf=lgb.train(best_param,d_train)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1523
[LightGBM] [Info] Number of data points in the train set: 9681, number of used features: 52
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -1.385985
[LightGBM] [Info] Start training from score -1.388052
[LightGBM] [Info] Start training from score -1.367568
[LightGBM] [Info] Start training from score -1.403905


In [10]:
y_pred=clf.predict(X_test)

In [11]:
print(y_pred)

[[8.35791440e-16 2.13886952e-01 4.99848330e-01 2.26964517e-01
  5.93002007e-02]
 [9.31471448e-16 7.08360458e-02 3.72230852e-01 4.57950512e-01
  9.89825902e-02]
 [9.10467265e-16 2.22740962e-01 4.84204210e-01 2.30319129e-01
  6.27356993e-02]
 ...
 [9.70021020e-16 7.01547876e-02 2.15128303e-01 4.66786774e-01
  2.47930136e-01]
 [4.56829401e-16 8.89914281e-01 5.73417468e-02 3.58941133e-02
  1.68498590e-02]
 [1.32202893e-15 1.84832347e-01 3.96798027e-01 2.74154632e-01
  1.44214994e-01]]


In [12]:
y_pred = [np.argmax(line) for line in y_pred]

In [13]:
print(y_pred)

[2, 3, 2, 2, 2, 4, 4, 1, 3, 2, 1, 1, 1, 3, 1, 1, 1, 2, 2, 4, 3, 2, 3, 4, 2, 4, 4, 4, 1, 4, 3, 3, 1, 1, 3, 4, 4, 1, 1, 1, 4, 4, 4, 1, 4, 4, 4, 1, 2, 2, 2, 3, 1, 1, 1, 3, 1, 4, 2, 2, 2, 4, 1, 3, 1, 4, 1, 4, 4, 3, 4, 3, 2, 3, 3, 2, 4, 3, 1, 1, 2, 3, 1, 3, 2, 4, 1, 3, 1, 1, 2, 4, 3, 2, 1, 4, 4, 2, 3, 2, 1, 1, 2, 4, 1, 1, 1, 3, 1, 1, 3, 1, 4, 3, 3, 3, 4, 4, 3, 1, 1, 4, 2, 1, 2, 3, 3, 1, 1, 4, 3, 2, 3, 3, 1, 4, 3, 4, 4, 2, 4, 3, 4, 4, 4, 3, 4, 4, 2, 4, 1, 1, 4, 2, 3, 2, 3, 3, 4, 1, 3, 2, 3, 4, 4, 2, 2, 4, 1, 1, 4, 1, 1, 4, 1, 2, 2, 4, 4, 3, 2, 4, 4, 2, 3, 3, 1, 4, 2, 3, 3, 2, 1, 3, 1, 1, 3, 1, 1, 1, 4, 3, 1, 1, 2, 3, 2, 3, 4, 3, 2, 1, 2, 2, 4, 3, 3, 3, 3, 4, 2, 4, 2, 1, 1, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 4, 2, 3, 4, 1, 2, 1, 3, 2, 4, 2, 1, 2, 4, 4, 2, 1, 1, 1, 4, 1, 2, 2, 2, 4, 3, 2, 3, 1, 2, 1, 1, 1, 1, 2, 3, 4, 1, 1, 3, 3, 1, 1, 3, 3, 2, 2, 3, 4, 2, 2, 3, 3, 2, 3, 2, 3, 1, 2, 3, 1, 3, 2, 1, 4, 4, 3, 4, 1, 2, 3, 1, 1, 3, 3, 4, 2, 4, 1, 3, 4, 4, 1, 3, 4, 1, 2, 2, 2, 1, 3, 1, 1, 4, 2, 4, 3, 3, 

In [14]:
df = pd.DataFrame
test_labels['price'] = y_pred

test_labels['price'] = test_labels['price'].map(lambda x: int(x))

In [15]:
test_labels

Unnamed: 0,id,price
0,7715,2
1,13196,3
2,13194,2
3,4673,2
4,11325,2
...,...,...
4144,12921,2
4145,7174,1
4146,9240,3
4147,11663,1


In [16]:
test_labels.to_csv(r'./submission7.csv', index=False)

In [17]:
from sklearn.model_selection import GridSearchCV

kf = KFold(n_splits = 10, shuffle=True)


X = np.concatenate([X_train], axis= 0)
y = np.concatenate([y_train], axis= 0)
param_grid = {
    'n_estimators': [100, 400, 700, 1000],
    'colsample_bytree': [0.7, 0.8],
    'max_depth': [8,10,20],
    'num_leaves': [15, 100, 200],
    'reg_alpha': [1.1, 1.2, 1.3],
    'reg_lambda': [1.1, 1.2, 1.3],
    'min_split_gain': [0.3, 0.4],
    'subsample': [0.7, 0.8, 0.9],
    'subsample_freq': [20]
}
lgb_estimator = lgb.LGBMClassifier(boosting_type='gbdt',  objective='multiclass', learning_rate=0.01, metric='multi_error')
gsearch = GridSearchCV(estimator=lgb_estimator, param_grid=param_grid, cv=kf, verbose=2, n_jobs=-1)
lgb_model = gsearch.fit(X=X, y=y)


print(lgb_model.best_params_, lgb_model.best_score_)


Fitting 10 folds for each of 3888 candidates, totalling 38880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done 333 tasks      | elapsed:   36.2s
[Parallel(n_jobs=-1)]: Done 616 tasks      | elapsed:  1.3min


KeyboardInterrupt: 