In [1]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import lightgbm as lgb

In [2]:
train_data = pd.read_csv('./train_cleaned.csv')
test_data = pd.read_csv('./test_cleaned.csv')

y_train = train_data.loc[:,'price']
train_data = train_data.drop(columns='price')
num_features = len(train_data.columns)
features = list(train_data.columns[2:num_features])
X_train = train_data.loc[:,features]

features = list(test_data.columns[2:num_features])

X_test = test_data.loc[:,features]
test_labels = test_data.loc[:,'id']

test_labels = test_labels.to_frame()

In [3]:
def run_model(X_train, y_train, learn, max_depth, num_class, num_leaves, min_data_in_leaf, rounds):
    params={}
    params['learning_rate']=0.03
    params['boosting_type']='gbdt' #GradientBoostingDecisionTree
    params['objective']='multiclass' #Multi-class target feature
    params['metric']='multi_error' #metric for multi-class
    params['max_depth']=max_depth
    params['num_class']=num_class
    params['num_leaves']=num_leaves
    clf=lgb.train(params,d_train,rounds)

In [3]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

kf = KFold(n_splits = 10, shuffle=True)


X = np.concatenate([X_train], axis= 0)
y = np.concatenate([y_train], axis= 0)
param_grid = {
    'n_estimators': [100, 500, 1000, 1500, 2000],
    'colsample_bytree': [0.7, 0.8],
    'max_depth': [10, 15,20,25],
    'num_leaves': [10, 20, 30, 40, 50],
    'reg_lambda': [0.2, 0.5, 0.8, 1, 1.1, 1.2],
    'reg_alpha' : [0.2, 0.5, 0.8, 1, 1.1, 1.2], 
    'min_split_gain': [0.3, 0.4],
    'subsample': [0.7, 0.8, 0.9, 1],
    'subsample_freq': [20], 
    'min_data_in_leaf': [15, 25, 30, 40],
    'learning_rate': [0.005, 0.01, 0.05, 0.1]
}
lgb_estimator = lgb.LGBMClassifier(boosting_type='gbdt',  objective='multiclass', metric='multi_error')
rsearch = RandomizedSearchCV(estimator=lgb_estimator, param_distributions=param_grid, cv=kf, n_iter = 100, n_jobs = -1, verbose=10)
lgb_model = rsearch.fit(X=X, y=y)


print(lgb_model.best_params_, lgb_model.best_score_)


Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:   43.2s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   57.0s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 233 tasks      | elapsed:  

{'subsample_freq': 20, 'subsample': 1, 'reg_lambda': 0.2, 'reg_alpha': 1.1, 'num_leaves': 50, 'n_estimators': 1500, 'min_split_gain': 0.4, 'min_data_in_leaf': 15, 'max_depth': 15, 'learning_rate': 0.01, 'colsample_bytree': 0.7} 0.5422979087241682


In [7]:
print(lgb_model.best_params_, lgb_model.best_score_)
#RANDOM SEARCH WITH 1000 CANDIDATES

{'subsample_freq': 20, 'subsample': 0.9, 'reg_lambda': 0.5, 'reg_alpha': 0.5, 'num_leaves': 50, 'n_estimators': 1000, 'min_split_gain': 0.4, 'min_data_in_leaf': 15, 'max_depth': 25, 'learning_rate': 0.01, 'colsample_bytree': 0.8} 0.5570704227754608


In [5]:
print(lgb_model.best_params_, lgb_model.best_score_)
#THIS IS FOR THE RANDOM SEARCH WITH 100 CANDIATES

{'subsample_freq': 20, 'subsample': 0.9, 'reg_lambda': 1.1, 'reg_alpha': 0.8, 'num_leaves': 40, 'n_estimators': 1500, 'min_split_gain': 0.4, 'min_data_in_leaf': 15, 'max_depth': 20, 'learning_rate': 0.01, 'colsample_bytree': 0.7} 0.559755840135097


In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

kf = KFold(n_splits = 10, shuffle=True)


X = np.concatenate([X_train], axis= 0)
y = np.concatenate([y_train], axis= 0)
param_grid = {
    'n_estimators': [100, 500, 1000, 1500, 2000, 2500, 3000],
    'colsample_bytree': [0.7],
    'max_depth': [10, 15,20,25],
    'num_leaves': [ 30],
    'reg_lambda': [ 1.1, 1, 1.2],
    'reg_alpha' : [ 0.8], 
    'min_split_gain': [0.4],
    'subsample': [ 0.9, 1],
    'subsample_freq': [20], 
    'min_data_in_leaf': [15, 10, 20],
    'learning_rate': [0.01, 0.005]
}
lgb_estimator = lgb.LGBMClassifier(boosting_type='gbdt',  objective='multiclass', metric='multi_error')
rsearch = RandomizedSearchCV(estimator=lgb_estimator, param_distributions=param_grid, cv=kf, n_iter = 1000, n_jobs = -1, verbose=10)
lgb_model = rsearch.fit(X=X, y=y)


print(lgb_model.best_params_, lgb_model.best_score_)


In [None]:
print(lgb_model.best_params_, lgb_model.best_score_)


In [9]:
from sklearn.metrics import accuracy_score
import lightgbm as lgb
kf = KFold(n_splits = 10, shuffle=True)


X = np.concatenate([X_train], axis= 0)
y = np.concatenate([y_train], axis= 0)

for train, test in kf.split(X_train):
    X_train_kf, X_test_kf = X[train],X[test]
    y_train_kf, y_test_kf = y[train],y[test]
    d_train=lgb.Dataset(X_train_kf, label=y_train_kf)
    best_param = {'verbose':-1,'boosting_type':'gbdt',  'objective':'multiclass', 'num_class':5, 'metric':'multi_error','subsample_freq': 20, 'subsample': 0.9, 'reg_lambda': 1.1, 'reg_alpha': 0.8, 'num_leaves': 40, 'n_estimators': 1500, 'min_split_gain': 0.4, 'min_data_in_leaf': 15, 'max_depth': 20, 'learning_rate': 0.01, 'colsample_bytree': 0.7}
    clf=lgb.train(best_param,d_train )
    
    y_pred=clf.predict(X_test_kf)
    y_pred = [np.argmax(line) for line in y_pred]
    print(accuracy_score(y_test_kf, y_pred))



0.5748194014447885
0.5382231404958677
0.5547520661157025
0.5547520661157025
0.5537190082644629
0.5516528925619835
0.5692148760330579
0.5547520661157025
0.5351239669421488
0.5805785123966942


In [10]:
from sklearn.metrics import accuracy_score
import lightgbm as lgb
kf = KFold(n_splits = 10, shuffle=True)


X = np.concatenate([X_train], axis= 0)
y = np.concatenate([y_train], axis= 0)

for train, test in kf.split(X_train):
    X_train_kf, X_test_kf = X[train],X[test]
    y_train_kf, y_test_kf = y[train],y[test]
    d_train=lgb.Dataset(X_train_kf, label=y_train_kf)
    best_param = {'verbose':-1,'boosting_type':'gbdt',  'objective':'multiclass', 'num_class':5, 'metric':'multi_error','subsample_freq': 20, 'subsample': 0.9, 'reg_lambda': 0.5, 'reg_alpha': 0.5, 'num_leaves': 50, 'n_estimators': 1000, 'min_split_gain': 0.4, 'min_data_in_leaf': 15, 'max_depth': 25, 'learning_rate': 0.01, 'colsample_bytree': 0.8}
    clf=lgb.train(best_param,d_train )
    
    y_pred=clf.predict(X_test_kf)
    y_pred = [np.argmax(line) for line in y_pred]
    print(accuracy_score(y_test_kf, y_pred))

0.56656346749226
0.5402892561983471
0.5702479338842975
0.5557851239669421
0.5867768595041323
0.5733471074380165
0.5464876033057852
0.5175619834710744
0.5568181818181818
0.5568181818181818


In [None]:
from sklearn.metrics import accuracy_score
import lightgbm as lgb
kf = KFold(n_splits = 10, shuffle=True)


X = np.concatenate([X_train], axis= 0)
y = np.concatenate([y_train], axis= 0)

for train, test in kf.split(X_train):
    X_train_kf, X_test_kf = X[train],X[test]
    y_train_kf, y_test_kf = y[train],y[test]
    d_train=lgb.Dataset(X_train_kf, label=y_train_kf)
    best_param = {'verbose':-1,'boosting_type':'gbdt',  'objective':'multiclass', 'learning_rate':0.01, 'num_class':5, 'metric':'multi_error','subsample_freq': 20, 'subsample': 0.8, 'reg_lambda': 1.2, 'num_leaves': 40, 'n_estimators': 1000, 'min_split_gain': 0.4, 'min_data_in_leaf': 25, 'max_depth': 20, 'colsample_bytree': 0.7}
    clf=lgb.train(best_param,d_train )
    
    y_pred=clf.predict(X_test_kf)
    y_pred = [np.argmax(line) for line in y_pred]
    print(accuracy_score(y_test_kf, y_pred))

In [11]:
X = np.concatenate([X_train], axis= 0)
y = np.concatenate([y_train], axis= 0)
best_param = {'verbose':-1,'boosting_type':'gbdt',  'objective':'multiclass', 'num_class':5, 'metric':'multi_error','subsample_freq': 20, 'subsample': 0.9, 'reg_lambda': 1.1, 'reg_alpha': 0.8, 'num_leaves': 40, 'n_estimators': 1500, 'min_split_gain': 0.4, 'min_data_in_leaf': 15, 'max_depth': 20, 'learning_rate': 0.01, 'colsample_bytree': 0.7}

best_param1 = {'boosting_type':'gbdt',  'objective':'multiclass', 'learning_rate':0.01, 'num_class':5, 'metric':'multi_error','subsample_freq': 20, 'subsample': 0.8, 'reg_lambda': 1.2, 'num_leaves': 40, 'n_estimators': 1000, 'min_split_gain': 0.4, 'min_data_in_leaf': 25, 'max_depth': 20, 'colsample_bytree': 0.7}
d_train=lgb.Dataset(X, y)
clf=lgb.train(best_param,d_train)

In [12]:
y_pred1=clf.predict(X)
y_pred1 = [np.argmax(line) for line in y_pred1]
print(accuracy_score(y, y_pred1))

0.8132424336328892


In [13]:
y_pred=clf.predict(X_test)

In [14]:
print(y_pred)

[[9.69544159e-16 1.99476780e-01 5.00940067e-01 2.05434926e-01
  9.41482270e-02]
 [9.15032637e-16 6.63780722e-02 3.44014882e-01 4.85998252e-01
  1.03608794e-01]
 [9.45876002e-16 2.22367091e-01 4.66117303e-01 2.60199924e-01
  5.13156815e-02]
 ...
 [9.97185634e-16 8.29134740e-02 2.42564909e-01 4.12964599e-01
  2.61557017e-01]
 [4.13292006e-16 8.93739245e-01 6.12779253e-02 3.16651961e-02
  1.33176333e-02]
 [1.32725524e-15 2.09437907e-01 3.62622609e-01 3.03283919e-01
  1.24655565e-01]]


In [15]:
y_pred = [np.argmax(line) for line in y_pred]

In [16]:
print(y_pred)

[2, 3, 2, 2, 1, 4, 4, 1, 3, 4, 1, 1, 1, 3, 1, 1, 1, 2, 2, 4, 3, 2, 2, 4, 2, 4, 4, 4, 1, 4, 3, 3, 1, 1, 3, 4, 4, 1, 1, 1, 4, 4, 4, 1, 4, 4, 4, 1, 2, 2, 2, 3, 1, 1, 1, 4, 1, 4, 2, 2, 2, 4, 1, 3, 1, 4, 1, 4, 2, 3, 4, 3, 2, 3, 3, 2, 4, 3, 1, 1, 2, 3, 1, 3, 2, 4, 1, 3, 1, 1, 2, 2, 3, 2, 1, 4, 4, 2, 2, 2, 1, 1, 2, 4, 1, 1, 1, 3, 1, 1, 3, 1, 4, 3, 3, 3, 4, 4, 3, 1, 1, 4, 2, 1, 2, 3, 3, 1, 1, 4, 3, 3, 3, 3, 1, 4, 3, 4, 4, 2, 4, 3, 4, 4, 4, 3, 4, 4, 2, 4, 1, 1, 4, 2, 1, 2, 4, 3, 4, 1, 3, 2, 3, 4, 3, 3, 2, 4, 1, 2, 4, 1, 1, 4, 1, 2, 2, 4, 4, 3, 2, 4, 4, 2, 3, 3, 1, 3, 3, 3, 3, 2, 1, 3, 1, 1, 3, 1, 1, 1, 4, 3, 1, 1, 2, 3, 2, 3, 4, 3, 2, 1, 2, 2, 4, 3, 3, 3, 3, 4, 2, 4, 3, 1, 1, 3, 3, 4, 2, 4, 3, 4, 3, 3, 3, 4, 2, 3, 4, 1, 2, 1, 2, 2, 4, 2, 1, 2, 4, 4, 2, 1, 1, 1, 4, 1, 2, 2, 2, 4, 3, 2, 3, 1, 3, 1, 1, 1, 1, 2, 3, 4, 1, 1, 3, 3, 1, 1, 3, 3, 2, 2, 3, 3, 2, 4, 3, 3, 2, 2, 2, 3, 1, 2, 3, 1, 3, 2, 1, 4, 4, 3, 4, 1, 2, 3, 1, 1, 3, 3, 4, 2, 4, 1, 3, 4, 4, 1, 3, 4, 1, 2, 2, 2, 1, 3, 1, 1, 4, 2, 4, 3, 3, 

In [17]:
df = pd.DataFrame
test_labels['price'] = y_pred

test_labels['price'] = test_labels['price'].map(lambda x: int(x))

In [18]:
test_labels

Unnamed: 0,id,price
0,7715,2
1,13196,3
2,13194,2
3,4673,2
4,11325,1
...,...,...
4144,12921,4
4145,7174,1
4146,9240,3
4147,11663,1


In [19]:
test_labels.to_csv(r'./submission9.csv', index=False)

In [None]:
from sklearn.model_selection import GridSearchCV

kf = KFold(n_splits = 10, shuffle=True)


X = np.concatenate([X_train], axis= 0)
y = np.concatenate([y_train], axis= 0)
param_grid = {
    'n_estimators': [100, 400, 700, 1000],
    'colsample_bytree': [0.7, 0.8],
    'max_depth': [8,10,20],
    'num_leaves': [15, 100, 200],
    'reg_alpha': [1.1, 1.2, 1.3],
    'reg_lambda': [1.1, 1.2, 1.3],
    'min_split_gain': [0.3, 0.4],
    'subsample': [0.7, 0.8, 0.9],
    'subsample_freq': [20]
}
lgb_estimator = lgb.LGBMClassifier(boosting_type='gbdt',  objective='multiclass', learning_rate=0.01, metric='multi_error')
gsearch = GridSearchCV(estimator=lgb_estimator, param_grid=param_grid, cv=kf, verbose=2, n_jobs=-1)
lgb_model = gsearch.fit(X=X, y=y)


print(lgb_model.best_params_, lgb_model.best_score_)
