In [2]:
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import train_test_split

In [3]:
# Locations of the training and test CSV files. Both are relative paths, so they
# resolve against the working directory the notebook kernel was started in.
train_path = '../DATA/train_normalized_4labels.csv'
test_path = '../DATA/test_normalized_4labels.csv'

In [4]:
# Read the whole training CSV into memory; index_col=0 makes the file's first
# column the DataFrame index instead of a regular column.
raw_data = pd.read_csv(train_path,index_col=0)

  mask |= (ar1 == a)


In [5]:
# Build a class-balanced training frame by drawing the same number of rows from
# every CASE_STATUS value. According to the class counts recorded below, n_sample
# equals the size of the smallest class (DENIED).
# 'CERTIFIED': 1502699, 'CERTIFIED-WITHDRAWN': 122582, 'WITHDRAWN': 54542, 'DENIED': 24748
n_sample = 24748
# The per-class samples are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated frame
# on every loop iteration. A fixed random_state makes the draw repeatable after a
# kernel restart, matching the seeded train/validation split used later.
data = pd.concat([
    raw_data[raw_data['CASE_STATUS'] == c].sample(n_sample, random_state=42)
    for c in ['CERTIFIED', 'CERTIFIED-WITHDRAWN', 'DENIED', 'WITHDRAWN']
])

In [6]:
# Feature matrix: every column except the employer name and the target column.
X = data.drop(columns=['EMPLOYER_NAME', 'CASE_STATUS']).values
# Target: CASE_STATUS strings encoded as the integer ids defined in T.
T = {'CERTIFIED': 0, 'CERTIFIED-WITHDRAWN': 1, 'DENIED': 2, 'WITHDRAWN': 3}
y = [T[status] for status in data['CASE_STATUS'].values]

In [7]:
# Hold out 5% of the rows as a validation set. The fixed random_state makes the
# split repeatable for identical X and y.
X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.05, random_state=42)

In [8]:
# Wrap the training and validation arrays (with their integer labels) in
# lightgbm.Dataset objects, the input type handed to lightgbm.train below.
train_data = lightgbm.Dataset(X_train, label=y_train)
valid_data = lightgbm.Dataset(X_valid, label=y_valid)

In [9]:
# Training configuration handed to lightgbm.train: a 4-class 'multiclass'
# objective evaluated with the 'multi_error' metric. Keys are written as keyword
# arguments; the resulting dict has the same keys, values and order as a literal.
parameters = dict(
    objective='multiclass',
    metric='multi_error',
    is_unbalance='false',
    boosting='gbdt',
    num_leaves=31,
    feature_fraction=0.5,
    bagging_fraction=0.5,
    bagging_freq=20,
    learning_rate=0.05,
    verbose=0,
    num_class=4,
    deterministic='true',
)

In [10]:
# Train for at most 5000 boosting rounds, monitoring the metric on the validation
# set and stopping once it has not improved for 100 consecutive rounds.
# Early stopping is requested through the callback API: the `early_stopping_rounds`
# keyword was removed from lightgbm.train in LightGBM 4.0, whereas the
# lightgbm.early_stopping callback is accepted by both older and newer releases.
model = lightgbm.train(parameters,
                       train_data,
                       num_boost_round=5000,
                       valid_sets=[valid_data],
                       callbacks=[lightgbm.early_stopping(stopping_rounds=100)])

You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's multi_error: 0.195758
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_error: 0.179798
[3]	valid_0's multi_error: 0.153535
[4]	valid_0's multi_error: 0.14
[5]	valid_0's multi_error: 0.157778
[6]	valid_0's multi_error: 0.161818
[7]	valid_0's multi_error: 0.159798
[8]	valid_0's multi_error: 0.144242
[9]	valid_0's multi_error: 0.131515
[10]	valid_0's multi_error: 0.131111
[11]	valid_0's multi_error: 0.135758
[12]	valid_0's multi_error: 0.133535
[13]	valid_0's multi_error: 0.130101
[14]	valid_0's multi_error: 0.124444
[15]	valid_0's multi_error: 0.124646
[16]	valid_0's multi_error: 0.125657
[17]	valid_0's multi_error: 0.122828
[18]	valid_0's multi_error: 0.122424
[19]	valid_0's multi_error: 0.122626
[20]	valid_0's multi_error: 0.122828
[21]	valid_0's multi_error: 0.122828
[22]	valid_0's multi_error: 0.12101
[23]	valid_0's multi_error: 0.119192
[24]	valid_0's multi_error: 0.120404
[25]	v

In [None]:
def calculate_acc(y_test, y_pred, n_classes=4):
    """Compute the per-class accuracy (recall) of a set of predictions.

    Parameters
    ----------
    y_test : iterable of int
        True class ids; each must be a valid index in ``range(n_classes)``.
    y_pred : iterable of int
        Predicted class ids, paired element-wise with ``y_test``.
    n_classes : int, default 4
        Number of classes to report on.

    Returns
    -------
    list of float
        ``result[k]`` is the fraction of samples whose true class is ``k`` that
        were predicted as ``k``. A class that never occurs in ``y_test`` yields
        ``nan`` instead of raising ``ZeroDivisionError``.
    """
    totals = [0] * n_classes   # samples whose true class is k
    correct = [0] * n_classes  # of those, how many were predicted as k
    for truth, guess in zip(y_test, y_pred):
        totals[truth] += 1
        if truth == guess:
            correct[truth] += 1
    # Guard the division: a class absent from y_test has no defined accuracy.
    return [
        hits / seen if seen else float('nan')
        for hits, seen in zip(correct, totals)
    ]

In [None]:
# Predicted class for each training row: the position of the largest value along
# axis 1 of the model's prediction output.
train_scores = model.predict(X_train)
y_pred = train_scores.argmax(axis=1)
status_names = ['CERTIFIED', 'CERTIFIED-WITHDRAWN', 'DENIED', 'WITHDRAWN']
train_acc = calculate_acc(y_train, y_pred)
print("Accuracy on training set:\n", list(zip(status_names, train_acc)))

Accuracy on training set:
 [('CERTIFIED', 0.9269186517236971), ('CERTIFIED-WITHDRAWN', 0.9612376742604556), ('DENIED', 0.8675082887018618), ('WITHDRAWN', 0.8386123038986437)]


In [None]:
# Same per-class accuracy report, this time on the held-out validation rows.
valid_scores = model.predict(X_valid)
y_pred = valid_scores.argmax(axis=1)
status_names = ['CERTIFIED', 'CERTIFIED-WITHDRAWN', 'DENIED', 'WITHDRAWN']
valid_acc = calculate_acc(y_valid, y_pred)
print("Accuracy on validation set:\n", list(zip(status_names, valid_acc)))

Accuracy on validation set:
 [('CERTIFIED', 0.9195940671350508), ('CERTIFIED-WITHDRAWN', 0.9573770491803278), ('DENIED', 0.8633387888707038), ('WITHDRAWN', 0.8386308068459658)]


In [15]:
# Load the test CSV and split it into features and integer-encoded labels, using
# the same column drops and label ids as for the training data.
data = pd.read_csv(test_path, index_col=0)
T = {'CERTIFIED': 0, 'CERTIFIED-WITHDRAWN': 1, 'DENIED': 2, 'WITHDRAWN': 3}
X_test = data.drop(columns=['EMPLOYER_NAME', 'CASE_STATUS']).values
y_test = [T[status] for status in data['CASE_STATUS'].values]
# Drop the name binding to the test frame now that the arrays are extracted.
del data

In [16]:
# Per-class accuracy report on the test rows.
test_scores = model.predict(X_test)
y_pred = test_scores.argmax(axis=1)
status_names = ['CERTIFIED', 'CERTIFIED-WITHDRAWN', 'DENIED', 'WITHDRAWN']
test_acc = calculate_acc(y_test, y_pred)
print("Accuracy on test set:\n", list(zip(status_names, test_acc)))

Accuracy on test set:
 [('CERTIFIED', 0.9170873349223232), ('CERTIFIED-WITHDRAWN', 0.9511754068716094), ('DENIED', 0.8527248727135583), ('WITHDRAWN', 0.8266096256684492)]


In [18]:
# Exploratory cell: list every attribute name available on the object returned
# by lightgbm.train (includes private and name-mangled members).
dir(model)

['_Booster__attr',
 '_Booster__boost',
 '_Booster__get_eval_info',
 '_Booster__higher_better_inner_eval',
 '_Booster__init_predictor',
 '_Booster__inner_eval',
 '_Booster__inner_predict',
 '_Booster__inner_predict_buffer',
 '_Booster__is_predicted_cur_iter',
 '_Booster__name_inner_eval',
 '_Booster__need_reload_eval_info',
 '_Booster__num_class',
 '_Booster__num_dataset',
 '_Booster__num_inner_eval',
 '_Booster__set_objective_to_none',
 '__class__',
 '__copy__',
 '__deepcopy__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_free_buffer',
 '_to_predictor',
 '_train_data_name',
 'add_valid',
 'attr',
 'best_iteration',
 'best_score',
 

4