In [1]:
# ========================================
#               LOAD DATA
# ========================================
import pandas as pd
import lightgbm as lgb

# Path of the CSV file to load.
DATA_PATH = "/users/facsupport/asharma/Data/Preprocessed/tmp/ONE.csv"
# Numeric jobTitle codes considered "interesting" (TODO - use strings instead)
INTERESTING_JOB_TITLES = [33, 34, 35, 11, 12, 3, 5, 16, 17]

df = pd.read_csv(DATA_PATH)
# Boolean mask: True for rows whose jobTitle is one of the codes above.
is_interesting_job = df['jobTitle'].isin(INTERESTING_JOB_TITLES)
df = df[is_interesting_job]

In [2]:
# ========================================
#          SPLIT & PREP DATAFRAME
# ========================================

# Remove columns that must not be used as features:
#   - 'Unnamed: 0': unnamed column (TODO - export csv without this in the first place)
#   - 'f_89': 90th day of facility data
# `drop(..., errors='ignore')` replaces the former `del df[...]` statements so
# the cell can be re-executed: `del` raised KeyError on a second run because
# the columns had already been removed from `df`.
df = df.drop(columns=['Unnamed: 0', 'f_89'], errors='ignore')

# 't_89' is the prediction target; every other remaining column is an input.
inputs = df.drop(['t_89'], axis=1)
labels = df.filter(['t_89'])  # one-column DataFrame (not a Series)

# Weights to split data set
TRAINING_WEIGHT = 0.7
VALIDATION_WEIGHT = 0.2
TEST_WEIGHT = 0.1

# Convert the relative weights into positional cut points. Dividing by the
# sum means the weights do not have to add up to exactly 1.
n = len(df)
weights_sum = TRAINING_WEIGHT + VALIDATION_WEIGHT + TEST_WEIGHT
split1 = int(TRAINING_WEIGHT / weights_sum * n)
split2 = int((TRAINING_WEIGHT + VALIDATION_WEIGHT) / weights_sum * n)

# Positional, contiguous slices in the current row order (no shuffling).
# NOTE(review): if the CSV rows are sorted in some meaningful way the three
# subsets may not be comparable - confirm whether a shuffle is wanted.
train_inputs, train_labels = inputs[:split1], labels[:split1]
val_inputs, val_labels = inputs[split1:split2], labels[split1:split2]
test_inputs, test_labels = inputs[split2:], labels[split2:]

print(train_inputs)
print(train_labels)

       t_0    t_1   t_2   t_3   t_4   t_5  t_6   t_7   t_8   t_9  ...  \
4      0.0   7.50  7.75  7.75  0.00  0.00  7.5  7.75  7.50  7.75  ...   
5      0.0   8.25  8.25  0.00  0.00  0.00  0.0  0.00  8.25  8.25  ...   
6      8.3  10.53  9.14  0.00  0.00  0.00  0.0  0.00  0.00  0.00  ...   
7      0.0   7.71  7.63  7.65  7.85  0.00  0.0  7.70  7.78  5.35  ...   
8      0.0   0.00  8.25  8.25  8.25  8.25  0.0  8.00  8.00  0.00  ...   
...    ...    ...   ...   ...   ...   ...  ...   ...   ...   ...  ...   
45458  0.0   0.00  0.00  0.00  0.00  0.00  0.0  0.00  0.00  0.00  ...   
45459  0.0   0.00  0.00  0.00  0.00  0.00  0.0  0.00  0.00  0.00  ...   
45464  0.0   0.00  0.00  0.00  0.00  0.00  0.0  0.00  0.00  0.00  ...   
45474  0.0   0.00  0.00  0.00  0.00  0.00  0.0  0.00  0.00  0.00  ...   
45480  0.0   0.00  0.00  0.00  0.00  0.00  0.0  0.00  0.00  0.00  ...   

       providerId  payType  dayOfWeek       Mon       Tue       Wed       Thu  \
4             5.0      3.0        0.0  4.1

In [3]:
# ========================================
#          DATAFRAME TO LGB DS
# ========================================

# Columns passed to LightGBM as categorical features.
cats = ['jobTitle', 'providerId', 'payType', 'dayOfWeek']


def _as_lgb_dataset(features, target):
    """Build an ``lgb.Dataset`` from a feature frame and its label frame,
    declaring the columns listed in ``cats`` as categorical."""
    return lgb.Dataset(features, label=target, categorical_feature=cats)


# One Dataset per split, each built with identical settings.
train_data = _as_lgb_dataset(train_inputs, train_labels)
val_data = _as_lgb_dataset(val_inputs, val_labels)
test_data = _as_lgb_dataset(test_inputs, test_labels)
print(train_data)

<lightgbm.basic.Dataset object at 0x7f69fc447a90>


In [4]:
# ========================================
#            TRAIN WITH LGB
# ========================================
# Booster parameters. 'mse' is reported by LightGBM under the name 'l2' in the
# training log. No 'objective' is given, so the library default applies.
param = {
  'num_leaves': 100,
  'learning_rate': 0.1,
  'metric': 'mse',
}

# Filled in during training with the per-iteration metric values of every
# validation set.
evals_result = {}

# The history is recorded through the `record_evaluation` callback rather than
# the `evals_result=` keyword of `lgb.train`: that keyword was deprecated in
# LightGBM 3.3 and removed in 4.0, whereas the callback works on both.
bst = lgb.train(
    param,
    train_data,
    valid_sets=[val_data],
    callbacks=[lgb.record_evaluation(evals_result)],
)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 46282
[LightGBM] [Info] Number of data points in the train set: 25493, number of used features: 188
[LightGBM] [Info] Start training from score 3.026717




[1]	valid_0's l2: 18.5944
[2]	valid_0's l2: 16.7191
[3]	valid_0's l2: 15.1697
[4]	valid_0's l2: 13.8653
[5]	valid_0's l2: 12.7839
[6]	valid_0's l2: 11.9037
[7]	valid_0's l2: 11.1485
[8]	valid_0's l2: 10.5484
[9]	valid_0's l2: 10.029
[10]	valid_0's l2: 9.58806
[11]	valid_0's l2: 9.21984
[12]	valid_0's l2: 8.91716
[13]	valid_0's l2: 8.68426
[14]	valid_0's l2: 8.47358
[15]	valid_0's l2: 8.23474
[16]	valid_0's l2: 8.08451
[17]	valid_0's l2: 7.92775
[18]	valid_0's l2: 7.75844
[19]	valid_0's l2: 7.65878
[20]	valid_0's l2: 7.4702
[21]	valid_0's l2: 7.37716
[22]	valid_0's l2: 7.32795
[23]	valid_0's l2: 7.24396
[24]	valid_0's l2: 7.2005
[25]	valid_0's l2: 7.13293
[26]	valid_0's l2: 7.08494
[27]	valid_0's l2: 7.04942
[28]	valid_0's l2: 7.01523
[29]	valid_0's l2: 6.94072
[30]	valid_0's l2: 6.85554
[31]	valid_0's l2: 6.75212
[32]	valid_0's l2: 6.71912
[33]	valid_0's l2: 6.68536
[34]	valid_0's l2: 6.66022
[35]	valid_0's l2: 6.59226
[36]	valid_0's l2: 6.55809
[37]	valid_0's l2: 6.53253
[38]	valid_0'