In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

In [2]:
# Read the train and test tables from CSV files in the working directory
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Merge test/train datasets into a single frame and separate the label column.
# The label is selected (not popped) and dropped on a copy, so train_df is left
# untouched and this cell can be re-run without raising KeyError.
target = train_df['target']
len_train = len(train_df)
# Train rows come first, so rows [0, len_train) are train and the rest are test.
# Original row labels are kept, so index values repeat across the two parts;
# use positional (.iloc) access when splitting merged_df.
merged_df = pd.concat([train_df.drop(columns='target'), test_df])
# ID_code is still a column of merged_df here; it is split off in a later cell.

In [5]:
# Detach the ID column so that only feature columns remain in merged_df, and
# keep the test-set IDs (the rows after the first len_train) under the name ID
# for the submission file.
# NOTE: pop() mutates merged_df, so this cell raises KeyError if it is re-run
# without first re-running the cell that builds merged_df.
ID = merged_df.pop('ID_code').iloc[len_train:]
ID

0              test_0
1              test_1
2              test_2
3              test_3
4              test_4
5              test_5
6              test_6
7              test_7
8              test_8
9              test_9
10            test_10
11            test_11
12            test_12
13            test_13
14            test_14
15            test_15
16            test_16
17            test_17
18            test_18
19            test_19
20            test_20
21            test_21
22            test_22
23            test_23
24            test_24
25            test_25
26            test_26
27            test_27
28            test_28
29            test_29
             ...     
199970    test_199970
199971    test_199971
199972    test_199972
199973    test_199973
199974    test_199974
199975    test_199975
199976    test_199976
199977    test_199977
199978    test_199978
199979    test_199979
199980    test_199980
199981    test_199981
199982    test_199982
199983    test_199983
199984    

In [None]:
# Use lightgbm for prediction.
# Assume all features are independent, so fit a model to one feature at a time.
# The final prediction is then a product of all single-feature predictions.
# Since each model sees only one feature, do not use CV - just use a fixed
# number of boosting iterations.
params = {
    'task': 'train', 'max_depth': 1, 'boosting_type': 'gbdt',
    'objective': 'binary', 'num_leaves': 3, 'learning_rate': 0.1,
    'feature_fraction': 0.9, 'bagging_fraction': 0.8, 'bagging_freq': 5,
    'lambda_l1': 1, 'lambda_l2': 60, 'verbose': -99
}
NUM_BOOST_ROUND = 45  # fixed number of boosting iterations for every per-feature model
num_runs = merged_df.shape[1]  # one model per column of merged_df
# Row i holds the test-set predictions of the model trained on column i.
sub_preds = np.zeros([num_runs, merged_df.shape[0]-len_train])
for run in range(num_runs): # loop over all features
    # run:run+1 keeps the selection two-dimensional (a single-column frame).
    lgb_train = lgb.Dataset(merged_df.iloc[:len_train, run:run+1], target)
    # No validation set is passed, so there is nothing to log during training;
    # the former verbose_eval argument is omitted (it is not accepted by
    # LightGBM >= 4.0).
    gbm = lgb.train(params, lgb_train, NUM_BOOST_ROUND)
    sub_preds[run, :] = gbm.predict(merged_df.iloc[len_train:, run:run+1], num_iteration=gbm.best_iteration)

In [None]:
# Scale each per-feature prediction by 10 before multiplying, to avoid very
# small numbers in the product (the factor presumably approximates the inverse
# of the average target - TODO confirm against target.mean()).
# Then multiply the scaled predictions over all features (axis 0 = one row per
# feature) and write the submission file.
sub_preds2 = (10 * sub_preds).prod(axis=0)
# Take the IDs straight from test_df: it still holds ID_code and its row order
# matches the test part of merged_df, so no separately saved ID variable is needed.
out_df = pd.DataFrame({'ID_code': test_df['ID_code'], 'target': sub_preds2.astype('float32')})
out_df.to_csv('sub1f.csv', index=False)