In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('./data/final_train.csv.zip', compression='zip')
test = pd.read_csv('./data/final_test.csv.zip', compression='zip')
target = np.load('./data/target.npy')
submission_ids = np.load('./data/submission_ids.npy')

In [10]:
tra_arr = train.values

In [5]:
target = np.log1p(target)
X_train, X_valid, Y_train, Y_valid = train_test_split(train, target, test_size=.15)


train_lgb = lgb.Dataset(data=X_train, label=Y_train)
valid_lgb = lgb.Dataset(data=X_valid, label=Y_valid)

In [16]:
evaluation_results = {}

params = {
    'objective': 'regression_l2',
    'metric': 'rmse',
    'num_leaves': 40,
    'learning_rate': 0.005,
    'n_estimators': 10000,
    'num_trees': 10000,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'feature_fraction': 0.7,
    'max_depth': 20,
    'max_bin': 1200,
    'verbosity': -1,
    'nthread': 8
    }
params2 = {'learning_rate': 0.02,
           'max_depth': 7, 
           'num_trees': 10000,
           'boosting': 'gbdt',
           'objective': 'regression', 
           'metric': 'rmse',
           'is_training_metric': True}

model = lgb.train(
    params2,
    train_set=train_lgb,
    num_boost_round=1000,
    valid_sets=valid_lgb,
    early_stopping_rounds=3000,
    evals_result=evaluation_results,
    verbose_eval=200,
    )



Training until validation scores don't improve for 3000 rounds.
[200]	valid_0's rmse: 1.46138
[400]	valid_0's rmse: 1.46258
[600]	valid_0's rmse: 1.46447
[800]	valid_0's rmse: 1.46938
[1000]	valid_0's rmse: 1.47322
[1200]	valid_0's rmse: 1.47897
[1400]	valid_0's rmse: 1.48256
[1600]	valid_0's rmse: 1.48779
[1800]	valid_0's rmse: 1.49155
[2000]	valid_0's rmse: 1.49502
[2200]	valid_0's rmse: 1.49771
[2400]	valid_0's rmse: 1.49852
[2600]	valid_0's rmse: 1.50136
[2800]	valid_0's rmse: 1.50285
[3000]	valid_0's rmse: 1.50441
[3200]	valid_0's rmse: 1.50602
Early stopping, best iteration is:
[229]	valid_0's rmse: 1.46064


In [14]:
model_predictions_test = model.predict(test, num_iteration=model.best_iteration)
model_predictions_test

array([15.42172969, 14.42128566, 15.01805572, ..., 14.30287316,
       11.97347652, 13.83798278])

In [24]:
model_predictions_test = np.expm1(model_predictions_test)
model_predictions_test

array([1502852.65510876, 1380918.27307585, 1779900.96795297, ...,
        775817.5862978 , 1399569.78257816, 1392610.00407033])

In [25]:
dataset_submission = pd.DataFrame()
dataset_submission['ID'] = submission_ids
dataset_submission['target'] = model_predictions_test
dataset_submission.to_csv('./data/submission.csv', index=False)

In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as rmse

In [18]:
lm = LinearRegression(n_jobs=8)


In [20]:
lm.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=8, normalize=False)

In [28]:
pr = lm.predict(X_valid)
lm.score(X_valid, Y_valid)

-0.013655981952804597

In [23]:
model_predictions_test = lm.predict(test)
model_predictions_test

array([14.2228763 , 14.13825998, 14.39206885, ..., 13.56167399,
       14.15167616, 14.14669096])

In [29]:
rmse(Y_valid, pr)

2.9737467340917365