In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [28]:
# load or create your dataset
print('Load data...')
df = pd.read_csv('../input/PASEC2015Cleaned.csv', low_memory=False);

Load data...


In [29]:
df.replace('\s+', '',regex=True,inplace=True)

In [30]:
for column in df.columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

In [31]:
for column in df.columns:
    df[column].fillna((df[column].mean()), inplace=True)

In [32]:
Y = df['MATHS_PV1']
X = df[df.columns.difference(['LECT_PV1', 'MATHS_PV1'])]

In [33]:
X_train, X_test, y_train, y_test = train_test_split( X, Y, shuffle=True)

In [37]:
# train
'''
gbm = lgb.LGBMRegressor(objective='regression',
                        num_leaves=255,
                        learning_rate=0.05,
                        n_estimators=20)
'''

gbm = lgb.LGBMRegressor(objective='regression',
                        num_leaves=255,
                        num_trees = 500,
                        min_sum_hessian_in_leaf = 100,
                        learning_rate=0.05)

In [38]:
print('Start training...')
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l2',
        early_stopping_rounds=6)

Start training...
[1]	valid_0's l2: 6218.26
Training until validation scores don't improve for 6 rounds.
[2]	valid_0's l2: 5924.98
[3]	valid_0's l2: 5664.62
[4]	valid_0's l2: 5426.91
[5]	valid_0's l2: 5199.06
[6]	valid_0's l2: 5009.25
[7]	valid_0's l2: 4822.64
[8]	valid_0's l2: 4662.65
[9]	valid_0's l2: 4516.85




[10]	valid_0's l2: 4380.28
[11]	valid_0's l2: 4245.23
[12]	valid_0's l2: 4134.23
[13]	valid_0's l2: 4018.64
[14]	valid_0's l2: 3920.07
[15]	valid_0's l2: 3833.03
[16]	valid_0's l2: 3747.66
[17]	valid_0's l2: 3671.4
[18]	valid_0's l2: 3604.07
[19]	valid_0's l2: 3538.59
[20]	valid_0's l2: 3484.73
[21]	valid_0's l2: 3421.48
[22]	valid_0's l2: 3371.69
[23]	valid_0's l2: 3319.9
[24]	valid_0's l2: 3274.37
[25]	valid_0's l2: 3227.24
[26]	valid_0's l2: 3188.47
[27]	valid_0's l2: 3158.19
[28]	valid_0's l2: 3124.88
[29]	valid_0's l2: 3095.56
[30]	valid_0's l2: 3064.24
[31]	valid_0's l2: 3036.82
[32]	valid_0's l2: 3003.56
[33]	valid_0's l2: 2981.32
[34]	valid_0's l2: 2960.82
[35]	valid_0's l2: 2935.86
[36]	valid_0's l2: 2924.21
[37]	valid_0's l2: 2906.54
[38]	valid_0's l2: 2897.76
[39]	valid_0's l2: 2877.7
[40]	valid_0's l2: 2865.32
[41]	valid_0's l2: 2850.26
[42]	valid_0's l2: 2840.24
[43]	valid_0's l2: 2828.56
[44]	valid_0's l2: 2816.43
[45]	valid_0's l2: 2804.42
[46]	valid_0's l2: 2792.01
[47]

LGBMRegressor(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.05,
       max_bin=255, max_depth=-1, min_child_samples=10, min_child_weight=5,
       min_split_gain=0.0, min_sum_hessian_in_leaf=100, n_estimators=10,
       n_jobs=-1, num_leaves=255, num_trees=500, objective='regression',
       random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=50000, subsample_freq=1)

In [40]:
print('Start predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
print('The RMSE of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

# feature importances
#print('Feature importances:', list(gbm.feature_importances_))

Start predicting...
The RMSE of prediction is: 48.492706617


In [41]:
# Finding the the best parameters
estimator = lgb.LGBMRegressor()

param_grid = {
    'learning_rate': [0.01, 0.1, 0.05],
    'n_estimators': [20, 40, 50, 100],
    'num_leaves': [100, 200, 1250]
}

gbm = GridSearchCV(estimator, param_grid)

gbm.fit(X_train, y_train)

print('Best parameters found by grid search are:', gbm.best_params_)

Best parameters found by grid search are: {'learning_rate': 0.05, 'n_estimators': 50, 'num_leaves': 100}
