In [1]:
# coding: utf-8
# pylint: disable = invalid-name, C0111
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
 
# # load or create your dataset
# print('Load data...')
# df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
# df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')

# Load the under-sampled training table and replace the registration
# timestamp with its day-of-year so the column is numeric.
all_under_sample = pd.read_csv('under_sample_data_log.csv')
dt1 = pd.to_datetime(all_under_sample["register_time"])
all_under_sample["register_time"] = dt1.dt.dayofyear

# The first column is dropped; of what remains, the last column is the
# target and everything before it is a feature.
data = all_under_sample.iloc[:, 1:]
x_prime = data.iloc[:, :-1]
y = data.iloc[:, -1:]
x_prime_train, x_prime_test, y_train, y_test = train_test_split(
    x_prime, y, train_size=0.6, random_state=0
)

# Hand plain NumPy arrays to LightGBM.
X_train, X_test = x_prime_train.values, x_prime_test.values
y_train = y_train['prediction_pay_price'].values
y_test = y_test['prediction_pay_price'].values

print('Start training...')
# Fit the regressor, monitoring L1 error on the held-out split and stopping
# once it has not improved for 10 consecutive rounds.
# NOTE(review): LightGBM >= 4.0 no longer accepts `early_stopping_rounds` in
# fit(); switch to `callbacks=[lgb.early_stopping(10)]` when upgrading.
gbm = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=63,
    learning_rate=0.1,
    n_estimators=100,
)
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='l1',
    early_stopping_rounds=10,
)

print('Start predicting...')
# Predict with the best iteration found by early stopping and report RMSE
# on the held-out split.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('The rmse of prediction is:', rmse)

# Per-feature importances of the fitted model.
print('Feature importances:', list(gbm.feature_importances_))

# Hyper-parameter search through the scikit-learn interface: 3-fold CV over
# learning rate and number of trees, scored by negative MSE.
estimator = lgb.LGBMRegressor(num_leaves=31)

param_grid = {
    'learning_rate': [0.01, 0.1, 0.2, 1],
    'n_estimators': [50, 400],
}

gscv = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    cv=3,
    scoring="neg_mean_squared_error",
)
gscv.fit(X_train, y_train)

print('Best parameters found by grid search are:', gscv.best_params_)




Start training...
[1]	valid_0's l1: 0.827825
Train until valid scores didn't improve in 10 rounds.
[2]	valid_0's l1: 0.75485
[3]	valid_0's l1: 0.689856
[4]	valid_0's l1: 0.631532
[5]	valid_0's l1: 0.579017
[6]	valid_0's l1: 0.532109
[7]	valid_0's l1: 0.490515
[8]	valid_0's l1: 0.453291
[9]	valid_0's l1: 0.420115
[10]	valid_0's l1: 0.390588
[11]	valid_0's l1: 0.364367
[12]	valid_0's l1: 0.340937
[13]	valid_0's l1: 0.32033
[14]	valid_0's l1: 0.302208
[15]	valid_0's l1: 0.286603
[16]	valid_0's l1: 0.273466
[17]	valid_0's l1: 0.262395
[18]	valid_0's l1: 0.253002
[19]	valid_0's l1: 0.245141
[20]	valid_0's l1: 0.238443
[21]	valid_0's l1: 0.232702
[22]	valid_0's l1: 0.227865
[23]	valid_0's l1: 0.223781
[24]	valid_0's l1: 0.22033
[25]	valid_0's l1: 0.217244
[26]	valid_0's l1: 0.214581
[27]	valid_0's l1: 0.212197
[28]	valid_0's l1: 0.21014
[29]	valid_0's l1: 0.208426
[30]	valid_0's l1: 0.206889
[31]	valid_0's l1: 0.205417
[32]	valid_0's l1: 0.204134
[33]	valid_0's l1: 0.203021
[34]	valid_0's l1

In [2]:
# Score the fitted model (`gbm`) on the full training file.
all_train = pd.read_csv('tap_fun_train.csv')
dt1 = pd.to_datetime(all_train["register_time"])
all_train["register_time"] = dt1.dt.dayofyear

# Drop the first column; the last remaining column is the target and the
# columns before it are the features.
data_train = all_train.iloc[:, 1:]
x_prime2 = data_train.iloc[:, :-1]
y2 = data_train.iloc[:, -1:]

y_train2 = y2['prediction_pay_price'].values
X_train2 = x_prime2.values

# gbm.predict returns a plain NumPy array, not a DataFrame, so it cannot be
# indexed by column name; transform the array directly instead.
y_pred_train2_log = gbm.predict(X_train2, num_iteration=gbm.best_iteration)

# Inverse of the log transform (exp(x) - 1), rounded to two decimals.
# np.expm1 is the numerically stable form of np.exp(x) - 1.
y_pred_train2 = np.round(np.expm1(y_pred_train2_log), 2)

print('The rmse of prediction is:', mean_squared_error(y_train2, y_pred_train2) ** 0.5)

('The rmse of prediction is:', 548.6348295125817)


In [None]:
# Predict on the test file and write the submission CSV
# (columns: user_id, prediction_pay_price).
all_test = pd.read_csv('tap_fun_test_weight.csv')
dt1 = pd.to_datetime(all_test["register_time"])
all_test["register_time"] = dt1.dt.dayofyear

# Drop the first column; everything that remains is used as a feature.
data_test = all_test.iloc[:, 1:]
x_prime1 = data_test

# Use a dedicated name for the test feature matrix so the training matrix
# `X_train` built in the first cell is not silently overwritten.
X_submit = x_prime1.values
y_pred_test_log = gbm.predict(X_submit, num_iteration=gbm.best_iteration)

# The training-set evaluation inverts the model output with exp(x) - 1 before
# comparing it with `prediction_pay_price`; apply the same inverse here so the
# submission holds prices rather than log-scale values.
# NOTE(review): assumes the target in 'under_sample_data_log.csv' is
# log(1 + price) -- confirm against the data-preparation step.
y_pred_test = pd.DataFrame(
    {
        'user_id': all_test['user_id'].values,
        'prediction_pay_price': np.expm1(y_pred_test_log),
    },
    columns=['user_id', 'prediction_pay_price'],  # fix the output column order
)
y_pred_test.to_csv("all_test_result_20180721.csv", index=False)