In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MaxAbsScaler
import matplotlib.pyplot as plt
import seaborn as sns
import re
color = sns.color_palette()

%matplotlib inline

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
from subprocess import check_output

import warnings

def ignore_warn(*args ,**kwargs):
    pass
warnings.warn = ignore_warn

In [2]:
train = pd.read_csv("./data/train_20171226.csv")
test = pd.read_csv("./data/test_20171225.csv")
print(train.shape,test.shape)

(20157, 32) (140, 3)


In [3]:
#获取特征sale_month, sale_year
train['sale_month'] = train['sale_date']%100
train['sale_year'] = train['sale_date']//100%100
test['sale_month'] = test['predict_date']%100
test['sale_year'] = test['predict_date']//100%100

In [4]:
#merge train dataset and fill test dataset
merge_train = pd.DataFrame()
all_class_id = train['class_id'].unique().tolist()
for idx in all_class_id:
    all_month = train['sale_date'][train['class_id']==idx].unique().tolist()
    for mon in all_month:
        max_val = train['sale_quantity'][train['class_id']==idx][train['sale_date']==mon].max()
        sale_sum = train['sale_quantity'][train['class_id']==idx][train['sale_date']==mon].sum()
        feat_val = train[train['class_id']==idx][train['sale_date']==mon][train['sale_quantity']==max_val]
        feat_val['sale_quantity'] = sale_sum
        merge_train = pd.concat([merge_train, feat_val[0:1]])
#含有相同销量的同一class_id
merge_train = merge_train.sort_values(by=['sale_date'], ascending=True)

In [5]:
merge_train = merge_train[['sale_date', 'class_id', 'sale_quantity', 'sale_month', 'sale_year']]
cols = ['sale_date', 'class_id', 'sale_quantity', 'sale_month', 'sale_year']
test = test.ix[:,cols]
test['sale_date'] = test['sale_date'].fillna(201711)
test['sale_date'] = test['sale_date'].astype(int)

In [6]:
count = merge_train[['class_id','sale_quantity']].groupby(['class_id'], as_index=1).count().sort_values(['sale_quantity'])
print('less than 10:',count[count['sale_quantity']<13].count())

less than 10: sale_quantity    29
dtype: int64


In [7]:
print(merge_train.shape, test.shape)
class_12 = count[count['sale_quantity']<13]
class_12 = class_12.index.tolist()
all_data = pd.concat((merge_train, test)).reset_index(drop=True)
for idx in class_12:
    li = []
    li = all_data[all_data['class_id']==idx].index.tolist()
    all_data.drop(all_data.index[li],inplace=True)
    all_data = all_data.reset_index(drop=True)

(5587, 5) (140, 5)


In [8]:
def rolling(df,roll,window):
    last=roll(df,window=window).tolist()
    last=[np.NaN]+last
    last.pop()
    return last

new = pd.DataFrame()
for a in all_class_id:
    df = all_data[all_data['class_id'] == a]
    df['last_1_sum'] = rolling(df['sale_quantity'], pd.rolling_sum, window=1)
    df['last_2_sum'] = rolling(df['sale_quantity'], pd.rolling_sum, window=2)
    df['last_3_sum'] = rolling(df['sale_quantity'], pd.rolling_sum, window=3)
    df['last_4_sum'] = rolling(df['sale_quantity'], pd.rolling_sum, window=4)
    df['last_5_sum'] = rolling(df['sale_quantity'], pd.rolling_sum, window=5)
    df['last_6_sum'] = rolling(df['sale_quantity'], pd.rolling_sum, window=6)
    df['last_7_sum'] = rolling(df['sale_quantity'], pd.rolling_sum, window=7)
    df['last_8_sum'] = rolling(df['sale_quantity'], pd.rolling_sum, window=8)
    df['last_9_sum'] = rolling(df['sale_quantity'], pd.rolling_sum, window=9)
    df['last_10_sum'] = rolling(df['sale_quantity'], pd.rolling_sum, window=10)
    df['last_11_sum'] = rolling(df['sale_quantity'], pd.rolling_sum, window=11)
    df['last_12_sum'] = rolling(df['sale_quantity'], pd.rolling_sum, window=12)
    
    df['move_1'] = df['last_1_sum']
    df['move_2'] = df['last_2_sum'] - df['last_1_sum']
    df['move_3'] = df['last_3_sum'] - df['last_2_sum']
    df['move_4'] = df['last_4_sum'] - df['last_3_sum']
    df['move_5'] = df['last_5_sum'] - df['last_4_sum']
   
    df['move_12'] = df['last_12_sum'] - df['last_11_sum']
    
    new = pd.concat([new, df])

In [9]:
data = new.drop(['last_1_sum', 'last_2_sum', 'last_3_sum', 'last_4_sum', 'last_5_sum',
                 'last_6_sum', 'last_7_sum', 'last_8_sum', 'last_9_sum', 'last_10_sum',
                 'last_11_sum', 'last_12_sum'], axis=1)
data.head()

Unnamed: 0,sale_date,class_id,sale_quantity,sale_month,sale_year,move_1,move_2,move_3,move_4,move_5,move_12
3461,201605,289403,9.0,5,16,,,,,,
3589,201606,289403,9.0,6,16,9.0,,,,,
3668,201607,289403,18.0,7,16,9.0,9.0,,,,
3765,201608,289403,120.0,8,16,18.0,9.0,9.0,,,
3861,201609,289403,222.0,9,16,120.0,18.0,9.0,9.0,,


In [10]:
test = data[data['sale_quantity'].isnull()]
test.shape

(111, 11)

In [11]:
train_X = data.dropna()
train_X.head()

Unnamed: 0,sale_date,class_id,sale_quantity,sale_month,sale_year,move_1,move_2,move_3,move_4,move_5,move_12
4804,201705,289403,222.0,5,17,213.0,359.0,231.0,400.0,171.0,9.0
4908,201706,289403,179.0,6,17,222.0,213.0,359.0,231.0,400.0,9.0
4993,201707,289403,375.0,7,17,179.0,222.0,213.0,359.0,231.0,18.0
5072,201708,289403,264.0,8,17,375.0,179.0,222.0,213.0,359.0,120.0
5157,201709,289403,299.0,9,17,264.0,375.0,179.0,222.0,213.0,222.0


In [16]:
#spilt dataset
val_data = train_X[train_X['sale_date']==201710]
test = test.drop(['sale_date', 'sale_quantity'], axis=1)
train_y = train_X['sale_quantity']
train_x = train_X.drop(['sale_date', 'sale_quantity'], axis=1)
print(train_x.shape, train_y.shape, test.shape, len(train_x['class_id'].unique().tolist()),len(test['class_id'].unique().tolist()))

ValueError: labels ['sale_date' 'sale_quantity'] not contained in axis

In [None]:
val_data = train_x[trian_x['sale_date']==201710]

In [380]:
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet,Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from mlxtend.regressor import StackingRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb

In [381]:
def rmsle_cv(model, train_x, train_y):
    n_folds = 5
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train_x)
    rmse= np.sqrt(-cross_val_score(model, train_x, train_y, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

def xgb_Regressor(train_x, train_y, test_x):
    xgb_model = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
    scores = rmsle_cv(xgb_model, train_x, train_y)  
    xgb_model.fit(train_x, train_y)
    pred_train = xgb_model.predict(train_x)
    score = rmsle(train_y, pred_train)
    
    return xgb_model, scores.mean(), score, pred_train

def lgb_Regressor(train_x, train_y, test_x):
    lgb_model = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 30)
    scores = rmsle_cv(lgb_model, train_x, train_y)  
    lgb_model.fit(train_x, train_y)
    pred_train = lgb_model.predict(train_x)
    score = rmsle(train_y, pred_train)
    
    return lgb_model, scores.mean(), score, pred_train
def base_model():
    ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
    lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
    nn = MLPRegressor(
    hidden_layer_sizes = (90, 90),
    alpha = 2.75
    )

    return ENet,lasso,nn

def gboost_Regressor(train_x, train_y, test_x):
    gb_model = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=20, min_samples_split=20, 
                    loss='huber', random_state =5)
    scores = rmsle_cv(gb_model, train_x, train_y)  
    gb_model.fit(train_x, train_y)
    pred_train = gb_model.predict(train_x)
    score = rmsle(train_y, pred_train)
    
    return gb_model, scores.mean(), score, pred_train


In [383]:
print("XGBOOSTRegressor开始训练...")
xgb_model, scores, score, xgb_train_pred = xgb_Regressor(train_x, train_y, test)
print(scores, score)
xgb_pred = xgb_model.predict(test)

XGBOOSTRegressor开始训练...
231.219423641 66.1516684112


In [382]:
print("LGBMRegressor开始训练...")
lgb_model, scores, score, lgb_train_pred= lgb_Regressor(train_x, train_y, test)
print(scores, score)
lgb_pred = lgb_model.predict(test)

LGBMRegressor开始训练...
232.951719749 176.470519867


In [None]:
print("GDBTRegressor开始训练...")
gb_model, scores, score, gb_train_pred = gboost_Regressor(train_x, train_y, test)
print(scores, score)
gb_pred = gb_model.predict(test)

GDBTRegressor开始训练...


In [None]:
ENet,lasso,nn = base_model()
    
ENet.fit(train_x, train_y)
enet_pred = ENet.predict(train_x)
print("ENet:", rmsle(train_y, enet_pred))
    
lasso.fit(train_x, train_y)
lasso_pred = lasso.predict(train_x)
print("lasso:", rmsle(train_y, lasso_pred))
    
nn.fit(train_x, train_y)
nn_pred = nn.predict(train_x)
print("nn:", rmsle(train_y, nn_pred))

In [None]:
# ####Stacking####
print('Stacking...')
stacked_averaged_models = StackingRegressor(
regressors=[ nn, ENet,lasso, lgb_model],
meta_regressor= xgb_model
)
stacked_averaged_models.fit(train_x, train_y)
stacked_train_pred = stacked_averaged_models.predict(train_x.values)
stacked_pred = stacked_averaged_models.predict(test.values)
print(rmsle(train_y, stacked_train_pred))
print(rmsle(train_y,stacked_train_pred*0.40 + gb_train_pred*0.20 + 
       lgb_train_pred*0.20 + xgb_train_pred*0.20))
ensemble = stacked_pred*0.40 + gb_pred*0.20  + lgb_pred*0.20 + xgb_pred*0.20

In [343]:
#submission
test['sale_quantity'] = lgb_pred
cols = test['class_id'].unique().tolist()
sub = pd.read_csv("./data/test_20171225.csv")
for idx in cols:
    sale = test['sale_quantity'][test['class_id']==idx]
    sub['predict_quantity'][sub['class_id']==idx] = list(sale)[0]
for idx in class_12:
    sale = merge_train['sale_quantity'][merge_train['class_id']==idx][merge_train['sale_date']==201710]
    sub['predict_quantity'][sub['class_id']==idx] = list(sale)[0]
sub.to_csv('submission_lgb_diff1_5_move_1_5.csv',index=False)

In [342]:
lgb_pred

array([  237.33906174,   708.60840866,   174.67134746,   156.64716595,
         668.31703533,   204.59043725,   479.99352857,   373.50549489,
         137.53269373,   180.54592017,   496.21709019,   654.86017452,
         347.91908877,   154.40015577,   100.77552838,    56.36663376,
         229.15375718,  2695.35015035,   579.09155915,   278.41082129,
         233.2004149 ,  1636.75985902,   153.8067285 ,   455.01414008,
         482.78063927,  1670.8226792 ,   151.966536  ,   126.55534463,
         169.86675179,   219.29170252,   389.77364281,   209.63164293,
          92.0665864 ,   221.50931004,    58.01087757,   170.29122296,
         101.67050931,   169.21980119,   319.64874862,   292.21923231,
        1027.44758111,   612.71941506,   149.19752623,   361.50218997,
         191.68343411,   199.19155288,    95.69039253,   386.61183877,
         151.90473792,   276.22494402,   227.76666622,   240.10228822,
          33.17556674,    82.44160172,   204.04395657,   465.62919576,
      

In [314]:
test

Unnamed: 0,class_id,sale_month,sale_year,move_1,move_2,move_3,move_4,move_5,move_6,move_7,move_8,move_9,move_10,move_11,move_12,diff,sale_quantity
5398,289403,11,17,272.0,299.0,264.0,375.0,179.0,222.0,213.0,359.0,231.0,400.0,171.0,204.0,,235.047918
5468,745137,11,17,852.0,911.0,780.0,859.0,646.0,533.0,376.0,590.0,476.0,822.0,1673.0,1278.0,,832.505916
5461,714860,11,17,138.0,217.0,238.0,213.0,169.0,186.0,165.0,185.0,138.0,435.0,510.0,313.0,,179.712940
5381,175962,11,17,237.0,214.0,103.0,119.0,143.0,167.0,134.0,301.0,151.0,396.0,633.0,411.0,,198.200207
5394,270690,11,17,595.0,806.0,701.0,883.0,864.0,826.0,692.0,1095.0,1497.0,691.0,1565.0,1124.0,,768.197468
5399,290854,11,17,205.0,393.0,285.0,81.0,71.0,241.0,161.0,214.0,179.0,222.0,232.0,232.0,,233.742359
5457,692703,11,17,551.0,580.0,476.0,556.0,505.0,533.0,406.0,458.0,289.0,551.0,905.0,731.0,,501.938087
5487,978089,11,17,400.0,401.0,419.0,270.0,381.0,512.0,466.0,548.0,494.0,558.0,1042.0,400.0,,397.940866
5388,219195,11,17,125.0,160.0,155.0,220.0,130.0,240.0,200.0,260.0,275.0,265.0,630.0,490.0,,157.496885
5473,851857,11,17,157.0,255.0,225.0,196.0,186.0,137.0,138.0,187.0,98.0,254.0,304.0,275.0,,163.401161
