## Import Packages

In [5]:
import pandas as pd
import numpy as np
import time
import pickle
import csv
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder
from deepctr.models import xDeepFM, DeepFM
from deepctr.feature_column import  SparseFeat, DenseFeat, get_feature_names
import os
from sklearn.model_selection import train_test_split

## Load and preprocess data

In [29]:
# Load the pre-transformed training table (features plus the `retweet_count` target).
# NOTE(review): the file also carries an 'Unnamed: 0' column (visible in the
# head() output below), presumably a saved DataFrame index -- confirm.
train_data = pd.read_csv("data/train_transformed.csv")

In [30]:
# Peek at the first row to check which columns were loaded.
train_data.head(1)

Unnamed: 0,id,user_verified,user_statuses_count,user_followers_count,user_friends_count,ratio_friends_followers,mention_exist,mention_count,url_exist,url_count,...,tf_idf_0,tf_idf_1,tf_idf_2,tf_idf_3,tf_idf_4,tf_idf_5,tf_idf_6,tf_idf_7,tf_idf_8,tf_idf_9
0,0,0,68460,1101,1226,1.112523,0,0,0,0,...,-1.606137e-15,-1.992248e-13,-6.077335e-13,-1.726278e-12,7.402787e-13,2.37598e-14,2.692723e-13,5.657295e-13,-2.124336e-14,9.444088e-14


In [31]:
# Hold out 30% of the rows for evaluation. The fixed seed makes the split
# (and therefore every score computed later) reproducible across kernel restarts.
train, test = train_test_split(train_data, test_size=0.3, random_state=7)

In [32]:
# Target vectors (pandas Series) for each split.
y_train = train['retweet_count']
y_test = test['retweet_count']

# Feature matrices (pandas DataFrames). Besides the target and the tweet id,
# also drop the 'Unnamed: 0' column that read_csv produced: if it stayed in,
# it would become feature 0 and shift every positional feature index used
# further down by one. errors='ignore' keeps this working when the CSV has no
# such column.
NON_FEATURE_COLUMNS = ['retweet_count', 'id', 'Unnamed: 0']
X_train = train.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
X_test = test.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

## MAE metric

In [10]:
# Evaluation metric: mean absolute error (MAE).
# Inputs may be any array-like (list, ndarray, pandas Series, single-column frame).

def MAE(y_predict, y_true):
    """Return the mean absolute error between predictions and ground truth.

    Both inputs are converted to flat float arrays before comparison, so the
    pairing is purely positional: a pandas index is ignored and an ``(n, 1)``
    column is treated like a length-``n`` vector.

    Parameters
    ----------
    y_predict : array-like
        Predicted values.
    y_true : array-like
        Reference values, same number of elements as ``y_predict``.

    Returns
    -------
    float
        ``mean(|y_predict - y_true|)``.

    Raises
    ------
    ValueError
        If the inputs are empty or do not hold the same number of elements.
    """
    predicted = np.asarray(y_predict, dtype=float).ravel()
    actual = np.asarray(y_true, dtype=float).ravel()
    if predicted.size != actual.size:
        raise ValueError(
            "MAE needs inputs of equal size, got {} and {}".format(predicted.size, actual.size)
        )
    if predicted.size == 0:
        raise ValueError("MAE is undefined for empty inputs")
    return float(np.mean(np.abs(predicted - actual)))

## Train functions

### LRRF

In [1]:
def train_lrrf(X_train_lr, y_train_lr):
    """Train and store LRRF (linear regression + random forest).

    A linear regression without intercept is fitted first; a random forest is
    then fitted on the residuals ``y - lr_prediction`` using the same features.
    Both fitted models are pickled under ``./model`` and one line per model is
    appended to ``log.txt``.

    Parameters
    ----------
    X_train_lr : pandas.DataFrame
        Training features; ``.values`` is handed to scikit-learn.
    y_train_lr : pandas.Series or array-like
        Training target, aligned row by row with ``X_train_lr``.
    """
    logname = 'log.txt'
    # The model folder is not created anywhere else; without it the pickle
    # dump below fails after the model has already been fitted.
    os.makedirs('./model', exist_ok=True)
    train_matrix = X_train_lr.values

    # ---- Stage 1: linear regression ----
    reg = LinearRegression(fit_intercept=False).fit(train_matrix, y_train_lr)
    filename = './model/lr_demo.sav'  # rename whenever the parameters change, so runs stay traceable
    with open(filename, 'wb') as model_file:
        pickle.dump(reg, model_file)
    lr_y_train_predict = reg.predict(train_matrix)  # ndarray
    print("The training MAE for linear regression is {}".format(MAE(lr_y_train_predict, np.array(y_train_lr))))

    # Append to the log file (keep the parameters in the message in sync with the code).
    log_entry = "{}: LinearRegressor(fit_intercept=False) \n\n".format(filename)
    with open(logname, 'a+') as f:
        f.write(log_entry)

    # ---- Stage 2: random forest trained on the linear model's residuals ----
    rf_y_train = y_train_lr - lr_y_train_predict

    reg = RandomForestRegressor(max_depth=20,      # tunable
                                n_estimators=500,  # tunable
                                random_state=7,    # tunable
                                n_jobs=10,         # number of parallel jobs (the paper uses 3)
                                verbose=5)
    start_time = time.time()
    reg.fit(train_matrix, rf_y_train)
    elapsed_time = time.time() - start_time
    print("took {} seconds for fitting".format(elapsed_time))

    # Save the random forest regressor.
    filename = './model/lrrf_demo.sav'  # rename whenever the parameters change, so runs stay traceable
    with open(filename, 'wb') as model_file:
        pickle.dump(reg, model_file)

    # Append to the log file (keep the parameters in the message in sync with the code).
    log_entry = " {}: Random forest regressor: max_depth=20, n_estimators=500, random_state=7 \n\n".format(filename)
    with open(logname, 'a+') as f:
        f.write(log_entry)

### NNRF

In [2]:
def train_nnrf(X_train_nn, y_train_nn):
    """Train and store NNRF (MLP neural network + random forest).

    An ``MLPRegressor`` is fitted first; a random forest is then fitted on the
    residuals ``y - mlp_prediction`` using the same features. Both fitted
    models are pickled under ``./model`` and one line per model is appended to
    ``log.txt``. The log lines are generated from the hyper-parameters that
    were actually used, so they cannot drift from the code.

    Parameters
    ----------
    X_train_nn : pandas.DataFrame
        Training features; ``.values`` is handed to scikit-learn.
    y_train_nn : pandas.Series or array-like
        Training target, aligned row by row with ``X_train_nn``.
    """
    logname = 'log.txt'
    # The model folder is not created anywhere else; without it the pickle
    # dump below fails after the (slow) fit has already finished.
    os.makedirs('./model', exist_ok=True)
    train_matrix = X_train_nn.values

    ########### regression
    mlp_params = dict(random_state=7,                          # tunable
                      hidden_layer_sizes=(64, 32, 16, 8, 8),   # tunable
                      batch_size=1024,                         # tunable
                      learning_rate_init=.01,
                      early_stopping=False,
                      verbose=True,
                      shuffle=True,
                      n_iter_no_change=10)
    mlp = MLPRegressor(**mlp_params)
    start_time = time.time()
    mlp.fit(train_matrix, y_train_nn)
    elapsed_time = time.time() - start_time
    print("took {} seconds for fitting".format(elapsed_time))

    filename = './model/nn_demo10.sav'  # rename whenever the parameters change, so runs stay traceable
    with open(filename, 'wb') as model_file:
        pickle.dump(mlp, model_file)

    # Append to the log file, reporting the parameters that were really used.
    log_entry = "{}: MLPRegressor(batch_size={},hidden_layer_sizes={},random_state={}) \n\n".format(
        filename,
        mlp_params['batch_size'],
        mlp_params['hidden_layer_sizes'],
        mlp_params['random_state'])
    with open(logname, 'a+') as f:
        f.write(log_entry)

    ############ Random Forest
    nn_y_train_predict = mlp.predict(train_matrix)
    print("The training MAE for neural networks is {}".format(MAE(nn_y_train_predict, np.array(y_train_nn))))

    # Residuals of the network are the target of the forest.
    rf_y_train = y_train_nn - nn_y_train_predict

    rf_params = dict(max_depth=18,      # tunable
                     n_estimators=500,  # tunable
                     random_state=7,    # tunable
                     n_jobs=10,         # number of parallel jobs (the paper uses 3)
                     verbose=5)
    forest = RandomForestRegressor(**rf_params)
    start_time = time.time()
    forest.fit(train_matrix, rf_y_train)
    elapsed_time = time.time() - start_time
    print("took {} seconds for fitting".format(elapsed_time))

    # Save the random forest regressor.
    filename = './model/nnrf_demo10.sav'  # rename whenever the parameters change, so runs stay traceable
    with open(filename, 'wb') as model_file:
        pickle.dump(forest, model_file)

    # Append to the log file, reporting the parameters that were really used.
    log_entry = "{}: RandomForestRegressor(n_estimators={}, max_depth={}, random_state={}) \n\n".format(
        filename,
        rf_params['n_estimators'],
        rf_params['max_depth'],
        rf_params['random_state'])
    with open(logname, 'a+') as f:
        f.write(log_entry)

### XdeepFMRF

In [3]:
def train_xdeepfmrf(X_train, y_train):
    """Train and store FMRF (xDeepFM + random forest).

    An xDeepFM regression model is fitted first; a random forest is then
    fitted on the residuals ``y - xdeepfm_prediction``. The network weights
    and the pickled forest are written under ``./model`` and one line per
    model is appended to ``log.txt``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features. The frame is copied, so the label-encoding done
        here does not overwrite the caller's data.
    y_train : pandas.Series or array-like
        Training target, aligned row by row with ``X_train``.
    """
    logname = 'log.txt'
    os.makedirs('./model', exist_ok=True)

    # Work on a private copy: the categorical columns are re-encoded below.
    X_train = X_train.copy()
    features = X_train.columns.values.tolist()
    categorical = ["timeseg", "day_of_week"]
    # Only treat the categorical columns as sparse when they are part of the
    # selected feature subset; otherwise the sparse list is empty.
    sparse_features = [feat for feat in categorical if feat in features]
    dense_features = [feat for feat in features if feat not in categorical]

    for feat in sparse_features:
        X_train[feat] = LabelEncoder().fit_transform(X_train[feat])

    sparse_feature_columns = [SparseFeat(feat, vocabulary_size=X_train[feat].nunique(), embedding_dim=4)
                              for feat in sparse_features]
    dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]

    # features to be used for dnn part of xdeepfm
    dnn_feature_columns = sparse_feature_columns + dense_feature_columns
    # features to be used for linear part of xdeepfm
    linear_feature_columns = sparse_feature_columns + dense_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    train_model_input = {name: X_train[name].values for name in feature_names}

    model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='regression', seed=28)
    # compiling the model
    model.compile("adam", "mse", metrics=['mse'])
    # training the model
    model.fit(train_model_input, y_train, batch_size=256, epochs=20, verbose=2)
    filename = './model/xDeepFM_w.h5'
    model.save_weights(filename)

    # update the log file
    log_entry = "{}: xdeepfm weights (batch_size=256, epochs=20, verbose=2) \n\n".format(filename)
    with open(logname, 'a+') as f:
        f.write(log_entry)

    # random forest on the residuals of the network
    pred = model.predict(train_model_input, batch_size=256)
    rf_y_train = y_train - pred.reshape(X_train.shape[0], )

    reg = RandomForestRegressor(max_depth=16,
                                max_features=.5,
                                n_estimators=500,
                                random_state=28,
                                n_jobs=10,
                                verbose=5)
    start_time = time.time()
    reg.fit(X_train[features].values, rf_y_train)
    elapsed_time = time.time() - start_time
    print("took {} seconds for fitting".format(elapsed_time))
    filename = './model/randomforest_regressor_500e_xdeepfm_rs211_28.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(reg, model_file)

    # update the log file
    log_entry = "{}: Random forest for xdeepfm (n_estimators=500, max_depth=16, random_state=28) \n\n".format(filename)
    with open(logname, 'a+') as f:
        f.write(log_entry)

### DeepFMRF

In [4]:
def train_deepfmrf(X_train, y_train):
    """Train and store FMRF (DeepFM + random forest).

    A DeepFM regression model is fitted on the dense features only; a random
    forest is then fitted on the residuals ``y - deepfm_prediction`` using all
    features (categorical ones label-encoded). The network weights and the
    pickled forest are written under ``./model`` and one line per model is
    appended to ``log.txt``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features. The frame is copied, so the label-encoding done
        here does not overwrite the caller's data.
    y_train : pandas.Series or array-like
        Training target, aligned row by row with ``X_train``.
    """
    logname = 'log.txt'
    os.makedirs('./model', exist_ok=True)

    # Work on a private copy: the categorical columns are re-encoded below.
    X_train = X_train.copy()
    features = X_train.columns.values.tolist()
    categorical = ["timeseg", "day_of_week"]
    # Only encode the categorical columns that are part of the selected
    # feature subset; otherwise the sparse list is empty.
    sparse_features = [feat for feat in categorical if feat in features]
    dense_features = [feat for feat in features if feat not in categorical]

    # The encoded categorical columns are not fed to DeepFM, but they are part
    # of the matrix the random forest is trained on further down.
    for feat in sparse_features:
        X_train[feat] = LabelEncoder().fit_transform(X_train[feat])

    dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]

    # features to be used for dnn part of deepfm
    dnn_feature_columns = dense_feature_columns
    # features to be used for linear part of deepfm
    linear_feature_columns = dense_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    train_model_input = {name: X_train[name].values for name in feature_names}

    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression', seed=29)

    # compiling the model
    model.compile("adam", "mse", metrics=['mse'])
    # training the model
    model.fit(train_model_input, y_train, batch_size=4096, epochs=150, verbose=2)
    # The file name says "xDeepFM" but holds DeepFM weights; it is kept as is
    # because the prediction code loads the weights from this exact path.
    filename = './model/xDeepFM_w_seed29.h5'
    model.save_weights(filename)

    # update the log file
    log_entry = "{}: deepfm weights (batch_size=4096, epochs=150, verbose=2) \n\n".format(filename)
    with open(logname, 'a+') as f:
        f.write(log_entry)

    pred = model.predict(train_model_input, batch_size=4096)
    rf_y_train = y_train - pred.reshape(X_train.shape[0], )

    # random forest on the residuals of the network
    reg = RandomForestRegressor(max_depth=16,
                                max_features=.5,
                                n_estimators=500,
                                random_state=29,
                                n_jobs=10,
                                verbose=5)
    start_time = time.time()
    reg.fit(X_train[features].values, rf_y_train)
    elapsed_time = time.time() - start_time
    print("took {} seconds for fitting".format(elapsed_time))
    filename = './model/randomforest_regressor_500e_xdeepfm_rs211_29.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(reg, model_file)

    # update the log file
    log_entry = "{}: Random forest for deepfm (n_estimators=500, max_depth=16, random_state=29) \n\n".format(filename)
    with open(logname, 'a+') as f:
        f.write(log_entry)

## Predict functions

### LRRF

In [179]:
def lrrf_predict(X_test_lr, y_test_lr=None):
    """Predict with the stored LRRF model (model 1) and save the result.

    Loads the pickled linear regression and residual random forest from
    ``./model``, adds their predictions and writes the integer-truncated sum
    to ``output/model1.predict`` (one value per line).

    Parameters
    ----------
    X_test_lr : pandas.DataFrame
        Features to predict on; must have the columns the models were fitted on.
    y_test_lr : pandas.Series or array-like, optional
        Ground truth. When given, the MAE of the combined prediction is
        printed; when omitted, only the prediction file is written.
    """
    os.makedirs('output', exist_ok=True)
    test_matrix = X_test_lr.values

    # model 1 - LRRF
    # NOTE: pickle.load executes code embedded in the file; only load model
    # files that were produced by this notebook.
    with open('./model/lr_demo.sav', 'rb') as model_file:
        lr_model = pickle.load(model_file)
    lr_y_predict = lr_model.predict(test_matrix)

    with open('./model/lrrf_demo.sav', 'rb') as model_file:
        rf_model = pickle.load(model_file)
    rf_predict = rf_model.predict(test_matrix)

    # Same (n_estimators, n_samples) shape report as before, without running
    # every tree of the forest a second time just to measure it.
    print((len(rf_model.estimators_), test_matrix.shape[0]))

    # combine
    y_predict = lr_y_predict + rf_predict
    if y_test_lr is not None:
        print("The testing MAE for lrrf is {}".format(MAE(y_predict, np.array(y_test_lr))))
    # save results
    np.savetxt("output/model1.predict", y_predict.astype(int), fmt='%i')

### NNRF

In [181]:
def nnrf_predict(X_test_nn, y_test_nn=None):
    """Predict with every stored NNRF pair and save one file per pair.

    For each (MLP, residual random forest) pair listed below, the two
    predictions are added and the integer-truncated sum is written to
    ``output/model<k>.predict``, with ``k`` running from 2 in listing order.

    Parameters
    ----------
    X_test_nn : pandas.DataFrame
        Features to predict on; must have the columns the models were fitted on.
    y_test_nn : optional
        Accepted for signature symmetry with ``lrrf_predict``; not used.
    """
    os.makedirs('output', exist_ok=True)
    test_matrix = X_test_nn.values

    # MLP pickle -> pickle of the random forest trained on its residuals
    model_dict = {
        './model/nn_demo1.sav': './model/nnrf_demo1.sav',    # model 2
        './model/nn_demo2.sav': './model/nnrf_demo2.sav',    # model 3
        './model/nn_demo3.sav': './model/nnrf_demo3.sav',    # model 4
        './model/nn_demo4.sav': './model/nnrf_demo4.sav',    # model 5
        './model/nn_demo5.sav': './model/nnrf_demo5.sav',    # model 6
        './model/nn_demo6.sav': './model/nnrf_demo6.sav',    # model 7
        './model/nn_demo7.sav': './model/nnrf_demo7.sav',    # model 8
        './model/nn_demo8.sav': './model/nnrf_demo8.sav',    # model 9
        './model/nn_demo9.sav': './model/nnrf_demo9.sav',    # model 10
        './model/nn_demo10.sav': './model/nnrf_demo10.sav',  # model 11
    }

    for idx, (regressor_path, forest_path) in enumerate(model_dict.items()):
        # NOTE: pickle.load executes code embedded in the file; only load
        # model files that were produced by this notebook.
        with open(regressor_path, 'rb') as model_file:
            regr = pickle.load(model_file)
        nn_y_val_predict = regr.predict(test_matrix)

        # load rf
        with open(forest_path, 'rb') as model_file:
            reg = pickle.load(model_file)
        rf_val_predict = reg.predict(test_matrix)

        # combine
        y_val_predict = nn_y_val_predict + rf_val_predict
        np.savetxt("output/model{}.predict".format(idx + 2), y_val_predict.astype(int), fmt='%i')

### FMRF

In [23]:
def fmrf_predict(val):
    """Predict with the stored xDeepFM+RF and DeepFM+RF models and save both.

    Writes ``output/model12.predict`` (xDeepFM + random forest) and
    ``output/model13.predict`` (DeepFM + random forest), one integer-truncated
    prediction per line.

    Parameters
    ----------
    val : pandas.DataFrame
        Features to predict on, with the same columns that were passed to the
        matching training functions. The frame is copied, so the
        label-encoding done here does not overwrite the caller's data.
    """
    os.makedirs('output', exist_ok=True)

    # Work on a private copy: the categorical columns are re-encoded below.
    val = val.copy()
    features = val.columns.values.tolist()
    categorical = ["timeseg", "day_of_week"]
    # Only treat the categorical columns as sparse when they are part of the
    # selected feature subset; otherwise the sparse list is empty.
    sparse_features = [feat for feat in categorical if feat in features]
    dense_features = [feat for feat in features if feat not in categorical]

    # NOTE(review): the encoders are re-fitted on the evaluation data, so the
    # integer codes only match training if both splits contain the same set
    # of category values -- confirm, or persist the training encoders.
    for feat in sparse_features:
        val[feat] = LabelEncoder().fit_transform(val[feat])

    # NOTE(review): vocabulary_size is derived from `val`; presumably it has to
    # equal the value used at training time for load_weights to succeed -- confirm.
    sparse_feature_columns = [SparseFeat(feat, vocabulary_size=val[feat].nunique(), embedding_dim=4)
                              for feat in sparse_features]
    dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]

    # Matrix for both residual forests: the same columns, in the same order
    # and encoding, as the frame the forests were fitted on.
    rf_matrix = val[features].values

    ############ Model xDeepFM

    # load rf
    # NOTE: pickle.load executes code embedded in the file; only load model
    # files that were produced by this notebook.
    filename = './model/randomforest_regressor_500e_xdeepfm_rs211_28.sav'
    with open(filename, 'rb') as model_file:
        reg = pickle.load(model_file)
    # Predict on `val`, not on the notebook-global X_test: the global frame
    # holds every column, whereas this forest was trained on the subset.
    rf_val_predict = reg.predict(rf_matrix)

    print(len(dense_feature_columns))

    # features to be used for dnn part of xdeepfm
    dnn_feature_columns = sparse_feature_columns + dense_feature_columns
    # features to be used for linear part of xdeepfm
    linear_feature_columns = sparse_feature_columns + dense_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
    model.load_weights('./model/xDeepFM_w.h5')

    test_model_input = {name: val[name].values for name in feature_names}

    deepfm_pred = model.predict(test_model_input, batch_size=256)
    deepfm_pred = deepfm_pred.reshape(rf_val_predict.shape)
    deepfm_pred_counter = deepfm_pred + rf_val_predict

    np.savetxt("output/model12.predict", deepfm_pred_counter.astype(int), fmt='%i')

    ############   Model DeepFM

    # load rf
    filename = './model/randomforest_regressor_500e_xdeepfm_rs211_29.sav'
    with open(filename, 'rb') as model_file:
        reg = pickle.load(model_file)
    rf_val_predict = reg.predict(rf_matrix)

    print(len(dense_feature_columns))

    # features to be used for dnn part of deepfm (dense features only)
    dnn_feature_columns = dense_feature_columns
    # features to be used for linear part of deepfm (dense features only)
    linear_feature_columns = dense_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
    # The file name says "xDeepFM" but this is the path the DeepFM training
    # function saves its weights to.
    model.load_weights('./model/xDeepFM_w_seed29.h5')

    test_model_input = {name: val[name].values for name in feature_names}

    deepfm_pred = model.predict(test_model_input, batch_size=256)
    deepfm_pred = deepfm_pred.reshape(rf_val_predict.shape)
    deepfm_pred_counter = deepfm_pred + rf_val_predict
    np.savetxt("output/model13.predict", deepfm_pred_counter.astype(int), fmt='%i')

### Global predict

In [None]:
def global_model_predict(val, y_true=None):
    """Blend the per-model prediction files into one ensemble prediction.

    Missing prediction files for models 1, 2 and 12 trigger the matching
    predict function first. Then every available ``output/model<k>.predict``
    (k = 1..14) is loaded and combined as a weighted mean in which models 8
    and 13 count twice and all others once. With all 14 files present this is
    the original ``sum / 16``; files that do not exist are reported and left
    out, and the divisor shrinks accordingly. The rounded result is written
    to ``output/temp.predict``.

    Parameters
    ----------
    val : pandas.DataFrame
        Features, only used if prediction files have to be (re)generated.
    y_true : pandas.Series or array-like, optional
        Ground truth forwarded to the predict functions that take it.

    Raises
    ------
    FileNotFoundError
        If none of the 14 prediction files exists.
    """
    if not os.path.exists('output'):
        os.makedirs('output')
        print('created output folder as it is not existing...')

    if not os.path.exists('output/model1.predict'):
        lrrf_predict(val, y_true)
    if not os.path.exists('output/model2.predict'):
        nnrf_predict(val, y_true)
    if not os.path.exists('output/model12.predict'):
        fmrf_predict(val)

    # Ensemble weights: every model counts once, models 8 and 13 count twice.
    model_weights = {idx: 1 for idx in range(1, 15)}
    model_weights[8] = 2
    model_weights[13] = 2

    # Load the separate prediction files, already multiplied by their weight.
    weighted_columns = {}
    total_weight = 0
    for idx, weight in model_weights.items():
        path = 'output/model{}.predict'.format(idx)
        if not os.path.exists(path):
            print('{} not found, leaving it out of the average'.format(path))
            continue
        yhat = pd.read_csv(path, header=None)[0]
        weighted_columns["model{}".format(idx)] = yhat * weight
        total_weight += weight

    if total_weight == 0:
        raise FileNotFoundError('no output/model<k>.predict file available to average')

    # Weighted mean over the models that were found.
    predict = pd.DataFrame(weighted_columns)
    predict["yhatavg"] = sum(weighted_columns.values()) / float(total_weight)

    np.savetxt("output/temp.predict", np.round(predict["yhatavg"].values).astype(int), fmt='%i')

In [3]:
def Global_score(y_test_lr, predict_path='output/temp.predict'):
    """Return the MAE of a saved ensemble prediction against the ground truth.

    Parameters
    ----------
    y_test_lr : pandas.Series or array-like
        Ground-truth values, in the same row order as the prediction file.
    predict_path : str, optional
        Header-less, single-column prediction file. Defaults to the file the
        ensemble step writes (``output/temp.predict``).

    Returns
    -------
    The value computed by ``MAE``.
    """
    global_predict = pd.read_csv(predict_path, header=None)
    # Take the single column as a 1-D array so both MAE inputs have the same shape.
    y_predict = global_predict.iloc[:, 0].to_numpy()
    return MAE(y_predict, np.asarray(y_test_lr))

## Train

0: user_verified       $\qquad \qquad$ 

1: user_statuses_count $\qquad \qquad$ 

2: user_followers_count

3: user_friends_count    

4: ratio_friends_followers $\qquad \qquad$ 

5: mention_exist $\qquad \qquad$ 

6: mention_count

7: url_exist $\qquad \qquad$ 

8: url_count $\qquad \qquad$ 

9: hashtag_exist

10: hashtag_count $\qquad \qquad$ 

11: timeseg $\qquad \qquad$ 

12: weekend

13: day_of_week$\qquad \qquad$

14: text_length$\qquad \qquad$

15: sentiment_pos

16: sentiment_neg$\qquad \qquad$

17: sentiment_neu$\qquad \qquad$

18: sentiment_comp

19: tf_idf_0$\qquad \qquad$

20: tf_idf_1

21: tf_idf_2

22: tf_idf_3

23: tf_idf_4

24: tf_idf_5

25: tf_idf_6

26: tf_idf_7

27: tf_idf_8

28: tf_idf_9

In [48]:
def get_features_set(X_train, features_number):
    """Return the columns of ``X_train`` found at the given positions.

    ``features_number`` is an iterable of 0-based column positions; the
    result keeps them in the order given.
    """
    all_columns = X_train.columns.values.tolist()
    selected = [all_columns[position] for position in features_number]
    return X_train[selected]

In [49]:
# Positional feature subset used for training (see the index list above).
# The selector defined in this notebook is get_features_set; get_test_set does not exist.
X_train_lr = get_features_set(X_train, [1,2,4])
y_train_lr = y_train

In [None]:
# Fit linear regression + residual random forest; both models are pickled under ./model.
train_lrrf(X_train_lr, y_train_lr)

In [None]:
# Fit MLP + residual random forest; both models are pickled under ./model.
train_nnrf(X_train_lr, y_train_lr)

In [None]:
# Fit xDeepFM + residual random forest; weights and forest are saved under ./model.
train_xdeepfmrf(X_train_lr, y_train_lr)

In [None]:
# Fit DeepFM + residual random forest; weights and forest are saved under ./model.
train_deepfmrf(X_train_lr, y_train_lr)

## Test

In [47]:
# Same positional feature subset as used for training.
# The selector defined in this notebook is get_features_set; get_test_set does not exist.
X_test_lr = get_features_set(X_test, [1,2,4])

In [36]:
# Ground truth for the held-out rows.
y_test_lr = y_test

In [None]:
# Writes output/model1.predict and prints the test MAE of the LRRF model.
lrrf_predict(X_test_lr, y_test_lr)

In [None]:
# Writes one output/model<k>.predict file per stored NNRF pair, numbered from 2.
nnrf_predict(X_test_lr, y_test_lr)

In [None]:
# Writes output/model12.predict (xDeepFM + RF) and output/model13.predict (DeepFM + RF).
fmrf_predict(X_test_lr)

In [None]:
# Blend the per-model prediction files into output/temp.predict.
global_model_predict(X_test_lr)

In [None]:
# Final score: MAE of the saved ensemble prediction against the held-out target.
Global_score(y_test_lr)