In [7]:
import csv
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit # pip install verstack
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

In [2]:
# Load the training data
train_data = pd.read_csv("data/train_transformed.csv")

In [3]:
eva_data = pd.read_csv("data/evaluation_transformed.csv")

In [4]:
train_data.head()

Unnamed: 0,id,user_verified,user_statuses_count,user_followers_count,user_friends_count,ratio_friends_followers,mention_exist,mention_count,url_exist,url_count,...,tf_idf_0,tf_idf_1,tf_idf_2,tf_idf_3,tf_idf_4,tf_idf_5,tf_idf_6,tf_idf_7,tf_idf_8,tf_idf_9
0,0,0,68460,1101,1226,1.112523,0,0,0,0,...,-1.606137e-15,-1.992248e-13,-6.077335e-13,-1.726278e-12,7.402787e-13,2.37598e-14,2.692723e-13,5.657295e-13,-2.124336e-14,9.444088e-14
1,1,0,309,51,202,3.884615,0,0,0,0,...,0.01206801,0.002715082,0.01318767,0.03157979,0.02662453,0.01621955,-0.02204838,0.00931604,0.00328826,0.07966161
2,2,0,3241,1675,2325,1.387232,0,0,0,0,...,0.05095493,-0.001131896,0.03852378,0.116344,0.15423,0.2938599,0.4018045,0.190963,0.3218782,-0.04933187
3,3,0,32327,667,304,0.45509,0,0,0,0,...,0.008873053,0.001403727,0.003399734,0.01401861,0.009761796,0.009215406,0.006693422,0.008483338,-0.002502589,0.01261141
4,4,0,581,42,127,2.953488,0,0,0,0,...,0.02774364,-0.001756079,0.00824523,0.03155745,0.0138588,0.005119571,0.003617241,0.01908667,-0.01212654,0.01883319


In [5]:
# scsplit method is used in order to split our regression data in a stratisfied way and keep a similar distribution of retweet counts between the two sets
X_train_all, X_test_all, y_train, y_test = scsplit(train_data, train_data['retweet_count'], stratify=train_data['retweet_count'], train_size=0.7, test_size=0.3)


In [6]:
features_need_scaled=['user_statuses_count', 'user_followers_count', 'user_friends_count', 'ratio_friends_followers', 'mention_count','url_count', 'hashtag_count', 'text_length']
scaler = preprocessing.StandardScaler()
scaler.fit(X_train_all[features_need_scaled])
X_train_all[features_need_scaled] = scaler.transform(X_train_all[features_need_scaled])
X_test_all[features_need_scaled] = scaler.transform(X_test_all[features_need_scaled])
eva_data[features_need_scaled] = scaler.transform(eva_data[features_need_scaled])
X_train_all.head()

Unnamed: 0,id,user_verified,user_statuses_count,user_followers_count,user_friends_count,ratio_friends_followers,mention_exist,mention_count,url_exist,url_count,...,tf_idf_0,tf_idf_1,tf_idf_2,tf_idf_3,tf_idf_4,tf_idf_5,tf_idf_6,tf_idf_7,tf_idf_8,tf_idf_9
621957,621957,0,-0.383191,-0.095283,-0.041342,0.69217,0,-0.231249,0,-0.675342,...,0.658434,0.725856,0.077128,-0.108096,-0.051642,-0.033,-0.013369,-0.05708,0.011628,-0.060725
479798,479798,0,0.061909,-0.081055,0.021332,-0.214172,0,-0.231249,0,-0.675342,...,0.346298,-0.413063,0.801847,0.144007,-0.157371,-0.071168,-0.013226,-0.057253,-0.023296,-0.057848
312849,312849,0,-0.30982,-0.095253,-0.152957,-0.131321,0,-0.231249,1,1.364308,...,0.652465,-0.185419,-0.523064,0.404621,-0.213981,-0.070115,-0.037405,-0.147147,0.08643,-0.065926
321181,321181,0,0.500588,-0.094681,-0.03691,-0.092401,0,-0.231249,0,-0.675342,...,0.037329,0.002418,0.003632,0.057988,0.024457,0.040928,0.017294,0.041428,-0.017589,0.174764
336415,336415,0,0.161973,-0.092153,-0.147276,-0.218517,0,-0.231249,0,-0.675342,...,0.02433,0.002255,0.007894,0.027406,0.015418,0.020338,0.002823,0.076444,-0.01872,0.18646


## Multi-layer perceptron enhanced random forest 

In [14]:
def mlp_rf(X_train, y_train, random_state, hidden_layer_sizes, batch_size, max_depth, n_estimators):
    
    # MLP部分
    mlp = MLPRegressor(random_state= 77,  
                        hidden_layer_sizes= hidden_layer_sizes,  
                        batch_size= batch_size,  
                        learning_rate_init=.01,
                        early_stopping=False,
                        verbose=True,
                        shuffle=True,
                        n_iter_no_change=10)
    
    y_train_log = np.log(y_train+1)
    
    mlp.fit(X_train, y_train_log) 
    
    mlp_y_train_predict = mlp.predict(X_train) # this produces a ndarray
    print("Prediction error:", mean_absolute_error(y_true=y_test, y_pred = np.exp(mlp.predict(X_test))-1 ))
    
    # 随机森林部分
    # for training residual RF
    rf_y_train = y_train_log - mlp_y_train_predict
    
    reg = RandomForestRegressor(max_depth = max_depth,   
                                n_estimators = n_estimators, 
                                random_state = 7,  
                                n_jobs = 10, 
                                verbose = 5)  

    reg.fit(X_train, rf_y_train)
    return mlp, reg

In [55]:
features_selected=['user_verified', 'user_statuses_count', 'user_followers_count',
                 'user_friends_count', 'ratio_friends_followers', 'mention_exist',
                 'mention_count', 'url_exist', 'url_count', 'hashtag_exist',
                 'hashtag_count', 'weekend', 'text_length', 'sentiment_pos', 
                 'sentiment_neg', 'sentiment_neu', 'tf_idf_0', 'tf_idf_1', 'tf_idf_2'] 

### Cross Validation

In [31]:
from sklearn.model_selection import KFold

In [42]:
kf = KFold(n_splits = 5, shuffle = True, random_state = 43)

In [32]:
KFold(5, shuffle = True, random_state = 43).split(X_train, y_train)

<generator object _BaseKFold.split at 0x000002CA4965A9C8>

In [56]:
X_train = train_data[features_selected]
y_train = train_data['retweet_count']
eval_data = pd.read_csv("data/evaluation_transformed.csv")
X_val = eval_data[features_selected]


features_need_scaled=['user_statuses_count', 'user_followers_count', 'user_friends_count', 'ratio_friends_followers', 'mention_count','url_count', 'hashtag_count', 'text_length']
scaler = preprocessing.StandardScaler()
scaler.fit(X_train[features_need_scaled])
X_train[features_need_scaled] = scaler.transform(X_train[features_need_scaled])
X_val[features_need_scaled] = scaler.transform(X_val[features_need_scaled])

In [None]:
results_cv=[]

for train_index, test_index in kf.split(np.array(X_train)):
    X_train_tmp, X_test_tmp = pd.DataFrame(np.array(X_train)[train_index]), pd.DataFrame(np.array(X_train)[test_index])
    y_train_tmp, y_test_tmp = y_train[train_index], y_train[test_index]
    mlp, reg = mlp_rf(X_train_tmp, y_train_tmp, 41, (128,64,32,8), 512, 18, 100)
    
    mlp_predict_y_log = mlp.predict(X_test_tmp)

    rf_predict_y_log = reg.predict(X_test_tmp)

    y_predict = np.exp(mlp_predict_y_log + rf_predict_y_log) -1

    print(mean_absolute_error(y_test_tmp, y_predict))
    results_cv.append(mean_absolute_error(y_test_tmp, y_predict))
print(results_cv.mean())

Iteration 1, loss = 0.58280993
Iteration 2, loss = 0.54012431
Iteration 3, loss = 0.53117392
Iteration 4, loss = 0.52751973
Iteration 5, loss = 0.52695528
Iteration 6, loss = 0.52305152
Iteration 7, loss = 0.52090465
Iteration 8, loss = 0.51890148
Iteration 9, loss = 0.51895809
Iteration 10, loss = 0.51742144
Iteration 11, loss = 0.51572674
Iteration 12, loss = 0.51475043
Iteration 13, loss = 0.51310208
Iteration 14, loss = 0.51176104
Iteration 15, loss = 0.51085639
Iteration 16, loss = 0.51081340
Iteration 17, loss = 0.51084512
Iteration 18, loss = 0.50960153
Iteration 19, loss = 0.51003399
Iteration 20, loss = 0.51020377
Iteration 21, loss = 0.50773390
Iteration 22, loss = 0.50821287
Iteration 23, loss = 0.50781050
Iteration 24, loss = 0.50649050
Iteration 25, loss = 0.50609081
Iteration 26, loss = 0.50546725
Iteration 27, loss = 0.50535102
Iteration 28, loss = 0.50552186
Iteration 29, loss = 0.50475888
Iteration 30, loss = 0.50396550
Iteration 31, loss = 0.50361109
Iteration 32, los