In [1]:
import csv

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_approximation import Nystroem
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVR
from verstack.stratified_continuous_split import scsplit # pip install verstack

In [2]:
# Load the training data
train_data = pd.read_csv("data/train_transformed.csv")

In [3]:
# Load the evaluation data. This frame is standardised in place by the scaling cell below;
# the cross-validation and evaluation sections re-read the same file as `eval_data`.
eva_data = pd.read_csv("data/evaluation_transformed.csv")

In [4]:
train_data.head()

Unnamed: 0,id,user_verified,user_statuses_count,user_followers_count,user_friends_count,ratio_friends_followers,mention_exist,mention_count,url_exist,url_count,...,tf_idf_0,tf_idf_1,tf_idf_2,tf_idf_3,tf_idf_4,tf_idf_5,tf_idf_6,tf_idf_7,tf_idf_8,tf_idf_9
0,0,0,68460,1101,1226,1.112523,0,0,0,0,...,-1.606137e-15,-1.992248e-13,-6.077335e-13,-1.726278e-12,7.402787e-13,2.37598e-14,2.692723e-13,5.657295e-13,-2.124336e-14,9.444088e-14
1,1,0,309,51,202,3.884615,0,0,0,0,...,0.01206801,0.002715082,0.01318767,0.03157979,0.02662453,0.01621955,-0.02204838,0.00931604,0.00328826,0.07966161
2,2,0,3241,1675,2325,1.387232,0,0,0,0,...,0.05095493,-0.001131896,0.03852378,0.116344,0.15423,0.2938599,0.4018045,0.190963,0.3218782,-0.04933187
3,3,0,32327,667,304,0.45509,0,0,0,0,...,0.008873053,0.001403727,0.003399734,0.01401861,0.009761796,0.009215406,0.006693422,0.008483338,-0.002502589,0.01261141
4,4,0,581,42,127,2.953488,0,0,0,0,...,0.02774364,-0.001756079,0.00824523,0.03155745,0.0138588,0.005119571,0.003617241,0.01908667,-0.01212654,0.01883319


In [5]:
# scsplit splits the regression data in a stratified way so that both sets keep a
# similar distribution of retweet counts.
# random_state pins the shuffle so the 70/30 split is reproducible across kernel restarts.
# NOTE(review): the whole of train_data is passed as X, so X_train_all / X_test_all still
# contain the 'retweet_count' target column — select feature columns before modelling on them.
X_train_all, X_test_all, y_train, y_test = scsplit(train_data, train_data['retweet_count'], stratify=train_data['retweet_count'], train_size=0.7, test_size=0.3, random_state=7)


In [6]:
# Numeric columns that get standardised; every other column is left as loaded.
features_need_scaled = [
    'user_statuses_count', 'user_followers_count', 'user_friends_count',
    'ratio_friends_followers', 'mention_count', 'url_count',
    'hashtag_count', 'text_length',
]
scaler = preprocessing.StandardScaler()
# Mean/std are learned from the training split only and then reused for the other two frames.
X_train_all[features_need_scaled] = scaler.fit_transform(X_train_all[features_need_scaled])
X_test_all[features_need_scaled] = scaler.transform(X_test_all[features_need_scaled])
eva_data[features_need_scaled] = scaler.transform(eva_data[features_need_scaled])
X_train_all.head()

Unnamed: 0,id,user_verified,user_statuses_count,user_followers_count,user_friends_count,ratio_friends_followers,mention_exist,mention_count,url_exist,url_count,...,tf_idf_0,tf_idf_1,tf_idf_2,tf_idf_3,tf_idf_4,tf_idf_5,tf_idf_6,tf_idf_7,tf_idf_8,tf_idf_9
249976,249976,0,0.434115,-0.096133,-0.141457,-0.152902,0,-0.232108,0,-0.674532,...,0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0
583170,583170,0,-0.404448,-0.096042,-0.123487,-0.124417,0,-0.232108,1,1.364984,...,0.121115,-0.054569,-0.189247,0.391549,-0.153283,-0.041249,-0.017224,-0.072703,0.046831,-0.024759
416882,416882,0,-0.387247,-0.088715,-0.152388,-0.224277,0,-0.232108,0,-0.674532,...,0.034345,-0.011252,0.036042,0.038673,0.058022,0.117585,0.050072,0.570273,0.761369,-0.170455
40485,40485,1,0.40114,-0.006517,7.478273,-0.159899,0,-0.232108,1,1.364984,...,0.291657,-0.053307,-0.116755,-0.130682,0.04218,0.014364,0.002779,-0.005117,0.004489,0.008621
618716,618716,1,0.223808,-0.090402,-0.118222,-0.219739,0,-0.232108,1,1.364984,...,0.660604,0.411702,-0.054703,-0.156545,0.002126,-0.013461,-0.010475,-0.021559,-0.015615,-0.019898


## Random forest enhanced SVM

In [14]:
def rf_svr(X_train, y_train, max_depth, n_estimators, n_jobs=10, verbose=5, random_state=7):
    """Fit a random forest on log1p(y) and a linear SVR on the forest's residuals.

    The two models are meant to be recombined by the caller as
    ``np.exp(svr.predict(X) + reg.predict(X)) - 1``.

    Parameters
    ----------
    X_train : array-like or DataFrame
        Feature matrix used to fit both models.
    y_train : array-like or Series
        Targets on the original (non-log) scale; values must be > -1 for the
        log transform to be defined.
    max_depth, n_estimators : int
        Forwarded to ``RandomForestRegressor``.
    n_jobs : int, default 10
        Number of parallel workers for the forest.
    verbose : int, default 5
        Forest logging verbosity; pass 0 to keep the per-tree log out of the
        notebook output.
    random_state : int, default 7
        Seed for the forest and for ``LinearSVR`` so that repeated fits give
        the same models.

    Returns
    -------
    (svr, reg)
        The fitted ``LinearSVR`` (trained on residuals) and the fitted
        ``RandomForestRegressor`` (trained on the log target), in that order.
    """
    # log1p is the numerically stable form of log(y + 1)
    y_train_log = np.log1p(y_train)

    # random forest on the log target
    reg = RandomForestRegressor(max_depth=max_depth,
                                n_estimators=n_estimators,
                                random_state=random_state,
                                n_jobs=n_jobs,
                                verbose=verbose)
    reg.fit(X_train, y_train_log)

    # The residuals are computed on the same rows the forest was trained on
    # (reg.predict returns an ndarray), so they are in-sample residuals.
    svr_y_train = y_train_log - reg.predict(X_train)

    # svr on what the forest did not explain; seeded for reproducibility
    svr = LinearSVR(random_state=random_state)
    svr.fit(X_train, svr_y_train)

    return svr, reg

### Cross Validation

In [9]:
# NOTE(review): `features_selected` is only defined in a later cell, so on a fresh kernel
# that cell has to be executed before this one.
# .copy() gives independent frames, so the column assignments below neither touch
# train_data / eval_data nor raise SettingWithCopyWarning.
X_train = train_data[features_selected].copy()
y_train = train_data['retweet_count']
eval_data = pd.read_csv("data/evaluation_transformed.csv")
X_val = eval_data[features_selected].copy()


features_need_scaled=['user_statuses_count', 'user_followers_count', 'user_friends_count', 'ratio_friends_followers', 'mention_count','url_count', 'hashtag_count', 'text_length']
# Standardise with statistics learned from the training frame only.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train[features_need_scaled])
X_train[features_need_scaled] = scaler.transform(X_train[features_need_scaled])
X_val[features_need_scaled] = scaler.transform(X_val[features_need_scaled])

In [10]:
X_train.shape

(665777, 19)

In [12]:
# Feature columns used to build X_train / X_val (19 columns).
# NOTE(review): the cross-validation setup cell above already indexes with this list,
# so this cell must run first on a fresh kernel — consider moving it up.
features_selected=['user_verified', 'user_statuses_count', 'user_followers_count',
                 'user_friends_count', 'ratio_friends_followers', 'mention_exist',
                 'mention_count', 'url_exist', 'url_count', 'hashtag_exist',
                 'hashtag_count', 'weekend', 'text_length', 'sentiment_pos', 
                 'sentiment_neg', 'sentiment_neu', 'tf_idf_0', 'tf_idf_1', 'tf_idf_2'] 

In [11]:
# 5-fold splitter with shuffling and a fixed seed so fold membership is reproducible.
# KFold is imported with the other sklearn imports in the first cell.
kf = KFold(n_splits = 5, shuffle = True, random_state = 43)

In [15]:
results_cv = []

# Columns standardised inside every fold (loop-invariant, so defined once).
features_need_scaled=['user_statuses_count', 'user_followers_count', 'user_friends_count', 'ratio_friends_followers', 'mention_count','url_count', 'hashtag_count', 'text_length']

for train_index, test_index in kf.split(X_train):
    # kf.split yields positional indices, so rows are selected with .iloc (not by label).
    # .copy() gives each fold its own frame that can be rescaled without touching X_train.
    X_train_tmp = X_train.iloc[train_index].copy()
    X_test_tmp = X_train.iloc[test_index].copy()
    y_train_tmp, y_test_tmp = y_train.iloc[train_index], y_train.iloc[test_index]

    # normalization: statistics come from the training fold only
    scaler = preprocessing.StandardScaler()
    scaler.fit(X_train_tmp[features_need_scaled])
    X_train_tmp[features_need_scaled] = scaler.transform(X_train_tmp[features_need_scaled])
    X_test_tmp[features_need_scaled] = scaler.transform(X_test_tmp[features_need_scaled])

    # train
    svr, reg = rf_svr(X_train_tmp, y_train_tmp, 18, 100)

    # predict: forest prediction plus SVR residual correction, both on the log scale
    rf_predict_y_log = reg.predict(X_test_tmp)
    svr_predict_y_log = svr.predict(X_test_tmp)

    # expm1 inverts the log(y + 1) transform applied inside rf_svr
    y_predict = np.expm1(svr_predict_y_log + rf_predict_y_log)

    # compute the fold score once, then report and store it
    fold_mae = mean_absolute_error(y_test_tmp, y_predict)
    print(fold_mae)
    results_cv.append(fold_mae)

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


building tree 1 of 100building tree 2 of 100

building tree 3 of 100
building tree 4 of 100building tree 5 of 100

building tree 6 of 100building tree 7 of 100

building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:   59.8s


building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:  1.7min finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:    0.7s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    1.3s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.3s finished


136.27668378516705


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


building tree 1 of 100building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100

building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100building tree 13 of 100

building tree 14 of 100
building tree 15 of 100
building tree 16 of 100building tree 17 of 100

building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100building tree 26 of 100

building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:  1.3min


building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:  2.1min finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:    0.9s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    1.6s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.2s finished


139.69827846738562


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100building tree 10 of 100

building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100building tree 16 of 100

building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:  1.1min


building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:  1.9min finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:    0.7s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    1.4s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.2s finished


134.95241012617979


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


building tree 1 of 100building tree 2 of 100

building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:   52.0s


building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:  1.5min finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:    0.6s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    1.2s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.2s finished


146.71589803266176


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


building tree 1 of 100
building tree 2 of 100building tree 3 of 100

building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100building tree 8 of 100

building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100building tree 17 of 100

building tree 18 of 100
building tree 19 of 100building tree 20 of 100

building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:  1.2min


building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100building tree 88 of 100

building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:  1.9min finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:    0.8s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    1.3s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.2s finished


130.68529422640623


In [16]:
# Average MAE over the cross-validation folds.
print(np.mean(results_cv))

137.6657129275601


## Evaluation and dump

In [13]:
# Rebuild the full-data feature frames for the final fit (same preparation as the
# cross-validation setup cell).
# .copy() gives independent frames, so the column assignments below neither touch
# train_data / eval_data nor raise SettingWithCopyWarning.
X_train = train_data[features_selected].copy()
y_train = train_data['retweet_count']
eval_data = pd.read_csv("data/evaluation_transformed.csv")
X_val = eval_data[features_selected].copy()


features_need_scaled=['user_statuses_count', 'user_followers_count', 'user_friends_count', 'ratio_friends_followers', 'mention_count','url_count', 'hashtag_count', 'text_length']
# Standardise with statistics learned from the training frame only.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train[features_need_scaled])
X_train[features_need_scaled] = scaler.transform(X_train[features_need_scaled])
X_val[features_need_scaled] = scaler.transform(X_val[features_need_scaled])

In [14]:
y_train.shape

(665777,)

In [15]:
X_train.shape

(665777, 19)

In [16]:
# Refit both models on the full training frame, with the same max_depth=18 /
# n_estimators=100 that the cross-validation loop used.
svr2, reg2 = rf_svr(X_train, y_train, 18, 100)

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100building tree 40 of 100

building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:  1.5min


building tree 62 of 100building tree 63 of 100

building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100building tree 83 of 100

building tree 84 of 100building tree 85 of 100

building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:  2.4min finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:    0.3s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.5s finished


Prediction error: 128.24960290528995


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:    1.0s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    1.9s finished


In [17]:
# Predict with the models fitted on the full training frame in the preceding cell
# (svr2 / reg2). svr1 / reg1 are not defined in any visible cell and would raise
# NameError on a fresh kernel.
svr_predict_y_log_eva = svr2.predict(X_val)

rf_predict_y_log_eva = reg2.predict(X_val)

# expm1 inverts the log(y + 1) target transform applied inside rf_svr.
y_predict_eva = np.expm1(svr_predict_y_log_eva + rf_predict_y_log_eva)

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:    0.5s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.9s finished


In [18]:
# Dump the results into a file that follows the required Kaggle template
# newline='' is what the csv module requires of the file object; without it an extra
# blank row appears after every record on platforms that translate '\n' to '\r\n'.
with open("prediction/rf_enhanced_svr_predictions.csv", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    # Ids and predictions are paired positionally; int() truncates the prediction toward zero.
    for tweet_id, prediction in zip(eval_data['id'], y_predict_eva):
        writer.writerow([str(tweet_id), str(int(prediction))])