In [8]:
import csv
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit # pip install verstack
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt
from sklearn.kernel_approximation import Nystroem
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

## 1. Load data

In [9]:
# Read the training set and the evaluation set (in that order) from ../data
train_data, eva_data = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train_transformed.csv",
                     "../data/evaluation_transformed.csv")
)

In [10]:
# Configuration for preprocessing, feature selection and the regressor.

# Columns that are standardized (StandardScaler) before model fitting
features_need_scaled = [
    'user_statuses_count',
    'user_followers_count',
    'user_friends_count',
    'ratio_friends_followers',
    'text_length',
]

# Columns fed to the regressors, in the order the model sees them
features_selected = [
    'user_verified',
    'user_statuses_count',
    'user_followers_count',
    'user_friends_count',
    'ratio_friends_followers',
    'mention_exist',
    'url_exist',
    'hashtag_exist',
    'timeseg',
    'weekend',
    'day_of_week',
    'text_length',
    'sentiment_comp',
]

# Random-forest hyperparameters to tune: number of trees and maximum tree depth
n_estimators = 100
max_depth = 18

In [11]:
# Two-stage model: a linear SVR on the log-transformed target, followed by a
# random forest that learns the SVR's residuals (still in log space).
def svr_rf(X_train, y_train, max_depth, n_estimators,
           random_state=7, n_jobs=10, verbose=5):
    """Fit a LinearSVR and a residual RandomForestRegressor on log1p(y_train).

    Parameters
    ----------
    X_train : feature matrix accepted by scikit-learn estimators.
    y_train : target values; ``log1p`` is applied, so values must be > -1
        (presumably non-negative retweet counts -- confirm against the data).
    max_depth : maximum depth of each tree in the random forest.
    n_estimators : number of trees in the random forest.
    random_state : seed used for BOTH estimators. The original code left
        LinearSVR unseeded, which made results differ between runs.
    n_jobs : number of parallel jobs for the random forest.
    verbose : verbosity level of the random forest.

    Returns
    -------
    (svr, reg) : the fitted LinearSVR and the fitted RandomForestRegressor.
        Both predict in log space; a final prediction is
        ``np.expm1(svr.predict(X) + reg.predict(X))``.
    """
    # Work in log space; log1p(y) == log(y + 1) but is accurate near zero.
    y_train_log = np.log1p(y_train)

    # Stage 1: linear SVR on the log target (seeded for reproducibility).
    svr = LinearSVR(random_state=random_state)
    svr.fit(X_train, y_train_log)
    svr_y_train_predict = svr.predict(X_train)  # ndarray of log-space predictions

    # Stage 2: random forest trained on what the SVR failed to explain.
    rf_y_train = y_train_log - svr_y_train_predict

    reg = RandomForestRegressor(max_depth=max_depth,
                                n_estimators=n_estimators,
                                random_state=random_state,
                                n_jobs=n_jobs,
                                verbose=verbose)
    reg.fit(X_train, rf_y_train)
    return svr, reg

## 2. Cross-validation for train and test

In [None]:
# 5-fold cross-validation of the SVR + residual-RF model, scored with MAE
# on the original (non-log) target scale.
X_train = train_data[features_selected]
y_train = train_data['retweet_count']

kf = KFold(n_splits = 5, shuffle = True, random_state = 43)
results_cv = []

for train_index, test_index in kf.split(X_train):
    # kf.split yields POSITIONAL indices, so select with .iloc on both X and y
    # (plain y_train[idx] is label-based and only works on a default RangeIndex).
    # .copy() keeps column names/dtypes and lets us overwrite columns safely.
    X_train_tmp = X_train.iloc[train_index].copy()
    X_test_tmp = X_train.iloc[test_index].copy()
    y_train_tmp = y_train.iloc[train_index]
    y_test_tmp = y_train.iloc[test_index]

    # Fit the scaler on the training fold only, then apply the same
    # parameters to the held-out fold to avoid leaking test statistics.
    scaler = preprocessing.StandardScaler()
    X_train_tmp[features_need_scaled] = scaler.fit_transform(X_train_tmp[features_need_scaled])
    X_test_tmp[features_need_scaled] = scaler.transform(X_test_tmp[features_need_scaled])

    # train
    svr, reg = svr_rf(X_train_tmp, y_train_tmp, max_depth, n_estimators)

    # predict: both models output in log space; sum them, then invert log1p
    svr_predict_y_log = svr.predict(X_test_tmp)
    rf_predict_y_log = reg.predict(X_test_tmp)
    y_predict = np.expm1(svr_predict_y_log + rf_predict_y_log)

    # score this fold once and reuse the value for printing and averaging
    fold_mae = mean_absolute_error(y_test_tmp, y_predict)
    print(fold_mae)
    results_cv.append(fold_mae)

score = np.mean(results_cv)
print('Cross validation score:',score)

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


building tree 1 of 10building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10

building tree 7 of 10building tree 8 of 10
building tree 9 of 10
building tree 10 of 10



[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:   13.6s remaining:   31.8s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:   14.6s remaining:    9.7s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:   14.8s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished


135.13848965611126


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


building tree 1 of 10building tree 2 of 10

building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10building tree 7 of 10

building tree 8 of 10
building tree 9 of 10building tree 10 of 10



[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:   14.4s remaining:   33.7s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:   14.4s remaining:    9.6s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:   14.5s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished


140.1738258097865


## 3. Predict for evaluation

In [None]:
# Use all training data to fit the final model, then predict the evaluation set.
# .copy() gives independent frames so the scaled columns can be assigned
# without SettingWithCopyWarning and without touching train_data / eva_data.
X_train = train_data[features_selected].copy()
y_train = train_data['retweet_count']
X_val = eva_data[features_selected].copy()

# Normalize the selected features in X_train and reuse the same fitted
# parameters to normalize those features in X_val.
scaler = preprocessing.StandardScaler()
X_train[features_need_scaled] = scaler.fit_transform(X_train[features_need_scaled])
X_val[features_need_scaled] = scaler.transform(X_val[features_need_scaled])

svr2, reg2 = svr_rf(X_train, y_train, max_depth, n_estimators)

# Both models predict in log space; sum them and invert the log1p transform.
svr_predict_y_log_eva = svr2.predict(X_val)
rf_predict_y_log_eva = reg2.predict(X_val)
y_predict_eva = np.expm1(svr_predict_y_log_eva + rf_predict_y_log_eva)

# Dump the results into a file that follows the required Kaggle template.
# newline='' is required by the csv module; without it Windows output gets
# an extra blank line after every row.
with open("../prediction/svr_rf_predictions.csv", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    # int() truncates the prediction toward zero, as in the original output format
    writer.writerows(
        (str(tweet_id), str(int(prediction)))
        for tweet_id, prediction in zip(eva_data['id'], y_predict_eva)
    )