In [1]:
import csv
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit # pip install verstack
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.model_selection import KFold

## 1. Load data

In [12]:
# Evaluation set: read from the transformed evaluation CSV
eval_data = pd.read_csv(filepath_or_buffer="../data/evaluation_transformed.csv")
# Training set: read from the transformed training CSV
train_data = pd.read_csv(filepath_or_buffer="../data/train_transformed.csv")

In [4]:
# Settings for pre-processing, feature selection and the regressor.

# Columns that are standardised (StandardScaler) before fitting.
features_need_scaled = [
    'user_statuses_count',
    'user_followers_count',
    'user_friends_count',
    'ratio_friends_followers',
    'text_length',
]

# Columns handed to the regressor as model inputs.
features_selected = [
    'user_verified',
    'user_statuses_count',
    'user_followers_count',
    'user_friends_count',
    'ratio_friends_followers',
    'mention_exist',
    'url_exist',
    'hashtag_exist',
    'timeseg',
    'weekend',
    'day_of_week',
    'text_length',
    'sentiment_comp',
]

# Regressor parameters to tune: number of trees and maximum tree depth.
n_estimators = 500
max_depth = 18

## 2. Cross-validation for train and test

In [13]:
# cross-validation
def cv(X, y, regressor, kf, scaled_features=None):
    """Score `regressor` with K-fold cross-validation and return the mean MAE.

    For every fold the selected columns are standardised with a scaler fitted
    on the training part only, the target is log-transformed (log(y + 1)) for
    fitting, and predictions are mapped back with exp(pred) - 1 before the
    mean absolute error is computed against the untransformed test target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; indexed positionally with ``.iloc``.
    y : pandas.Series
        Target values; indexed positionally with ``.iloc``.
    regressor : estimator with ``fit`` / ``predict``
        Re-fitted in place on every fold.
    kf : splitter with ``split(X)``
        Yields ``(train_index, test_index)`` pairs.
    scaled_features : list of str, optional
        Columns to standardise. Defaults to the notebook-level
        ``features_need_scaled`` list. An empty list disables scaling.

    Returns
    -------
    float
        Mean of the per-fold mean absolute errors. Each fold score is also
        printed.
    """
    if scaled_features is None:
        scaled_features = features_need_scaled
    scaled_features = list(scaled_features)

    results_cv = []
    for train_index, test_index in kf.split(X):
        # Take explicit copies so the column assignments below write to
        # fold-local frames: no SettingWithCopyWarning and no chance of
        # touching the caller's X.
        X_train_tmp = X.iloc[train_index, :].copy()
        X_test_tmp = X.iloc[test_index, :].copy()
        y_train_tmp, y_test_tmp = y.iloc[train_index], y.iloc[test_index]

        # Normalize some features in X_train and reuse the same parameters to
        # normalize those features in X_test (the scaler never sees test rows).
        if scaled_features:
            scaler = preprocessing.StandardScaler()
            scaler.fit(X_train_tmp[scaled_features])
            X_train_tmp[scaled_features] = scaler.transform(X_train_tmp[scaled_features])
            X_test_tmp[scaled_features] = scaler.transform(X_test_tmp[scaled_features])

        # Fit on the log-transformed target.
        y_train_tmp = np.log(y_train_tmp + 1.0)

        regressor.fit(X_train_tmp, y_train_tmp)
        y_predict = regressor.predict(X_test_tmp)
        # Undo the log transform before scoring against the raw target.
        score = mean_absolute_error(y_test_tmp, np.exp(y_predict) - 1.)
        print('tmp score: ', score)
        results_cv.append(score)
    return np.mean(results_cv)

In [6]:
# 5-fold splitter, shuffled with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=43)

# Random forest built from the tunable n_estimators / max_depth settings
regressor = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    random_state=12,
    n_jobs=10,
    verbose=5,
)

# Selected feature columns and the retweet-count target from the training set
X = train_data[features_selected]
y = train_data['retweet_count']

score = cv(X, y, regressor, kf)
print('Cross validation score:', score)


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10building tree 7 of 10

building tree 8 of 10building tree 9 of 10
building tree 10 of 10



[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:   12.7s remaining:   29.9s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:   15.1s remaining:   10.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:   15.2s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished


tmp score:  136.74621806155997


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10building tree 8 of 10

building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:   13.7s remaining:   32.0s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:   14.0s remaining:    9.3s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:   14.1s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished


tmp score:  139.96868448634658


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10building tree 5 of 10
building tree 6 of 10
building tree 7 of 10

building tree 8 of 10building tree 9 of 10

building tree 10 of 10


[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:   12.3s remaining:   28.8s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:   13.2s remaining:    8.8s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:   13.6s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished


tmp score:  133.4792775299853


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10building tree 10 of 10



[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:   12.0s remaining:   28.2s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:   13.2s remaining:    8.8s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:   13.4s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished


tmp score:  148.16628359711476


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


building tree 1 of 10building tree 2 of 10

building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10building tree 8 of 10
building tree 9 of 10
building tree 10 of 10



[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:   12.3s remaining:   28.9s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:   13.9s remaining:    9.3s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:   14.0s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished


tmp score:  128.68430820882207
Cross validation score: 137.40895437676573


## 3. Predict for evaluation

In [10]:
# Use all the training data to fit the final model.
# .copy() makes X_train / X_val independent frames, so the scaling assignments
# below do not trigger SettingWithCopyWarning or write into train_data / eval_data.
X_train = train_data[features_selected].copy()
y_train = train_data['retweet_count']
X_val = eval_data[features_selected].copy()

# Normalize some features in X_train and reuse the same parameters to normalize
# those features in X_val.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train[features_need_scaled])
X_train[features_need_scaled] = scaler.transform(X_train[features_need_scaled])
X_val[features_need_scaled] = scaler.transform(X_val[features_need_scaled])

# Same configuration as the cross-validated regressor (n_estimators and
# max_depth are the parameters to tune).
reg = RandomForestRegressor(n_estimators=n_estimators,
                            n_jobs=10,
                            verbose=5,
                            max_depth=max_depth,
                            random_state=12
                            )

# Fit on log(y + 1) and map the predictions back with exp(pred) - 1.
y_train = np.log(y_train + 1.)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_val)
y_pred = np.exp(y_pred) - 1.0

# Dump the results into a file that follows the required Kaggle template.
# newline='' is required by the csv module; without it extra blank rows are
# written on platforms that translate '\n' to '\r\n'.
with open("../prediction/rf_log_comp_noCount_predictions.csv", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    # Pair each evaluation id with its prediction positionally; predictions
    # are truncated to integers as before.
    for tweet_id, prediction in zip(eval_data['id'], y_pred):
        writer.writerow([str(tweet_id), str(int(prediction))])

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10building tree 5 of 10

building tree 6 of 10
building tree 7 of 10
building tree 8 of 10building tree 9 of 10building tree 10 of 10




[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:   19.4s remaining:   45.3s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:   19.6s remaining:   13.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:   20.1s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:    0.1s remaining:    0.3s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.1s finished
