In [1]:
import csv
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit # pip install verstack
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold

## 1. Load data

In [3]:
# Load the training data into a DataFrame. The path is relative to the
# notebook's working directory; judging by the file name this is an already
# pre-processed ("transformed") version of the raw data -- confirm upstream.
train_data = pd.read_csv("../data/train_transformed.csv")
# Load the evaluation data (the set predictions are later produced for),
# read from the same relative data directory.
eval_data = pd.read_csv("../data/evaluation_transformed.csv")

In [8]:
# --- Settings for pre-processing, feature selection and the regressor ---

# Columns that need to be scaled before modelling.
features_need_scaled = [
    'user_statuses_count',
    'user_followers_count',
    'user_friends_count',
    'ratio_friends_followers',
    'text_length',
]

# Columns that are handed to the regressor as input features.
features_selected = [
    'user_verified',
    'user_statuses_count',
    'user_followers_count',
    'user_friends_count',
    'ratio_friends_followers',
    'mention_exist',
    'url_exist',
    'hashtag_exist',
    'timeseg',
    'weekend',
    'day_of_week',
    'text_length',
    'sentiment_comp',
]

# Regressor hyperparameters that are being tuned.
n_estimators = 100
max_depth = 18

## 2. Cross-validation on the training data

In [6]:
# cross-validation
def cv(X, y, regressor, kf, features_to_scale=None):
    """Return the mean absolute error of ``regressor`` averaged over the folds of ``kf``.

    For every (train, test) split produced by ``kf.split(X)``:

    1. a ``StandardScaler`` is fitted on the training fold only and applied to
       the selected columns of both the training and the test fold;
    2. the regressor is fitted on the ``log(1 + y)`` transformed training target;
    3. predictions are mapped back with ``exp(p) - 1`` and scored against the
       untransformed test target with ``mean_absolute_error``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (indexed positionally with ``.iloc``).
    y : pandas.Series
        Target values aligned with ``X``; must be > -1 for the log transform.
    regressor : estimator with ``fit`` / ``predict``
        The same object is fitted once per fold.
    kf : splitter with a ``split(X)`` method, e.g. ``KFold``
    features_to_scale : list of str, optional
        Columns to standardise. Defaults to the notebook-level
        ``features_need_scaled`` list; an empty list disables scaling.

    Returns
    -------
    float
        Mean of the per-fold MAE scores (each fold score is also printed).
    """
    if features_to_scale is None:
        features_to_scale = features_need_scaled
    scale_cols = list(features_to_scale)

    fold_scores = []
    for train_index, test_index in kf.split(X):
        # Take explicit copies: writing the scaled columns into a bare
        # ``.iloc`` selection triggers pandas' SettingWithCopyWarning.
        X_train_tmp = X.iloc[train_index, :].copy()
        X_test_tmp = X.iloc[test_index, :].copy()
        y_train_tmp, y_test_tmp = y.iloc[train_index], y.iloc[test_index]

        if scale_cols:
            # Fit the scaler on the training fold only, then reuse its
            # parameters on the test fold so no test statistics leak in.
            scaler = preprocessing.StandardScaler()
            X_train_tmp[scale_cols] = scaler.fit_transform(X_train_tmp[scale_cols])
            X_test_tmp[scale_cols] = scaler.transform(X_test_tmp[scale_cols])

        # Train on log(1 + y); log1p/expm1 are the numerically accurate forms
        # of log(y + 1) and exp(p) - 1.
        regressor.fit(X_train_tmp, np.log1p(y_train_tmp))
        y_predict = np.expm1(regressor.predict(X_test_tmp))

        score = mean_absolute_error(y_test_tmp, y_predict)
        print('tmp score: ', score)
        fold_scores.append(score)
    return np.mean(fold_scores)

In [9]:
# Inputs for cross-validation: the selected feature columns and the raw target.
X = train_data[features_selected]
y = train_data['retweet_count']

# 5 shuffled folds; random_state is fixed for the splitter and for the model.
kf = KFold(n_splits=5, shuffle=True, random_state=43)

# n_estimators and max_depth are the hyperparameters tuned in the settings cell.
regressor = GradientBoostingRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    random_state=12,
    verbose=5,
)

score = cv(X, y, regressor, kf)
print('Cross validation score:', score)

      Iter       Train Loss   Remaining Time 
         1           2.4837            1.45m
         2           2.1064            1.35m
         3           1.7937            1.11m
         4           1.5350           55.82s
         5           1.3205           46.98s
         6           1.1426           37.53s
         7           0.9945           27.75s
         8           0.8710           18.24s
         9           0.7678            9.05s
        10           0.6816            0.00s
tmp score:  141.75663472848188
      Iter       Train Loss   Remaining Time 
         1           2.4804            1.29m
         2           2.1007            1.19m
         3           1.7880            1.06m
         4           1.5292           54.24s
         5           1.3148           46.07s
         6           1.1356           36.57s
         7           0.9899           27.98s
         8           0.8675           18.49s
         9           0.7644            9.13s
        10           0

## 3. Predict on the evaluation set

In [11]:
# Use all of the training data to fit the final model.
# ``.copy()`` gives independent frames, so writing the scaled columns below
# neither raises SettingWithCopyWarning nor touches train_data / eval_data.
X_train = train_data[features_selected].copy()
X_val = eval_data[features_selected].copy()

# The model is trained on log(1 + retweet_count); log1p is the numerically
# accurate form of log(y + 1).
y_train = np.log1p(train_data['retweet_count'])

# Standardise the selected columns: fit the scaler on the training data only
# and reuse the same parameters for the evaluation data.
scaler = preprocessing.StandardScaler()
X_train[features_need_scaled] = scaler.fit_transform(X_train[features_need_scaled])
X_val[features_need_scaled] = scaler.transform(X_val[features_need_scaled])

# Same hyperparameters (n_estimators, max_depth) as used for cross-validation.
log_gbr = GradientBoostingRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    random_state=12,
    verbose=5,
)
log_gbr.fit(X_train, y_train)

# Undo the log transform: expm1(p) == exp(p) - 1.
y_pred = np.expm1(log_gbr.predict(X_val))

# Dump the results into a file that follows the required Kaggle template.
# newline='' is required by the csv module; without it every row is followed
# by a blank line on Windows.
with open("../prediction/log_gbr_predictions.csv", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    # int() truncates the prediction towards zero, as in the original export.
    writer.writerows(
        (str(tweet_id), str(int(prediction)))
        for tweet_id, prediction in zip(eval_data['id'], y_pred)
    )

      Iter       Train Loss   Remaining Time 
         1           2.4848            1.88m
         2           2.1145            1.63m
         3           1.8076            1.46m
         4           1.5540            1.23m
         5           1.3439           59.09s
         6           1.1697           46.50s
         7           1.0244           34.10s
         8           0.9028           22.43s
         9           0.8004           11.09s
        10           0.7154            0.00s
