In [1]:
import csv
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit # pip install verstack
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold

## 1. Load data

In [3]:
# Read the pre-transformed training and evaluation tables
# (paths are relative to the notebook's working directory).
train_data, eval_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/train_transformed.csv",
        "../data/evaluation_transformed.csv",
    )
)

In [4]:
# --- Pre-processing / feature-selection settings ---

# Columns that are standardised before being fed to the regressor.
features_need_scaled = [
    'user_statuses_count',
    'user_followers_count',
    'user_friends_count',
    'ratio_friends_followers',
    'text_length',
]

# Columns passed to the regressor as input features.
features_selected = [
    'user_verified',
    'user_statuses_count',
    'user_followers_count',
    'user_friends_count',
    'ratio_friends_followers',
    'mention_exist',
    'url_exist',
    'hashtag_exist',
    'timeseg',
    'weekend',
    'day_of_week',
    'text_length',
    'sentiment_comp',
]

# --- Regressor hyperparameters being tuned ---
n_estimators = 100  # number of boosting stages
max_depth = 18      # maximum depth of each individual tree

## 2. Cross-validation for train and test

In [5]:
# cross-validation
def cv(X, y, regressor, kf):
    """Cross-validate ``regressor`` and return the mean of the per-fold MAE.

    For each split produced by ``kf``, the columns named in the module-level
    ``features_need_scaled`` list are standardised with a ``StandardScaler``
    fitted on the training part of the fold only; the same fitted scaler is
    then applied to the held-out part. The regressor is fitted on the
    training part and scored on the held-out part with mean absolute error.

    Parameters
    ----------
    X : feature table supporting ``.iloc`` row selection and column-label
        assignment (e.g. a pandas DataFrame) that contains every column in
        ``features_need_scaled``.
    y : target values supporting ``.iloc`` (e.g. a pandas Series).
    regressor : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
        re-fitted on every fold.
    kf : splitter exposing ``split(X)`` that yields
        ``(train_index, test_index)`` pairs.

    Returns
    -------
    The mean (``np.mean``) of the per-fold mean absolute errors. Each fold's
    score is also printed as it is computed.
    """
    fold_scores = []
    for train_index, test_index in kf.split(X):
        # Take explicit copies: the scaled columns are assigned back into the
        # fold frames below, and writing into a selection derived from X
        # triggers pandas' SettingWithCopyWarning and relies on copy/view
        # semantics. With .copy() the caller's X is guaranteed untouched.
        X_train_fold = X.iloc[train_index, :].copy()
        X_test_fold = X.iloc[test_index, :].copy()
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

        # Fit the scaler on the training part only and reuse its parameters
        # for the held-out part, so no held-out statistics leak into training.
        scaler = preprocessing.StandardScaler()
        X_train_fold[features_need_scaled] = scaler.fit_transform(
            X_train_fold[features_need_scaled]
        )
        X_test_fold[features_need_scaled] = scaler.transform(
            X_test_fold[features_need_scaled]
        )

        regressor.fit(X_train_fold, y_train_fold)
        y_predict = regressor.predict(X_test_fold)
        score = mean_absolute_error(y_test_fold, y_predict)
        print('tmp score: ', score)
        fold_scores.append(score)
    return np.mean(fold_scores)

In [None]:
# Feature matrix and target used for cross-validation.
X = train_data[features_selected]
y = train_data['retweet_count']

# Five shuffled folds; the fixed seed keeps the split reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=43)

# n_estimators and max_depth are the two hyperparameters being tuned.
regressor = GradientBoostingRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    random_state=12,
    verbose=5,
)

score = cv(X, y, regressor, kf)
print('Cross validation score:', score)

      Iter       Train Loss   Remaining Time 
         1     8129645.7748           19.66m
         2     6927385.5224           19.51m
         3     5988160.1579           19.72m
         4     5183901.6940           20.23m
         5     4481595.0423           19.96m
         6     3930670.7326           19.75m


## 3. Predict for evaluation

In [22]:
# Use all training data to fit the final model.
# .copy() yields independent frames, so the scaled columns assigned below are
# not written into a selection derived from train_data / eval_data (which
# would raise pandas' SettingWithCopyWarning and depend on copy/view semantics).
X_train = train_data[features_selected].copy()
y_train = train_data['retweet_count']
X_val = eval_data[features_selected].copy()

# Standardise the selected columns: the scaler is fitted on the training rows
# only and the same parameters are reused for the evaluation rows.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train[features_need_scaled])
X_train[features_need_scaled] = scaler.transform(X_train[features_need_scaled])
X_val[features_need_scaled] = scaler.transform(X_val[features_need_scaled])

# n_estimators and max_depth are the two hyperparameters being tuned.
gbr = GradientBoostingRegressor(n_estimators=n_estimators,
                                verbose=5,
                                max_depth=max_depth,
                                random_state=12)

gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_val)

# Dump the results into a file that follows the required Kaggle template.
# newline='' is required by the csv module: the writer emits its own line
# terminators, and without it extra blank rows appear on Windows.
with open("../prediction/gbr_predictions.csv", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    # Pair each evaluation id with its prediction positionally; int() truncates
    # the predicted value toward zero.
    for tweet_id, prediction in zip(eval_data['id'], y_pred):
        writer.writerow([str(tweet_id), str(int(prediction))])

      Iter       Train Loss   Remaining Time 
         1     7465057.4044            1.97m
         2     6330405.2560            1.63m
         3     5404586.9504            1.42m
         4     4615048.3612            1.18m
         5     4012546.4749           57.59s
         6     3488148.5188           45.39s
         7     3054434.7075           34.24s
         8     2682124.0347           23.41s
         9     2372040.4132           11.94s
        10     2097931.6903            0.00s
