In [1]:
import csv
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit # pip install verstack
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold

## 1. Load data

In [9]:
# Training sample: transformed tweets, read here as the source of features and target
train_data = pd.read_csv(filepath_or_buffer="../data/train_transformed_sample.csv")
# Evaluation set: transformed tweets for which predictions are written out later
eval_data = pd.read_csv(filepath_or_buffer="../data/evaluation_transformed.csv")

In [10]:
# Pre-processing and feature-selection settings.

# Columns that are standardized (StandardScaler) before fitting.
features_need_scaled = [
    'user_statuses_count',
    'user_followers_count',
    'user_friends_count',
    'ratio_friends_followers',
    'text_length',
]

# Columns passed to the regressor; the order here is the column order of X.
features_selected = [
    'user_verified',
    'user_statuses_count',
    'user_followers_count',
    'user_friends_count',
    'ratio_friends_followers',
    'mention_exist',
    'url_exist',
    'hashtag_exist',
    'timeseg',
    'weekend',
    'day_of_week',
    'text_length',
    'sentiment_comp',
]

# Regressor hyperparameters being tuned: number of boosting stages and tree depth.
n_estimators = 10
max_depth = 18

## 2. Cross-validation on the training data

In [11]:
# cross-validation
def cv(X,y,regressor,kf):
    """Cross-validate ``regressor`` and return the mean MAE over the folds.

    For every split produced by ``kf.split(X)`` the columns listed in the
    module-level ``features_need_scaled`` are standardized with a scaler
    fitted on the training fold only, the regressor is fitted on the training
    fold, and the mean absolute error on the held-out fold is printed and
    collected.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; must contain every column in ``features_need_scaled``.
    y : pandas.Series
        Target values, positionally aligned with ``X``.
    regressor : estimator exposing ``fit`` and ``predict``
        Fitted once per fold; the object passed in is the one that is fitted.
    kf : splitter exposing ``split(X)``
        Yields ``(train_index, test_index)`` positional index arrays.

    Returns
    -------
    float
        Mean of the per-fold mean absolute errors.
    """
    results_cv=[]
    for train_index, test_index in kf.split(X):
        # Explicit copies: the scaled columns are assigned below, and writing
        # into a frame derived from X via .iloc raises SettingWithCopyWarning.
        # Copying also guarantees the caller's X is never modified.
        X_train_tmp = X.iloc[train_index,:].copy()
        X_test_tmp = X.iloc[test_index,:].copy()
        y_train_tmp, y_test_tmp = y.iloc[train_index], y.iloc[test_index]

        # Fit the scaler on the training fold only and reuse its parameters on
        # the test fold, so no statistics of the held-out data are used.
        scaler = preprocessing.StandardScaler()
        scaler.fit(X_train_tmp[features_need_scaled])
        X_train_tmp[features_need_scaled] = scaler.transform(X_train_tmp[features_need_scaled])
        X_test_tmp[features_need_scaled] = scaler.transform(X_test_tmp[features_need_scaled])

        regressor.fit(X_train_tmp, y_train_tmp)
        y_predict = regressor.predict(X_test_tmp)
        score = mean_absolute_error(y_test_tmp, y_predict)
        print('tmp score: ',score)
        results_cv.append(score)
    return np.mean(results_cv)

In [12]:
# Features and target used for cross-validation
X = train_data[features_selected]
y = train_data['retweet_count']

# 5-fold split, shuffled with a fixed seed so the folds are reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=43)

# n_estimators and max_depth are the hyperparameters tuned in this notebook
regressor = GradientBoostingRegressor(
    n_estimators=n_estimators,
    verbose=5,
    max_depth=max_depth,
    random_state=12,
)

score = cv(X, y, regressor, kf)
print('Cross validation score:', score)

      Iter       Train Loss   Remaining Time 
         1     1300997.0924            0.60s
         2     1053839.8163            0.55s
         3      853640.6506            0.47s
         4      691476.9514            0.41s
         5      560106.0803            0.34s
         6      453693.0212            0.27s
         7      367498.2690            0.20s
         8      297680.3574            0.13s
         9      241128.2727            0.07s
        10      195319.5612            0.00s
tmp score:  142.5046591976996
      Iter       Train Loss   Remaining Time 
         1     1375771.8882            0.77s
         2     1114376.4257            0.66s
         3      902646.0826            0.60s
         4      731144.4484            0.62s
         5      592227.8843            0.50s
         6      479705.3253            0.38s
         7      388562.3310            0.28s
         8      314736.0970            0.18s
         9      254936.7794            0.09s
        10      206499.

## 3. Predict on the evaluation set

In [13]:
# Use all training data to fit the final model, then predict on the evaluation set.
# .copy() makes independent frames: the scaled columns are assigned below, and
# writing into a column-selection of train_data / eval_data raises
# SettingWithCopyWarning (and the source frames must stay unscaled).
X_train = train_data[features_selected].copy()
y_train = train_data['retweet_count']
X_val = eval_data[features_selected].copy()

# Fit the scaler on the training columns only and apply the same parameters
# to the evaluation columns.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train[features_need_scaled])
X_train[features_need_scaled] = scaler.transform(X_train[features_need_scaled])
X_val[features_need_scaled] = scaler.transform(X_val[features_need_scaled])

# Same hyperparameters (n_estimators, max_depth) as in the cross-validation step
gbr = GradientBoostingRegressor(n_estimators=n_estimators,
                                verbose=5,
                                max_depth=max_depth,
                                random_state=12)

gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_val)

# Dump the results into a file that follows the required Kaggle template.
# newline='' is required by the csv module; without it extra blank rows are
# written on platforms that translate '\n' (e.g. Windows).
with open("../prediction/gbr_predictions.csv", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    # Pair ids and predictions positionally instead of an .iloc lookup per row;
    # int() truncates the prediction toward zero, as before.
    for tweet_id, prediction in zip(eval_data['id'], y_pred):
        writer.writerow([str(tweet_id), str(int(prediction))])

      Iter       Train Loss   Remaining Time 
         1     1171629.0524            0.79s
         2      949028.5435            0.73s
         3      768721.6367            0.68s
         4      622672.5273            0.56s
         5      504372.2723            0.48s
         6      408547.9873            0.38s
         7      330929.9869            0.28s
         8      268059.8849            0.19s
         9      217133.4453            0.09s
        10      175883.7233            0.00s
