In [1]:
import csv
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit # pip install verstack

## Load data

In [2]:
# Read the training CSV into a DataFrame
train_data = pd.read_csv(filepath_or_buffer="../data/train_transformed.csv")

In [3]:
# Preview the first five rows of the training frame
train_data.head(n=5)

Unnamed: 0,id,user_verified,user_statuses_count,user_followers_count,user_friends_count,ratio_friends_followers,mention_exist,mention_count,url_exist,url_count,...,tf_idf_0,tf_idf_1,tf_idf_2,tf_idf_3,tf_idf_4,tf_idf_5,tf_idf_6,tf_idf_7,tf_idf_8,tf_idf_9
0,0,0,68460,1101,1226,1.112523,0,0,0,0,...,-1.606137e-15,-1.992248e-13,-6.077335e-13,-1.726278e-12,7.402787e-13,2.37598e-14,2.692723e-13,5.657295e-13,-2.124336e-14,9.444088e-14
1,1,0,309,51,202,3.884615,0,0,0,0,...,0.01206801,0.002715082,0.01318767,0.03157979,0.02662453,0.01621955,-0.02204838,0.00931604,0.00328826,0.07966161
2,2,0,3241,1675,2325,1.387232,0,0,0,0,...,0.05095493,-0.001131896,0.03852378,0.116344,0.15423,0.2938599,0.4018045,0.190963,0.3218782,-0.04933187
3,3,0,32327,667,304,0.45509,0,0,0,0,...,0.008873053,0.001403727,0.003399734,0.01401861,0.009761796,0.009215406,0.006693422,0.008483338,-0.002502589,0.01261141
4,4,0,581,42,127,2.953488,0,0,0,0,...,0.02774364,-0.001756079,0.00824523,0.03155745,0.0138588,0.005119571,0.003617241,0.01908667,-0.01212654,0.01883319


## Gradient Boosting Regressor


In [4]:
def gbr_predict(X_train, y_train, X_test):
    """Fit a default GradientBoostingRegressor and predict on X_test.

    Parameters
    ----------
    X_train : features used to fit the model.
    y_train : targets used to fit the model.
    X_test : features to generate predictions for.

    Returns
    -------
    The output of the fitted model's ``predict`` on ``X_test``.
    """
    model = GradientBoostingRegressor()
    # fit() returns the estimator itself, so the calls can be chained
    predictions = model.fit(X_train, y_train).predict(X_test)
    return predictions

###  1) Cross validation

In [21]:
from sklearn.model_selection import KFold

In [67]:
# Columns that are passed through StandardScaler before model fitting
features_need_scaled = [
    'user_statuses_count',
    'user_followers_count',
    'user_friends_count',
    'ratio_friends_followers',
    'mention_count',
    'url_count',
    'hashtag_count',
    'text_length',
]

def cv(X, y, regressor, kf, features_to_scale=None):
    """Cross-validate ``regressor`` and return its mean absolute error.

    For every split produced by ``kf``, a StandardScaler is fitted on the
    training fold only and then applied to both folds, so no statistics from
    the held-out fold leak into the scaling. The regressor is then fitted on
    the training fold and scored on the held-out fold.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; indexed positionally with ``.iloc``.
    y : pandas.Series
        Target values; indexed positionally with ``.iloc``.
    regressor : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Refitted in place on every fold, so after the call it holds the
        model fitted on the last training fold.
    kf : splitter exposing ``split(X)``
        Yields ``(train_index, test_index)`` pairs.
    features_to_scale : list of str, optional
        Columns of ``X`` to standardise. Defaults to the module-level
        ``features_need_scaled`` list. An empty list disables scaling.

    Returns
    -------
    float
        Mean of the per-fold mean absolute errors.
    """
    if features_to_scale is None:
        features_to_scale = features_need_scaled
    features_to_scale = list(features_to_scale)

    fold_scores = []
    for train_index, test_index in kf.split(X):
        # Explicit copies: the fold frames are written to below, and assigning
        # into a bare .iloc selection raises pandas' SettingWithCopyWarning.
        X_train_fold = X.iloc[train_index, :].copy()
        X_test_fold = X.iloc[test_index, :].copy()
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

        if features_to_scale:
            # Fit on the training fold only, then reuse the same statistics
            # for the held-out fold.
            scaler = preprocessing.StandardScaler()
            X_train_fold[features_to_scale] = scaler.fit_transform(X_train_fold[features_to_scale])
            X_test_fold[features_to_scale] = scaler.transform(X_test_fold[features_to_scale])

        regressor.fit(X_train_fold, y_train_fold)
        y_predict = regressor.predict(X_test_fold)
        score = mean_absolute_error(y_test_fold, y_predict)
        print('tmp score: ', score)
        fold_scores.append(score)
    return np.mean(fold_scores)

In [None]:
# Shuffled 5-fold splitter with a fixed seed, and the model to evaluate
kf = KFold(n_splits=5, shuffle=True, random_state=43)
regressor = GradientBoostingRegressor()

# Columns used as model inputs
features_selected = [
    'user_verified',
    'user_statuses_count',
    'user_followers_count',
    'user_friends_count',
    'ratio_friends_followers',
    'mention_exist',
    'mention_count',
    'url_exist',
    'url_count',
    'hashtag_exist',
    'hashtag_count',
    'timeseg',
    'weekend',
    'day_of_week',
    'text_length',
    'sentiment_pos',
    'sentiment_neg',
    'sentiment_neu',
]
X = train_data[features_selected]
y = train_data['retweet_count']

# Run the cross-validation and report the averaged fold score
score = cv(X, y, regressor, kf)
print('Cross validation score:', score)

tmp score:  143.05085732532754


### 2) Train on whole data set

In [29]:
# Use all the data to train the model.
# .copy() yields independent frames: the scaled columns are assigned into them
# below, and writing into a bare column selection raises pandas'
# SettingWithCopyWarning.
X_train = train_data[features_selected].copy()
y_train = train_data['retweet_count']
eval_data = pd.read_csv("../data/evaluation_transformed.csv")
X_val = eval_data[features_selected].copy()

# Fit the scaler on the training columns only, then apply the same
# transformation to the evaluation set.
scaler = preprocessing.StandardScaler()
X_train[features_need_scaled] = scaler.fit_transform(X_train[features_need_scaled])
X_val[features_need_scaled] = scaler.transform(X_val[features_need_scaled])


y_pred = gbr_predict(X_train, y_train, X_val)

# Dump the results into a file that follows the required Kaggle template
# Create the output directory up front so the write cannot fail after training.
os.makedirs("prediction", exist_ok=True)
# newline='' hands line-ending control to the csv module; without it, blank
# rows appear between records on platforms that translate '\n'.
with open("prediction/gbr_predictions.csv", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    # Pair each evaluation id with its prediction; int() truncates the
    # regressor output toward zero.
    writer.writerows(
        [str(tweet_id), str(int(prediction))]
        for tweet_id, prediction in zip(eval_data['id'], y_pred)
    )