In [1]:
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, root_mean_squared_log_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import root_mean_squared_log_error
import pickle

import numpy as np
import xgboost as xgb
from typing import Tuple

In [2]:
# df_pinned_scaled = pd.read_csv('data/data_pin_scaled.csv', index_col=0)
df_scaled = pd.read_csv('data/data_scaled.csv', index_col=0)

In [3]:
# df_pinned_scaled = pd.get_dummies(df_pinned_scaled, columns=['genre1'], drop_first=True)
df_scaled = pd.get_dummies(df_scaled, columns=['genre1'], drop_first=True)


In [4]:
prevars = ['sentiment_prob_pos', 'sentiment_prob_neg', 'lexdiv_cttr', 'num_punct',
           'num_sentences', 'SMOG_readability', 'second_person', 'user_follower',
           'cosine_1', 'hours_since_article', 'votes_pos_mean', 'votes_neg_mean',
           'article_comments', 'is_root_comment', 'level_in_tree'] + [x for x in df_scaled.columns if 'genre1' in x]
pinned_var = ['pinned_f']
engagevars = ['is_leaf_comment', 'size_of_tree', 'height_of_tree', 'all_replies']
votes_vars = ['votes_neg_log', 'votes_pos_log']

pinned_varset = [prevars, prevars+engagevars, prevars+engagevars+votes_vars]
votes_pos_varset = [prevars, prevars+pinned_var, prevars+pinned_var+engagevars,
                    prevars+pinned_var+engagevars+votes_vars[:1]]
votes_neg_varset = [prevars, prevars+pinned_var, prevars+pinned_var+engagevars,
                    prevars+pinned_var+engagevars+votes_vars[1:]]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df_scaled[pinned_varset[2]], df_scaled['pinned_f'], test_size=0.2, random_state=42)

xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")


Accuracy: 0.9994141574462251
Precision: 0.9513466550825369
Recall: 0.9279661016949152
F1: 0.9395109395109396


In [9]:
#  pickle models
with open('models/xgb_pinned_models.pkl', 'rb') as f:
    models = pickle.load(f)
print(len(models))

3


In [None]:
# models = []

for exvars in pinned_varset[len(models):]:

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(df_scaled[exvars],
                                                        df_scaled['pinned_f'],
                                                        test_size=0.2, random_state=42)

    # cross validated random parameter search
    xgb_model = xgb.XGBClassifier()
    params = {
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
        'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}
    rs = RandomizedSearchCV(xgb_model, params, n_iter=20, n_jobs=-1, cv=5, verbose=10)
    rs.fit(X_train, y_train)

    # do grid search around best parameters from random search
    best_params = rs.best_params_
    print(f"Best parameters: {best_params}")
    del rs
    params = {
        'max_depth': [best_params['max_depth']-1, best_params['max_depth'], best_params['max_depth']+1],
        'learning_rate': [best_params['learning_rate']*0.8, best_params['learning_rate'], best_params['learning_rate']*1.2],
        'n_estimators': [best_params['n_estimators']-100, best_params['n_estimators'], best_params['n_estimators']+100]}
    gs = GridSearchCV(xgb_model, params, n_jobs=-1, cv=5, verbose=10)
    gs.fit(X_train, y_train)

    # predict
    y_pred = gs.predict(X_test)

    # save model
    models.append(gs)

    # evaluate
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1: {f1}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 2/5; 1/20] START learning_rate=0.05, max_depth=5, n_estimators=900..........
[CV 1/5; 1/20] START learning_rate=0.05, max_depth=5, n_estimators=900..........
[CV 3/5; 1/20] START learning_rate=0.05, max_depth=5, n_estimators=900..........
[CV 4/5; 1/20] START learning_rate=0.05, max_depth=5, n_estimators=900..........
[CV 5/5; 1/20] START learning_rate=0.05, max_depth=5, n_estimators=900..........
[CV 1/5; 2/20] START learning_rate=0.2, max_depth=5, n_estimators=1000..........
[CV 2/5; 2/20] START learning_rate=0.2, max_depth=5, n_estimators=1000..........
[CV 3/5; 2/20] START learning_rate=0.2, max_depth=5, n_estimators=1000..........
[CV 4/5; 2/20] START learning_rate=0.2, max_depth=5, n_estimators=1000..........
[CV 1/5; 3/20] START learning_rate=0.01, max_depth=5, n_estimators=600..........
[CV 5/5; 2/20] START learning_rate=0.2, max_depth=5, n_estimators=1000..........
[CV 2/5; 3/20] START learning_rate=0.01, max_de



[CV 1/5; 7/20] END learning_rate=0.05, max_depth=7, n_estimators=500;, score=0.999 total time= 2.3min
[CV 1/5; 11/20] START learning_rate=0.3, max_depth=3, n_estimators=500..........
[CV 2/5; 11/20] START learning_rate=0.3, max_depth=3, n_estimators=500..........
[CV 2/5; 7/20] END learning_rate=0.05, max_depth=7, n_estimators=500;, score=0.999 total time= 2.4min
[CV 3/5; 11/20] START learning_rate=0.3, max_depth=3, n_estimators=500..........
[CV 3/5; 7/20] END learning_rate=0.05, max_depth=7, n_estimators=500;, score=0.999 total time= 2.4min
[CV 4/5; 11/20] START learning_rate=0.3, max_depth=3, n_estimators=500..........
[CV 5/5; 9/20] END learning_rate=0.01, max_depth=4, n_estimators=300;, score=0.997 total time= 1.1min
[CV 5/5; 11/20] START learning_rate=0.3, max_depth=3, n_estimators=500..........
[CV 4/5; 7/20] END learning_rate=0.05, max_depth=7, n_estimators=500;, score=0.999 total time= 2.3min
[CV 1/5; 12/20] START learning_rate=0.1, max_depth=9, n_estimators=200..........
[CV 

PicklingError: Could not pickle the task to send it to the workers.

In [6]:
exvars = pinned_varset[2]

# train test split
X_train, X_test, y_train, y_test = train_test_split(df_scaled[exvars],
                                                    df_scaled['pinned_f'],
                                                    test_size=0.2, random_state=42)

# cross validated random parameter search
xgb_model = xgb.XGBClassifier()

best_params = {'n_estimators': 1000, 'max_depth': 5, 'learning_rate': 0.2}

params = {
    'max_depth': [best_params['max_depth']-1, best_params['max_depth'], best_params['max_depth']+1],
    'learning_rate': [best_params['learning_rate']*0.8, best_params['learning_rate'], best_params['learning_rate']*1.2],
    'n_estimators': [best_params['n_estimators']-100, best_params['n_estimators'], best_params['n_estimators']+100]}
gs = GridSearchCV(xgb_model, params, n_jobs=-1, cv=5, verbose=10)
gs.fit(X_train, y_train)

# predict
y_pred = gs.predict(X_test)

# save model
models.append(gs)

# evaluate
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 4/5; 1/27] START learning_rate=0.16000000000000003, max_depth=4, n_estimators=900
[CV 5/5; 1/27] START learning_rate=0.16000000000000003, max_depth=4, n_estimators=900
[CV 2/5; 1/27] START learning_rate=0.16000000000000003, max_depth=4, n_estimators=900
[CV 1/5; 2/27] START learning_rate=0.16000000000000003, max_depth=4, n_estimators=1000
[CV 2/5; 2/27] START learning_rate=0.16000000000000003, max_depth=4, n_estimators=1000
[CV 3/5; 1/27] START learning_rate=0.16000000000000003, max_depth=4, n_estimators=900
[CV 1/5; 1/27] START learning_rate=0.16000000000000003, max_depth=4, n_estimators=900
[CV 3/5; 2/27] START learning_rate=0.16000000000000003, max_depth=4, n_estimators=1000
[CV 4/5; 2/27] START learning_rate=0.16000000000000003, max_depth=4, n_estimators=1000
[CV 5/5; 2/27] START learning_rate=0.16000000000000003, max_depth=4, n_estimators=1000
[CV 1/5; 3/27] START learning_rate=0.16000000000000003, max_depth=4, n_es



[CV 5/5; 12/27] START learning_rate=0.2, max_depth=4, n_estimators=1100.........
[CV 4/5; 10/27] END learning_rate=0.2, max_depth=4, n_estimators=900;, score=1.000 total time= 3.0min
[CV 1/5; 13/27] START learning_rate=0.2, max_depth=5, n_estimators=900..........
[CV 5/5; 10/27] END learning_rate=0.2, max_depth=4, n_estimators=900;, score=0.999 total time= 3.0min
[CV 2/5; 13/27] START learning_rate=0.2, max_depth=5, n_estimators=900..........
[CV 1/5; 11/27] END learning_rate=0.2, max_depth=4, n_estimators=1000;, score=0.999 total time= 3.3min
[CV 3/5; 13/27] START learning_rate=0.2, max_depth=5, n_estimators=900..........
[CV 2/5; 11/27] END learning_rate=0.2, max_depth=4, n_estimators=1000;, score=1.000 total time= 3.3min
[CV 4/5; 13/27] START learning_rate=0.2, max_depth=5, n_estimators=900..........
[CV 3/5; 11/27] END learning_rate=0.2, max_depth=4, n_estimators=1000;, score=1.000 total time= 3.3min
[CV 5/5; 13/27] START learning_rate=0.2, max_depth=5, n_estimators=900..........
[

In [8]:
#  pickle models
with open('models/xgb_pinned_models.pkl', 'wb') as f:
    pickle.dump(models, f)

In [17]:
# setup RMSLE loss

def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Compute the gradient squared log error.'''
    y = dtrain.get_label()
    return (np.log1p(predt) - np.log1p(y)) / (predt + 1)

def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Compute the hessian for squared log error.'''
    y = dtrain.get_label()
    return ((-np.log1p(predt) + np.log1p(y) + 1) /
            np.power(predt + 1, 2))

def squared_log(predt: np.ndarray,
                dtrain: xgb.DMatrix) -> Tuple[np.ndarray, np.ndarray]:
    '''Squared Log Error objective. A simplified version for RMSLE used as
    objective function.
    '''
    predt[predt < -1] = -1 + 1e-6
    grad = gradient(predt, dtrain)
    hess = hessian(predt, dtrain)
    return grad, hess

def rmsle(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    ''' Root mean squared log error metric.'''
    y = dtrain.get_label()
    predt[predt < -1] = -1 + 1e-6
    elements = np.power(np.log1p(y) - np.log1p(predt), 2)
    return 'PyRMSLE', float(np.sqrt(np.sum(elements) / len(y)))

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df_scaled[votes_pos_varset[-1]], df_scaled['votes_pos'], test_size=0.2, random_state=42)

xgb_model = xgb.XGBRegressor(obj=squared_log, custom_metric=rmsle, eval_metric=rmsle, disable_default_eval_metric=1)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

err = root_mean_squared_log_error(y_test, np.clip(y_pred, 0, None))
print(f"Votes Pos RMSLE: {err}")

X_train, X_test, y_train, y_test = train_test_split(df_scaled[votes_neg_varset[-1]], df_scaled['votes_neg'], test_size=0.2, random_state=42)

xgb_model = xgb.XGBRegressor(obj=squared_log, custom_metric=rmsle, eval_metric=rmsle, disable_default_eval_metric=1)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

err = root_mean_squared_log_error(y_test, np.clip(y_pred, 0, None))
print(f"Votes Neg RMSLE: {err}")

Votes Pos RMSLE: 0.8741124003653106
Votes Neg RMSLE: 0.6779875284346623


In [7]:
with open('models/xgb_votes_pos_models.pkl', 'rb') as f:
    votes_pos_models = pickle.load(f)
print(len(votes_pos_models))
# votes_pos_models = []

for exvars in votes_pos_varset[len(votes_pos_models):]:

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(df_scaled[exvars],
                                                        df_scaled['votes_pos'],
                                                        test_size=0.2, random_state=42)

    # cross validated random parameter search
    xgb_model = xgb.XGBRegressor(obj=squared_log, custom_metric=rmsle, eval_metric=rmsle, disable_default_eval_metric=1)
    params = {
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
        'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}
    rs = RandomizedSearchCV(xgb_model, params, n_iter=20, n_jobs=-1, cv=5, verbose=10)
    rs.fit(X_train, y_train)

    # do grid search around best parameters from random search
    best_params = rs.best_params_
    print('Best parameters:', best_params)
    del rs
    params = {
        'max_depth': [best_params['max_depth']-1, best_params['max_depth'], best_params['max_depth']+1],
        'learning_rate': [best_params['learning_rate']*0.8, best_params['learning_rate'], best_params['learning_rate']*1.2],
        'n_estimators': [best_params['n_estimators']-100, best_params['n_estimators'], best_params['n_estimators']+100]}
    gs = GridSearchCV(xgb_model, params, n_jobs=-1, cv=5, verbose=10)
    gs.fit(X_train, y_train)

    # predict
    y_pred = gs.predict(X_test)

    # save model
    votes_pos_models.append(gs)

    # evaluate
    err = root_mean_squared_log_error(y_test, np.clip(y_pred, 0, None))
    print(f"Votes Pos RMSLE: {err}")


3
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 5/5; 1/20] START learning_rate=0.2, max_depth=4, n_estimators=200...........
[CV 1/5; 1/20] START learning_rate=0.2, max_depth=4, n_estimators=200...........
[CV 1/5; 2/20] START learning_rate=0.1, max_depth=6, n_estimators=600...........
[CV 2/5; 1/20] START learning_rate=0.2, max_depth=4, n_estimators=200...........
[CV 2/5; 2/20] START learning_rate=0.1, max_depth=6, n_estimators=600...........
[CV 3/5; 1/20] START learning_rate=0.2, max_depth=4, n_estimators=200...........
[CV 4/5; 1/20] START learning_rate=0.2, max_depth=4, n_estimators=200...........
[CV 3/5; 2/20] START learning_rate=0.1, max_depth=6, n_estimators=600...........
[CV 4/5; 2/20] START learning_rate=0.1, max_depth=6, n_estimators=600...........
[CV 5/5; 2/20] START learning_rate=0.1, max_depth=6, n_estimators=600...........
[CV 1/5; 3/20] START learning_rate=0.1, max_depth=6, n_estimators=800...........
[CV 2/5; 3/20] START learning_rate=0.1, max_d



[CV 3/5; 13/20] START learning_rate=0.05, max_depth=8, n_estimators=700.........
[CV 3/5; 12/20] END learning_rate=0.01, max_depth=9, n_estimators=400;, score=0.656 total time= 3.8min
[CV 4/5; 13/20] START learning_rate=0.05, max_depth=8, n_estimators=700.........
[CV 1/5; 12/20] END learning_rate=0.01, max_depth=9, n_estimators=400;, score=0.608 total time= 4.3min
[CV 5/5; 13/20] START learning_rate=0.05, max_depth=8, n_estimators=700.........
[CV 4/5; 12/20] END learning_rate=0.01, max_depth=9, n_estimators=400;, score=0.615 total time= 3.8min
[CV 1/5; 14/20] START learning_rate=0.2, max_depth=3, n_estimators=900..........
[CV 5/5; 12/20] END learning_rate=0.01, max_depth=9, n_estimators=400;, score=0.604 total time= 3.8min
[CV 2/5; 14/20] START learning_rate=0.2, max_depth=3, n_estimators=900..........
[CV 1/5; 13/20] END learning_rate=0.05, max_depth=8, n_estimators=700;, score=0.619 total time= 4.2min
[CV 3/5; 14/20] START learning_rate=0.2, max_depth=3, n_estimators=900..........

In [6]:
with open('models/xgb_votes_pos_models.pkl', 'rb') as f:
    votes_pos_models = pickle.load(f)
print(len(votes_pos_models))
    
exvars = votes_pos_varset[len(votes_pos_models)]
X_train, X_test, y_train, y_test = train_test_split(df_scaled[exvars],
                                                    df_scaled['votes_pos'],
                                                    test_size=0.2, random_state=42)

# cross validated random parameter search
xgb_model = xgb.XGBRegressor(obj=squared_log, custom_metric=rmsle, eval_metric=rmsle, disable_default_eval_metric=1)

best_params = {'n_estimators': 900, 'max_depth': 7, 'learning_rate': 0.05}

params = {
    'max_depth': [best_params['max_depth']-1, best_params['max_depth'], best_params['max_depth']+1],
    'learning_rate': [best_params['learning_rate']*0.8, best_params['learning_rate'], best_params['learning_rate']*1.2],
    'n_estimators': [best_params['n_estimators']-100, best_params['n_estimators'], best_params['n_estimators']+100]}
gs = GridSearchCV(xgb_model, params, n_jobs=-1, cv=5, verbose=10)
gs.fit(X_train, y_train)

# predict
y_pred = gs.predict(X_test)

# save model
votes_pos_models.append(gs)

2
Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5; 3/27] START learning_rate=0.04000000000000001, max_depth=6, n_estimators=1000
[CV 1/5; 2/27] START learning_rate=0.04000000000000001, max_depth=6, n_estimators=900[CV 5/5; 2/27] START learning_rate=0.04000000000000001, max_depth=6, n_estimators=900

[CV 4/5; 2/27] START learning_rate=0.04000000000000001, max_depth=6, n_estimators=900[CV 5/5; 1/27] START learning_rate=0.04000000000000001, max_depth=6, n_estimators=800

[CV 3/5; 1/27] START learning_rate=0.04000000000000001, max_depth=6, n_estimators=800[CV 1/5; 1/27] START learning_rate=0.04000000000000001, max_depth=6, n_estimators=800
[CV 3/5; 2/27] START learning_rate=0.04000000000000001, max_depth=6, n_estimators=900
[CV 4/5; 1/27] START learning_rate=0.04000000000000001, max_depth=6, n_estimators=800
[CV 2/5; 1/27] START learning_rate=0.04000000000000001, max_depth=6, n_estimators=800
[CV 2/5; 2/27] START learning_rate=0.04000000000000001, max_depth=6, n_estim

In [9]:
# pickle models
with open('models/xgb_votes_pos_models.pkl', 'wb') as f:
    pickle.dump(votes_pos_models, f)

In [6]:
with open('models/xgb_votes_neg_models.pkl', 'rb') as f:
    votes_neg_models = pickle.load(f)

# votes_neg_models = []

print(len(votes_neg_models))


for exvars in votes_neg_varset[len(votes_neg_models):]:

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(df_scaled[exvars],
                                                        df_scaled['votes_neg'],
                                                        test_size=0.2, random_state=42)

    # cross validated random parameter search
    xgb_model = xgb.XGBRegressor(obj=squared_log, custom_metric=rmsle, eval_metric=rmsle, disable_default_eval_metric=1)
    params = {
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
        'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}
    rs = RandomizedSearchCV(xgb_model, params, n_iter=20, n_jobs=-1, cv=5, verbose=10)
    rs.fit(X_train, y_train)

    # do grid search around best parameters from random search
    best_params = rs.best_params_
    print(best_params)
    params = {
        'max_depth': [best_params['max_depth']-1, best_params['max_depth'], best_params['max_depth']+1],
        'learning_rate': [best_params['learning_rate']*0.8, best_params['learning_rate'], best_params['learning_rate']*1.2],
        'n_estimators': [best_params['n_estimators']-100, best_params['n_estimators'], best_params['n_estimators']+100]}
    del rs
    gs = GridSearchCV(xgb_model, params, n_jobs=-1, cv=5, verbose=10)
    gs.fit(X_train, y_train)

    # predict
    y_pred = gs.predict(X_test)

    # save model
    votes_neg_models.append(gs)

    # evaluate
    err = root_mean_squared_log_error(y_test, np.clip(y_pred, 0, None))
    print(f"Votes Neg RMSLE: {err}")

3
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 4/5; 1/20] START learning_rate=0.05, max_depth=4, n_estimators=400..........
[CV 3/5; 1/20] START learning_rate=0.05, max_depth=4, n_estimators=400..........[CV 5/5; 1/20] START learning_rate=0.05, max_depth=4, n_estimators=400..........

[CV 1/5; 1/20] START learning_rate=0.05, max_depth=4, n_estimators=400..........
[CV 2/5; 1/20] START learning_rate=0.05, max_depth=4, n_estimators=400..........
[CV 1/5; 2/20] START learning_rate=0.01, max_depth=9, n_estimators=100..........
[CV 2/5; 2/20] START learning_rate=0.01, max_depth=9, n_estimators=100..........
[CV 3/5; 2/20] START learning_rate=0.01, max_depth=9, n_estimators=100..........
[CV 4/5; 2/20] START learning_rate=0.01, max_depth=9, n_estimators=100..........
[CV 5/5; 2/20] START learning_rate=0.01, max_depth=9, n_estimators=100..........
[CV 1/5; 3/20] START learning_rate=0.05, max_depth=7, n_estimators=400..........
[CV 2/5; 3/20] START learning_rate=0.05, max_



[CV 1/5; 11/20] END learning_rate=0.01, max_depth=10, n_estimators=600;, score=0.356 total time= 4.9min
[CV 5/5; 13/20] START learning_rate=0.01, max_depth=3, n_estimators=600.........
[CV 3/5; 13/20] END learning_rate=0.01, max_depth=3, n_estimators=600;, score=0.266 total time= 1.7min
[CV 2/5; 14/20] START learning_rate=0.2, max_depth=4, n_estimators=100..........
[CV 1/5; 14/20] START learning_rate=0.2, max_depth=4, n_estimators=100..........
[CV 4/5; 13/20] END learning_rate=0.01, max_depth=3, n_estimators=600;, score=0.166 total time= 1.7min
[CV 3/5; 14/20] START learning_rate=0.2, max_depth=4, n_estimators=100..........
[CV 3/5; 11/20] END learning_rate=0.01, max_depth=10, n_estimators=600;, score=0.363 total time= 4.9min
[CV 2/5; 12/20] END learning_rate=0.05, max_depth=9, n_estimators=600;, score=0.250 total time= 3.8min
[CV 5/5; 14/20] START learning_rate=0.2, max_depth=4, n_estimators=100..........
[CV 1/5; 12/20] END learning_rate=0.05, max_depth=9, n_estimators=600;, score=

PicklingError: Could not pickle the task to send it to the workers.

In [7]:

with open('models/xgb_votes_neg_models.pkl', 'rb') as f:
    votes_neg_models = pickle.load(f)
print(len(votes_neg_models))
    
exvars = votes_neg_varset[len(votes_neg_models)]

# train test split
X_train, X_test, y_train, y_test = train_test_split(df_scaled[exvars],
                                                    df_scaled['votes_neg'],
                                                    test_size=0.2, random_state=42)

# cross validated random parameter search
xgb_model = xgb.XGBRegressor(obj=squared_log, custom_metric=rmsle, eval_metric=rmsle, disable_default_eval_metric=1)

# do grid search around best parameters from random search
best_params = {'n_estimators': 1000, 'max_depth': 5, 'learning_rate': 0.05}
print(best_params)
params = {
    'max_depth': [best_params['max_depth']-1, best_params['max_depth'], best_params['max_depth']+1],
    'learning_rate': [best_params['learning_rate']*0.8, best_params['learning_rate'], best_params['learning_rate']*1.2],
    'n_estimators': [best_params['n_estimators']-100, best_params['n_estimators'], best_params['n_estimators']+100]}
gs = GridSearchCV(xgb_model, params, n_jobs=-1, cv=5, verbose=10)
gs.fit(X_train, y_train)

# predict
y_pred = gs.predict(X_test)

# save model
votes_neg_models.append(gs)

# evaluate
err = root_mean_squared_log_error(y_test, np.clip(y_pred, 0, None))
print(f"Votes Neg RMSLE: {err}")

3
{'n_estimators': 1000, 'max_depth': 5, 'learning_rate': 0.05}
Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 3/5; 1/27] START learning_rate=0.04000000000000001, max_depth=4, n_estimators=900
[CV 1/5; 1/27] START learning_rate=0.04000000000000001, max_depth=4, n_estimators=900[CV 4/5; 1/27] START learning_rate=0.04000000000000001, max_depth=4, n_estimators=900

[CV 2/5; 1/27] START learning_rate=0.04000000000000001, max_depth=4, n_estimators=900
[CV 5/5; 1/27] START learning_rate=0.04000000000000001, max_depth=4, n_estimators=900
[CV 1/5; 2/27] START learning_rate=0.04000000000000001, max_depth=4, n_estimators=1000
[CV 2/5; 2/27] START learning_rate=0.04000000000000001, max_depth=4, n_estimators=1000
[CV 3/5; 2/27] START learning_rate=0.04000000000000001, max_depth=4, n_estimators=1000
[CV 5/5; 2/27] START learning_rate=0.04000000000000001, max_depth=4, n_estimators=1000
[CV 4/5; 2/27] START learning_rate=0.04000000000000001, max_depth=4, n_estimators=1000
[CV 1/5; 

In [10]:
# pickle models
with open('models/xgb_votes_neg_models.pkl', 'wb') as f:
    pickle.dump(votes_neg_models, f)

## Output predictions for final models

In [5]:
full_data = pd.read_csv('data/comment_data_general_091022_untilarticle5874_final_redacted.csv', index_col=0)

In [10]:
with open('models/xgb_pinned_models.pkl', 'rb') as f:
    models = pickle.load(f)

In [12]:
n = -1
exvars = pinned_varset[n]
model = models[n]
pred_pinned_xgb = pd.DataFrame({'article': full_data['article'],
                                'comment_id': full_data['comment_id'],
                                'pred_pinned': model.predict_proba(df_scaled[exvars])[:,1]}) 
pred_pinned_xgb.to_csv('model_output/xgbs/pred_pinned_xgb.csv')

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df_scaled[pinned_varset[2]], df_scaled['pinned_f'], test_size=0.2, random_state=42)

xgb_model = models[-1]
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")


Accuracy: 0.9994764811221586
Precision: 0.948892674616695
Recall: 0.9440677966101695
F1: 0.9464740866610025


In [18]:
with open('models/xgb_votes_pos_models.pkl', 'rb') as f:
    votes_pos_models = pickle.load(f)

In [19]:
for n, model in enumerate(votes_pos_models):
    exvars = votes_pos_varset[n]

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(df_scaled[exvars],
                                                        df_scaled['votes_pos'],
                                                        test_size=0.2, random_state=42)

    # predict
    y_pred = model.predict(X_test)

    # evaluate
    err = root_mean_squared_log_error(y_test, np.clip(y_pred, 0, None))
    print(f"Votes Pos RMSLE: {err}")

Votes Pos RMSLE: 0.9482111433566583
Votes Pos RMSLE: 0.9014706238109841
Votes Pos RMSLE: 0.8807113611038683
Votes Pos RMSLE: 0.8706016822113462


In [15]:
n = -1
exvars = votes_pos_varset[n]
model = votes_pos_models[n]
pred_votes_pos_xgb = pd.DataFrame({'article': full_data['article'],
                                    'comment_id': full_data['comment_id'],
                                    'pred_votes_pos': model.predict(df_scaled[exvars])}) 
pred_votes_pos_xgb.to_csv('model_output/xgbs/pred_votes_pos_xgb.csv')

In [7]:
with open('model_output/xgbs/xgb_votes_neg_models.pkl', 'rb') as f:
    votes_neg_models = pickle.load(f)

In [12]:
for n, model in enumerate(votes_neg_models):
    exvars = votes_neg_varset[n]

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(df_scaled[exvars],
                                                        df_scaled['votes_neg'],
                                                        test_size=0.2, random_state=42)

    # predict
    y_pred = model.predict(X_test)

    # evaluate
    err = root_mean_squared_log_error(y_test, np.clip(y_pred, 0, None))
    print(f"Votes Neg RMSLE: {err}")

Votes Neg RMSLE: 0.7718238067198171
Votes Neg RMSLE: 0.7615986671149627
Votes Neg RMSLE: 0.7038701826725169
Votes Neg RMSLE: 0.6725097834509901


In [10]:
n = -1
exvars = votes_neg_varset[n]
model = votes_neg_models[n]
pred_votes_neg_xgb = pd.DataFrame({'article': full_data['article'],
                                    'comment_id': full_data['comment_id'],
                                    'pred_votes_neg': model.predict(df_scaled[exvars])})
pred_votes_neg_xgb.to_csv('model_output/xgbs/pred_votes_neg_xgb.csv')