In [None]:
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, root_mean_squared_log_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import root_mean_squared_log_error
import pickle

import numpy as np
import xgboost as xgb
from typing import Tuple

In [None]:
# Read data/data_scaled.csv, using its first column as the row index.
# (The alternative input data/data_pin_scaled.csv is left disabled here.)
df_scaled = pd.read_csv(filepath_or_buffer='data/data_scaled.csv', index_col=0)

In [None]:
# One-hot encode the `genre1` column; drop_first=True omits the dummy for the
# first category level. The result replaces `df_scaled`, so this cell can only
# be run once per load (a second run no longer finds `genre1`).
df_scaled = pd.get_dummies(data=df_scaled, drop_first=True, columns=['genre1'])


In [None]:
# Base predictor set: the fixed comment/article features below plus every
# column of df_scaled whose name contains 'genre1' (the genre dummies).
prevars = [
    'sentiment_prob_pos',
    'sentiment_prob_neg',
    'lexdiv_cttr',
    'num_punct',
    'num_sentences',
    'SMOG_readability',
    'second_person',
    'user_follower',
    'cosine_1',
    'hours_since_article',
    'votes_pos_mean',
    'votes_neg_mean',
    'article_comments',
    'is_root_comment',
    'level_in_tree',
]
prevars.extend(col for col in df_scaled.columns if 'genre1' in col)

pinned_var = ['pinned_f']
engagevars = ['is_leaf_comment', 'size_of_tree', 'height_of_tree', 'all_replies']
votes_vars = ['votes_neg_log', 'votes_pos_log']

# Each *_varset is a list of increasingly large feature lists; the last entry
# is the fullest one. The votes_pos sets only ever add 'votes_neg_log' and the
# votes_neg sets only ever add 'votes_pos_log', so a target's own log-count is
# never among its predictors.
pinned_varset = [
    prevars,
    prevars + engagevars,
    prevars + engagevars + votes_vars,
]
votes_pos_varset = [
    prevars,
    prevars + pinned_var,
    prevars + pinned_var + engagevars,
    prevars + pinned_var + engagevars + [votes_vars[0]],
]
votes_neg_varset = [
    prevars,
    prevars + pinned_var,
    prevars + pinned_var + engagevars,
    prevars + pinned_var + engagevars + [votes_vars[1]],
]

In [None]:
# Baseline: default-parameter XGBoost classifier for `pinned_f` on the largest
# pinned feature set, evaluated on a fixed 20% hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    df_scaled[pinned_varset[2]],
    df_scaled['pinned_f'],
    random_state=42,
    test_size=0.2,
)

xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# Hold-out classification metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\n".join([
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1: {f1}",
]))


In [None]:
# Load the pinned-comment models saved by an earlier session so that the search
# loop only has to fit the feature sets that are still missing.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle files
# that were produced by this notebook.
try:
    with open('models/xgb_pinned_models.pkl', 'rb') as f:
        models = pickle.load(f)
except FileNotFoundError:
    # First run: nothing has been saved yet, so start from an empty list
    # instead of requiring a manual `models = []`.
    models = []
print(len(models))

In [None]:
# Fit a tuned classifier for every pinned feature set that has no saved model
# yet; `models` holds the searches finished so far, in `pinned_varset` order.
for exvars in pinned_varset[len(models):]:

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(df_scaled[exvars],
                                                        df_scaled['pinned_f'],
                                                        test_size=0.2, random_state=42)

    # cross validated random parameter search; random_state makes the 20
    # sampled candidates reproducible from run to run
    xgb_model = xgb.XGBClassifier()
    params = {
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
        'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}
    rs = RandomizedSearchCV(xgb_model, params, n_iter=20, n_jobs=-1, cv=5,
                            verbose=10, random_state=42)
    rs.fit(X_train, y_train)

    # do grid search around best parameters from random search
    best_params = rs.best_params_
    print(f"Best parameters: {best_params}")
    del rs  # drop the reference to the fitted random search
    params = {
        'max_depth': [best_params['max_depth']-1, best_params['max_depth'], best_params['max_depth']+1],
        'learning_rate': [best_params['learning_rate']*0.8, best_params['learning_rate'], best_params['learning_rate']*1.2],
        # n_estimators - 100 is 0 when the random search picked 100; keep only
        # positive values so no grid point fits an ensemble without trees
        'n_estimators': [n_trees for n_trees in (best_params['n_estimators']-100,
                                                 best_params['n_estimators'],
                                                 best_params['n_estimators']+100)
                         if n_trees > 0]}
    gs = GridSearchCV(xgb_model, params, n_jobs=-1, cv=5, verbose=10)
    gs.fit(X_train, y_train)

    # predict on the hold-out split with the refitted best estimator
    y_pred = gs.predict(X_test)

    # keep the whole fitted grid search as the model for this feature set
    models.append(gs)

    # evaluate
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1: {f1}")


In [None]:
# Grid search for the largest pinned feature set, centred on hard-coded
# starting parameters instead of running a random search first.
exvars = pinned_varset[2]

X_train, X_test, y_train, y_test = train_test_split(
    df_scaled[exvars],
    df_scaled['pinned_f'],
    random_state=42,
    test_size=0.2,
)

xgb_model = xgb.XGBClassifier()

# Centre of the grid.
best_params = {'n_estimators': 1000, 'max_depth': 5, 'learning_rate': 0.2}

# Three values per parameter: one step below, the centre, one step above.
params = {
    'max_depth': [best_params['max_depth'] + step for step in (-1, 0, 1)],
    'learning_rate': [best_params['learning_rate'] * factor for factor in (0.8, 1, 1.2)],
    'n_estimators': [best_params['n_estimators'] + step for step in (-100, 0, 100)],
}
gs = GridSearchCV(xgb_model, params, n_jobs=-1, cv=5, verbose=10)
gs.fit(X_train, y_train)

# Hold-out predictions from the fitted grid search.
y_pred = gs.predict(X_test)

# The fitted search is appended to the list of pinned models.
models.append(gs)

# Hold-out classification metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\n".join([
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1: {f1}",
]))

In [None]:
# Write the list of fitted pinned-comment searches to disk.
with open('models/xgb_pinned_models.pkl', mode='wb') as out_file:
    pickle.dump(models, out_file)

In [None]:
# setup RMSLE loss

def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''First derivative of the squared log error w.r.t. the prediction.

    Evaluates ``(log1p(predt) - log1p(y)) / (predt + 1)`` element-wise, where
    ``y`` is the label array returned by ``dtrain.get_label()``.
    '''
    labels = dtrain.get_label()
    log_residual = np.log1p(predt) - np.log1p(labels)
    return log_residual / (predt + 1)

def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Second derivative of the squared log error w.r.t. the prediction.

    Evaluates ``(-log1p(predt) + log1p(y) + 1) / (predt + 1) ** 2``
    element-wise, where ``y`` is the label array from ``dtrain.get_label()``.
    '''
    labels = dtrain.get_label()
    numerator = -np.log1p(predt) + np.log1p(labels) + 1
    denominator = np.power(predt + 1, 2)
    return numerator / denominator

def squared_log(predt: np.ndarray,
                dtrain: xgb.DMatrix) -> Tuple[np.ndarray, np.ndarray]:
    '''Squared Log Error objective. A simplified version for RMSLE used as
    objective function.

    Parameters
    ----------
    predt : raw predictions, one per row of ``dtrain``.
    dtrain : object whose ``get_label()`` returns the matching labels.

    Returns
    -------
    (grad, hess) : element-wise first and second derivatives of the loss.
    '''
    # log1p and the 1 / (predt + 1) terms are undefined at or below -1, so
    # clamp to just above it. np.maximum also covers predt == -1 exactly (a
    # division by zero with a strict `< -1` test) and returns a new array, so
    # the caller's predictions are not modified.
    predt = np.maximum(predt, -1 + 1e-6)
    grad = gradient(predt, dtrain)
    hess = hessian(predt, dtrain)
    return grad, hess

def rmsle(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    ''' Root mean squared log error metric.

    Parameters
    ----------
    predt : predictions, one per row of ``dtrain``.
    dtrain : object whose ``get_label()`` returns the matching labels.

    Returns
    -------
    ('PyRMSLE', value) : metric name and the root mean squared log error.
    '''
    y = dtrain.get_label()
    # Clamp to just above -1 so log1p stays finite. np.maximum also handles
    # predt == -1 exactly (log1p(-1) is -inf) and leaves the caller's array
    # untouched.
    clamped = np.maximum(predt, -1 + 1e-6)
    elements = np.power(np.log1p(y) - np.log1p(clamped), 2)
    return 'PyRMSLE', float(np.sqrt(np.sum(elements) / len(y)))

In [None]:
# Baseline regressors for the positive and negative vote counts, each fitted on
# its fullest feature set and scored with RMSLE on a fixed 20% hold-out split.
#
# The scikit-learn wrapper takes the loss through `objective`. `obj` and
# `custom_metric` are argument names of the native xgb.train API, so passing
# them to XGBRegressor does not install the custom squared-log objective and
# the model is fitted with the default loss. The built-in squared-log-error
# objective and RMSLE metric are used instead.
# NOTE(review): confirm against the installed xgboost version.
for label, target, feature_cols in (('Pos', 'votes_pos', votes_pos_varset[-1]),
                                    ('Neg', 'votes_neg', votes_neg_varset[-1])):
    X_train, X_test, y_train, y_test = train_test_split(df_scaled[feature_cols],
                                                        df_scaled[target],
                                                        test_size=0.2, random_state=42)

    xgb_model = xgb.XGBRegressor(objective='reg:squaredlogerror', eval_metric='rmsle')
    xgb_model.fit(X_train, y_train)
    y_pred = xgb_model.predict(X_test)

    # Negative predictions are clipped to 0 before scoring.
    err = root_mean_squared_log_error(y_test, np.clip(y_pred, 0, None))
    print(f"Votes {label} RMSLE: {err}")

In [None]:
# Resume from the positive-vote models saved by an earlier session; on a first
# run (no pickle yet) start from an empty list.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle files
# that were produced by this notebook.
try:
    with open('models/xgb_votes_pos_models.pkl', 'rb') as f:
        votes_pos_models = pickle.load(f)
except FileNotFoundError:
    votes_pos_models = []
print(len(votes_pos_models))

# Fit a tuned regressor for every feature set that has no saved model yet.
for exvars in votes_pos_varset[len(votes_pos_models):]:

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(df_scaled[exvars],
                                                        df_scaled['votes_pos'],
                                                        test_size=0.2, random_state=42)

    # The scikit-learn wrapper takes the loss through `objective`; `obj` and
    # `custom_metric` belong to the native xgb.train API and do not install a
    # custom objective on XGBRegressor, so the built-in squared-log-error
    # objective and RMSLE metric are used instead.
    # NOTE(review): confirm against the installed xgboost version.
    xgb_model = xgb.XGBRegressor(objective='reg:squaredlogerror', eval_metric='rmsle')

    # cross validated random parameter search; random_state makes the 20
    # sampled candidates reproducible from run to run
    params = {
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
        'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}
    rs = RandomizedSearchCV(xgb_model, params, n_iter=20, n_jobs=-1, cv=5,
                            verbose=10, random_state=42)
    rs.fit(X_train, y_train)

    # do grid search around best parameters from random search
    best_params = rs.best_params_
    print('Best parameters:', best_params)
    del rs  # drop the reference to the fitted random search
    params = {
        'max_depth': [best_params['max_depth']-1, best_params['max_depth'], best_params['max_depth']+1],
        'learning_rate': [best_params['learning_rate']*0.8, best_params['learning_rate'], best_params['learning_rate']*1.2],
        # n_estimators - 100 is 0 when the random search picked 100; keep only
        # positive values so no grid point fits an ensemble without trees
        'n_estimators': [n_trees for n_trees in (best_params['n_estimators']-100,
                                                 best_params['n_estimators'],
                                                 best_params['n_estimators']+100)
                         if n_trees > 0]}
    gs = GridSearchCV(xgb_model, params, n_jobs=-1, cv=5, verbose=10)
    gs.fit(X_train, y_train)

    # predict on the hold-out split with the refitted best estimator
    y_pred = gs.predict(X_test)

    # keep the whole fitted grid search as the model for this feature set
    votes_pos_models.append(gs)

    # evaluate; negative predictions are clipped to 0 before scoring
    err = root_mean_squared_log_error(y_test, np.clip(y_pred, 0, None))
    print(f"Votes Pos RMSLE: {err}")


In [None]:
# Reload the saved positive-vote models and fit the next missing feature set
# with a grid search centred on hard-coded starting parameters.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle files
# that were produced by this notebook.
with open('models/xgb_votes_pos_models.pkl', 'rb') as f:
    votes_pos_models = pickle.load(f)
print(len(votes_pos_models))

# Once every feature set has a saved model there is no "next" set; indexing
# votes_pos_varset with len(votes_pos_models) would raise IndexError.
if len(votes_pos_models) < len(votes_pos_varset):
    exvars = votes_pos_varset[len(votes_pos_models)]
    X_train, X_test, y_train, y_test = train_test_split(df_scaled[exvars],
                                                        df_scaled['votes_pos'],
                                                        test_size=0.2, random_state=42)

    # The scikit-learn wrapper takes the loss through `objective`; `obj` and
    # `custom_metric` belong to the native xgb.train API and do not install a
    # custom objective on XGBRegressor, so the built-in squared-log-error
    # objective and RMSLE metric are used instead.
    # NOTE(review): confirm against the installed xgboost version.
    xgb_model = xgb.XGBRegressor(objective='reg:squaredlogerror', eval_metric='rmsle')

    # centre of the grid
    best_params = {'n_estimators': 900, 'max_depth': 7, 'learning_rate': 0.05}

    params = {
        'max_depth': [best_params['max_depth']-1, best_params['max_depth'], best_params['max_depth']+1],
        'learning_rate': [best_params['learning_rate']*0.8, best_params['learning_rate'], best_params['learning_rate']*1.2],
        'n_estimators': [best_params['n_estimators']-100, best_params['n_estimators'], best_params['n_estimators']+100]}
    gs = GridSearchCV(xgb_model, params, n_jobs=-1, cv=5, verbose=10)
    gs.fit(X_train, y_train)

    # predict on the hold-out split
    y_pred = gs.predict(X_test)

    # keep the fitted grid search as the model for this feature set
    votes_pos_models.append(gs)
else:
    print('All votes_pos feature sets already have a saved model; nothing to fit.')

In [None]:
# Write the list of fitted positive-vote searches to disk.
with open('models/xgb_votes_pos_models.pkl', mode='wb') as out_file:
    pickle.dump(votes_pos_models, out_file)

In [None]:
# Resume from the negative-vote models saved by an earlier session; on a first
# run (no pickle yet) start from an empty list.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle files
# that were produced by this notebook.
try:
    with open('models/xgb_votes_neg_models.pkl', 'rb') as f:
        votes_neg_models = pickle.load(f)
except FileNotFoundError:
    votes_neg_models = []

print(len(votes_neg_models))

# Fit a tuned regressor for every feature set that has no saved model yet.
for exvars in votes_neg_varset[len(votes_neg_models):]:

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(df_scaled[exvars],
                                                        df_scaled['votes_neg'],
                                                        test_size=0.2, random_state=42)

    # The scikit-learn wrapper takes the loss through `objective`; `obj` and
    # `custom_metric` belong to the native xgb.train API and do not install a
    # custom objective on XGBRegressor, so the built-in squared-log-error
    # objective and RMSLE metric are used instead.
    # NOTE(review): confirm against the installed xgboost version.
    xgb_model = xgb.XGBRegressor(objective='reg:squaredlogerror', eval_metric='rmsle')

    # cross validated random parameter search; random_state makes the 20
    # sampled candidates reproducible from run to run
    params = {
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
        'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}
    rs = RandomizedSearchCV(xgb_model, params, n_iter=20, n_jobs=-1, cv=5,
                            verbose=10, random_state=42)
    rs.fit(X_train, y_train)

    # do grid search around best parameters from random search
    best_params = rs.best_params_
    print(best_params)
    params = {
        'max_depth': [best_params['max_depth']-1, best_params['max_depth'], best_params['max_depth']+1],
        'learning_rate': [best_params['learning_rate']*0.8, best_params['learning_rate'], best_params['learning_rate']*1.2],
        # n_estimators - 100 is 0 when the random search picked 100; keep only
        # positive values so no grid point fits an ensemble without trees
        'n_estimators': [n_trees for n_trees in (best_params['n_estimators']-100,
                                                 best_params['n_estimators'],
                                                 best_params['n_estimators']+100)
                         if n_trees > 0]}
    del rs  # drop the reference to the fitted random search
    gs = GridSearchCV(xgb_model, params, n_jobs=-1, cv=5, verbose=10)
    gs.fit(X_train, y_train)

    # predict on the hold-out split with the refitted best estimator
    y_pred = gs.predict(X_test)

    # keep the whole fitted grid search as the model for this feature set
    votes_neg_models.append(gs)

    # evaluate; negative predictions are clipped to 0 before scoring
    err = root_mean_squared_log_error(y_test, np.clip(y_pred, 0, None))
    print(f"Votes Neg RMSLE: {err}")

In [None]:

# Reload the saved negative-vote models and fit the next missing feature set
# with a grid search centred on hard-coded starting parameters.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle files
# that were produced by this notebook.
with open('models/xgb_votes_neg_models.pkl', 'rb') as f:
    votes_neg_models = pickle.load(f)
print(len(votes_neg_models))

# Once every feature set has a saved model there is no "next" set; indexing
# votes_neg_varset with len(votes_neg_models) would raise IndexError.
if len(votes_neg_models) < len(votes_neg_varset):
    exvars = votes_neg_varset[len(votes_neg_models)]

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(df_scaled[exvars],
                                                        df_scaled['votes_neg'],
                                                        test_size=0.2, random_state=42)

    # The scikit-learn wrapper takes the loss through `objective`; `obj` and
    # `custom_metric` belong to the native xgb.train API and do not install a
    # custom objective on XGBRegressor, so the built-in squared-log-error
    # objective and RMSLE metric are used instead.
    # NOTE(review): confirm against the installed xgboost version.
    xgb_model = xgb.XGBRegressor(objective='reg:squaredlogerror', eval_metric='rmsle')

    # grid search around hard-coded starting parameters
    best_params = {'n_estimators': 1000, 'max_depth': 5, 'learning_rate': 0.05}
    print(best_params)
    params = {
        'max_depth': [best_params['max_depth']-1, best_params['max_depth'], best_params['max_depth']+1],
        'learning_rate': [best_params['learning_rate']*0.8, best_params['learning_rate'], best_params['learning_rate']*1.2],
        'n_estimators': [best_params['n_estimators']-100, best_params['n_estimators'], best_params['n_estimators']+100]}
    gs = GridSearchCV(xgb_model, params, n_jobs=-1, cv=5, verbose=10)
    gs.fit(X_train, y_train)

    # predict on the hold-out split
    y_pred = gs.predict(X_test)

    # keep the fitted grid search as the model for this feature set
    votes_neg_models.append(gs)

    # evaluate; negative predictions are clipped to 0 before scoring
    err = root_mean_squared_log_error(y_test, np.clip(y_pred, 0, None))
    print(f"Votes Neg RMSLE: {err}")
else:
    print('All votes_neg feature sets already have a saved model; nothing to fit.')

In [None]:
# Write the list of fitted negative-vote searches to disk.
with open('models/xgb_votes_neg_models.pkl', mode='wb') as out_file:
    pickle.dump(votes_neg_models, out_file)

## Output predictions for final models

In [None]:
# Read the comment table that supplies the `article` and `comment_id` columns
# for the prediction files; its first CSV column becomes the row index.
full_data = pd.read_csv(
    filepath_or_buffer='data/comment_data_general_091022_untilarticle5874_final_redacted.csv',
    index_col=0,
)

In [None]:
# Reload the saved pinned-comment models from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('models/xgb_pinned_models.pkl', mode='rb') as model_file:
    models = pickle.load(model_file)

In [None]:
# Score every row of df_scaled with the last model / last feature set and write
# the result next to the article and comment identifiers.
n = -1
exvars = pinned_varset[n]
model = models[n]
# The second predict_proba column is kept as the predicted probability.
# NOTE(review): this pairs full_data rows with df_scaled rows by position;
# confirm both tables have the same rows in the same order.
pred_pinned_xgb = full_data[['article', 'comment_id']].assign(
    pred_pinned=model.predict_proba(df_scaled[exvars])[:, 1])
pred_pinned_xgb.to_csv('model_output/xgbs/pred_pinned_xgb.csv')

In [None]:
# Re-create the fixed hold-out split for the largest pinned feature set and
# score the last saved model on it.
X_train, X_test, y_train, y_test = train_test_split(
    df_scaled[pinned_varset[2]],
    df_scaled['pinned_f'],
    random_state=42,
    test_size=0.2,
)

xgb_model = models[-1]
y_pred = xgb_model.predict(X_test)

# Hold-out classification metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\n".join([
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1: {f1}",
]))


In [None]:
# Reload the saved positive-vote models from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('models/xgb_votes_pos_models.pkl', mode='rb') as model_file:
    votes_pos_models = pickle.load(model_file)

In [None]:
# Report hold-out RMSLE for each saved positive-vote model; model i is scored
# on feature set i of votes_pos_varset.
for n in range(len(votes_pos_models)):
    model = votes_pos_models[n]
    exvars = votes_pos_varset[n]

    # Same fixed split as used when the models were fitted.
    X_train, X_test, y_train, y_test = train_test_split(
        df_scaled[exvars],
        df_scaled['votes_pos'],
        random_state=42,
        test_size=0.2,
    )

    y_pred = model.predict(X_test)

    # Negative predictions are clipped to 0 before scoring.
    err = root_mean_squared_log_error(y_test, np.clip(y_pred, 0, None))
    print(f"Votes Pos RMSLE: {err}")

In [None]:
# Predict positive votes for every row of df_scaled with the last model / last
# feature set and write the result next to the article and comment identifiers.
n = -1
exvars = votes_pos_varset[n]
model = votes_pos_models[n]
# NOTE(review): this pairs full_data rows with df_scaled rows by position;
# confirm both tables have the same rows in the same order.
pred_votes_pos_xgb = full_data[['article', 'comment_id']].assign(
    pred_votes_pos=model.predict(df_scaled[exvars]))
pred_votes_pos_xgb.to_csv('model_output/xgbs/pred_votes_pos_xgb.csv')

In [None]:
# Reload the saved negative-vote models. The notebook writes them to
# 'models/xgb_votes_neg_models.pkl', so read from that same path rather than
# from the prediction-output directory.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('models/xgb_votes_neg_models.pkl', 'rb') as f:
    votes_neg_models = pickle.load(f)

In [None]:
# Report hold-out RMSLE for each saved negative-vote model; model i is scored
# on feature set i of votes_neg_varset.
for n in range(len(votes_neg_models)):
    model = votes_neg_models[n]
    exvars = votes_neg_varset[n]

    # Same fixed split as used when the models were fitted.
    X_train, X_test, y_train, y_test = train_test_split(
        df_scaled[exvars],
        df_scaled['votes_neg'],
        random_state=42,
        test_size=0.2,
    )

    y_pred = model.predict(X_test)

    # Negative predictions are clipped to 0 before scoring.
    err = root_mean_squared_log_error(y_test, np.clip(y_pred, 0, None))
    print(f"Votes Neg RMSLE: {err}")

In [None]:
# Predict negative votes for every row of df_scaled with the last model / last
# feature set and write the result next to the article and comment identifiers.
n = -1
exvars = votes_neg_varset[n]
model = votes_neg_models[n]
# NOTE(review): this pairs full_data rows with df_scaled rows by position;
# confirm both tables have the same rows in the same order.
pred_votes_neg_xgb = full_data[['article', 'comment_id']].assign(
    pred_votes_neg=model.predict(df_scaled[exvars]))
pred_votes_neg_xgb.to_csv('model_output/xgbs/pred_votes_neg_xgb.csv')