In [1]:
!python -m pip install xgboost



In [1]:
import pandas as pd
from sklearn.metrics import *
from tqdm import tqdm
from utils import Load_Rumours_Dataset_filtering_since_first_post_Transfer_Learning
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import xgboost as xgb
from imblearn.over_sampling import SMOTE

In [3]:
# Transfer-learning setup: one event is used for training, a different one for testing.
train_dataset = 'charlie_hebdo'
test_dataset = 'germanwings_crash'

# Cut-off handed to the loader: 60*2*24 + 15 == 2895.
# NOTE(review): the unit is defined by the loader (presumably minutes) — confirm in utils.
time_cut = 60 * 2 * 24 + 15

processor = Load_Rumours_Dataset_filtering_since_first_post_Transfer_Learning(
    train_dataset,
    test_dataset,
    time_cut=time_cut,
    test_size=0.7,
)

# Load, process, then fetch the two resulting frames.
processor.load_data()
processor.process_data()
train, test = processor.get_final_dataframes()

rumour
0    127
1     70
Name: count, dtype: int64


In [28]:
def _flatten_features(frame):
    """Return a 2-D array: the non-embedding columns followed by the expanded embeddings.

    ``frame`` must contain an ``embeddings_avg`` column whose cells are sequences.
    Each sequence is spread over its own columns (through a DataFrame) and appended
    to the right of the remaining columns.
    """
    tabular_part = frame.drop(columns=['embeddings_avg']).values
    embedding_part = np.array(pd.DataFrame(frame.embeddings_avg.tolist()))
    return np.hstack([tabular_part, embedding_part])


# Training split: every column except the label becomes a feature.
X_train = _flatten_features(train.drop(columns=['rumour']))
y_train = train['rumour']

# Test split, built the same way.
X_test = _flatten_features(test.drop(columns=['rumour']))
y_test = test['rumour']

### SMOTE APPROACH

In [29]:

# Oversample only the minority class of the training split; the fixed seed makes the
# synthetic rows reproducible.
smote = SMOTE(
    sampling_strategy='minority',
    random_state=42,
)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [30]:
# Sum of the training labels (number of rows labelled 1) and the label vector's shape.
n_train_positive = y_train.sum()
print(n_train_positive, y_train.shape)

522 (2122,)


In [31]:
# Sum of the test labels (number of rows labelled 1) and the label vector's shape.
n_test_positive = y_test.sum()
print(n_test_positive, y_test.shape)

70 (197,)


In [3]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score

def _print_split_report(title, y_true, y_pred, y_prob):
    """Print AUC, accuracy, precision and recall for one data split.

    Args:
        title: Heading printed (after a blank line) above the four metrics.
        y_true: Ground-truth binary labels.
        y_pred: Hard class predictions for the same rows.
        y_prob: Positive-class probabilities for the same rows.
    """
    print("\n" + title)
    print("AUC Score : %f" % roc_auc_score(y_true, y_prob))
    print("Accuracy : %.4g" % accuracy_score(y_true, y_pred))
    # zero_division=0 yields the same 0 the default produces when the metric is
    # undefined, but without depending on warnings being globally silenced.
    print("Precision : %.4g" % precision_score(y_true, y_pred, zero_division=0))
    print("Recall : %f" % recall_score(y_true, y_pred, zero_division=0))


def modelfit(alg, X_train, y_train, X_test, y_test, performCV=True, printFeatureImportance=True, cv_folds=5):
    """Print train/test metrics, and optionally CV recall, for an already-fitted classifier.

    The estimator is NOT fitted here; it must already be fitted so that ``predict``
    and ``predict_proba`` can be called on it directly.

    Args:
        alg: Fitted binary classifier exposing ``predict`` and ``predict_proba``.
        X_train: Feature matrix used for the "Train" report and for cross-validation.
        y_train: Labels matching ``X_train``.
        X_test: Feature matrix used for the "Test" report.
        y_test: Labels matching ``X_test``.
        performCV: When true, run ``cross_val_score`` (recall scoring) on the training
            data and print mean/std/min/max of the fold scores.
        printFeatureImportance: Accepted for call compatibility only; no
            feature-importance output is implemented.
        cv_folds: Value passed as ``cv`` to ``cross_val_score``.

    Returns:
        The same ``alg`` object that was passed in.

    Note:
        Nothing is plotted. The function never draws on a figure, so calling
        ``plt.tight_layout()`` / ``plt.show()`` here only rendered an empty canvas.
        NOTE(review): if ``X_train`` was oversampled before being passed in, the CV
        folds contain synthetic rows and the CV score is likely optimistic.
    """
    _print_split_report(
        "Model Report Train",
        y_train,
        alg.predict(X_train),
        alg.predict_proba(X_train)[:, 1],
    )
    _print_split_report(
        "Model Report Test",
        y_test,
        alg.predict(X_test),
        alg.predict_proba(X_test)[:, 1],
    )

    if performCV:
        # cross_val_score works on clones of the estimator, so `alg` itself is not refitted.
        cv_score = cross_val_score(alg, X_train, y_train, cv=cv_folds, scoring='recall')
        print("CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" % (np.mean(cv_score), np.std(cv_score), np.min(cv_score), np.max(cv_score)))

    return alg



In [33]:
# Hand-picked hyperparameters for a small, shallow booster.
best_params = dict(
    n_estimators=50,
    max_depth=3,
    learning_rate=0.05,
)

In [34]:
# Fit XGBoost with the current `best_params` on the original (un-resampled) training split.
final_model = xgb.XGBClassifier(eval_metric='logloss', **best_params)
final_model.fit(X_train, y_train)


In [35]:
# Report metrics for the model trained on the original training split.
modelfit(alg=final_model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)


Model Report Train
AUC Score : 0.950876
Accuracy : 0.885
Precision : 0.8188
Recall : 0.683908

Model Report Test
AUC Score : 0.723735
Accuracy : 0.7005
Precision : 0.5775
Recall : 0.585714


<Figure size 640x480 with 0 Axes>

CV Score : Mean - 0.3528022 | Std - 0.2498887 | Min - 0.1428571 | Max - 0.8173077


In [36]:
# Fit XGBoost with the current `best_params` on the SMOTE-resampled training data.
final_model = xgb.XGBClassifier(eval_metric='logloss', **best_params)
final_model.fit(X_resampled, y_resampled)


In [37]:
# Report metrics for the model trained on the resampled data (train report uses the resampled rows).
modelfit(alg=final_model, X_train=X_resampled, y_train=y_resampled, X_test=X_test, y_test=y_test)


Model Report Train
AUC Score : 0.958563
Accuracy : 0.8931
Precision : 0.8562
Recall : 0.945000

Model Report Test
AUC Score : 0.691676
Accuracy : 0.5635
Precision : 0.4365
Recall : 0.785714


<Figure size 640x480 with 0 Axes>

CV Score : Mean - 0.913125 | Std - 0.0453028 | Min - 0.825 | Max - 0.946875


#### Hyperparameter tuning

In [20]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats

# Search space: every candidate draws one value per key.
param_dist = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': np.linspace(0.001, 0.1, 20),
    'n_estimators': [20, 50, 100, 150, 200],
}

# Base estimator that the search clones for every candidate / fold.
xgb_model = xgb.XGBClassifier()

# 20 sampled candidates, 5-fold CV, ranked by F1.
# random_state pins which candidates are sampled so that re-running the notebook
# reproduces the same "best" configuration (same seed as the SMOTE step).
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='f1',
    random_state=42,
)


In [21]:
# Run the randomized search on the SMOTE-resampled training data.
random_search.fit(X_resampled, y_resampled)

# Report the winning configuration and its cross-validated score.
searched_best_params = random_search.best_params_
searched_best_score = random_search.best_score_
print("Best set of hyperparameters: ", searched_best_params)
print("Best score: ", searched_best_score)

Best set of hyperparameters:  {'n_estimators': 100, 'max_depth': 6, 'learning_rate': np.float64(0.09478947368421052)}
Best score:  0.8591173425220567


In [38]:
# Hyperparameters for the regularised model.
# The L2/L1 penalties use the scikit-learn wrapper's documented names
# (`reg_lambda` / `reg_alpha`) rather than the native aliases `lambda` / `alpha`,
# which would only reach the booster through the wrapper's **kwargs pass-through.
best_params = {
    'objective': 'binary:logistic',
    'max_depth': 5,
    'learning_rate': 0.05,
    'n_estimators': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'min_child_weight': 1,
    'reg_lambda': 1,
    'reg_alpha': 0.5,
}

In [39]:
# Fit XGBoost with the updated `best_params` on the SMOTE-resampled training data.
final_model = xgb.XGBClassifier(eval_metric='logloss', **best_params)
final_model.fit(X_resampled, y_resampled)


In [40]:
# Report metrics for the model fitted with the updated hyperparameters on the resampled data.
modelfit(alg=final_model, X_train=X_resampled, y_train=y_resampled, X_test=X_test, y_test=y_test)


Model Report Train
AUC Score : 0.998509
Accuracy : 0.9744
Precision : 0.9567
Recall : 0.993750

Model Report Test
AUC Score : 0.722272
Accuracy : 0.6142
Precision : 0.4732
Recall : 0.757143


<Figure size 640x480 with 0 Axes>

CV Score : Mean - 0.95375 | Std - 0.03974528 | Min - 0.884375 | Max - 0.990625


In [2]:
def _split_metrics(alg, X, y_true):
    """Score a fitted binary classifier on one split.

    Returns a dict with the keys ``auc``, ``accuracy``, ``precision`` and ``recall``
    (in that order), computed from ``alg.predict`` and the second column of
    ``alg.predict_proba``.
    """
    y_pred = alg.predict(X)
    y_prob = alg.predict_proba(X)[:, 1]
    return {
        "auc": roc_auc_score(y_true, y_prob),
        "accuracy": accuracy_score(y_true, y_pred),
        # zero_division=0 gives the same 0 as the default for undefined cases,
        # without emitting a warning.
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
    }


def modelfit(alg, X_train, y_train, X_test, y_test, printFeatureImportance=True, time_cut=None, test_size=None):
    """Evaluate an already-fitted classifier, print the report and log it as one MLflow run.

    The estimator is NOT fitted here. Inside a single MLflow run the function logs
    ``alg.get_params()`` as parameters, the train/test AUC, accuracy, precision and
    recall as metrics (``train_auc`` … ``test_recall``), and the sweep settings
    ``time_cut`` / ``test_size`` as metrics.

    Args:
        alg: Fitted binary classifier exposing ``predict``, ``predict_proba`` and
            ``get_params``.
        X_train: Feature matrix for the "Train" report.
        y_train: Labels matching ``X_train``.
        X_test: Feature matrix for the "Test" report.
        y_test: Labels matching ``X_test``.
        printFeatureImportance: Accepted for call compatibility only; no
            feature-importance output is implemented.
        time_cut: Value logged as the ``time_cut`` metric. When ``None``, the
            module-level variable ``time_cut`` is used if it exists.
        test_size: Value logged as the ``test_size`` metric. When ``None``, the
            module-level variable ``test_size`` is used if it exists.

    Returns:
        The same ``alg`` object that was passed in.
    """
    # Explicit arguments win. Otherwise fall back to the notebook-level variables of the
    # same name, so existing five-argument calls keep recording the sweep settings.
    # A setting that is available from neither source is skipped rather than raising NameError.
    sweep_settings = {
        "time_cut": time_cut if time_cut is not None else globals().get("time_cut"),
        "test_size": test_size if test_size is not None else globals().get("test_size"),
    }

    # The context manager ends the run on exit (including when an exception escapes),
    # so no separate mlflow.end_run() call is required afterwards.
    with mlflow.start_run():
        # Every metric is computed once and reused for both logging and printing.
        reports = (
            ("train", "Model Report Train", _split_metrics(alg, X_train, y_train)),
            ("test", "Model Report Test", _split_metrics(alg, X_test, y_test)),
        )

        # Log model parameters
        mlflow.log_params(alg.get_params())

        # Log metrics for Train, then Test
        for prefix, _, scores in reports:
            for metric_name, value in scores.items():
                mlflow.log_metric(f"{prefix}_{metric_name}", value)

        # Print model report for Train, then Test
        for _, title, scores in reports:
            print("\n" + title)
            print("AUC Score : %f" % scores["auc"])
            print("Accuracy : %.4g" % scores["accuracy"])
            print("Precision : %.4g" % scores["precision"])
            print("Recall : %f" % scores["recall"])

        # Logged with log_metric (not log_param), as before, so new runs keep the
        # same schema as runs already recorded.
        for setting_name, setting_value in sweep_settings.items():
            if setting_value is not None:
                mlflow.log_metric(setting_name, setting_value)

    return alg



In [3]:
import mlflow
import mlflow.pytorch

# Store runs in a local SQLite database (relative path `mlflow.db`).
mlflow.set_tracking_uri("sqlite:///mlflow.db")

# Select the experiment that groups the runs logged below.
mlflow.set_experiment("Xgboost SMOTE Filter Node 2025-03-01 Transfer Learning ferguson")

2025/03/01 16:54:02 INFO mlflow.tracking.fluent: Experiment with name 'Xgboost SMOTE Filter Node 2025-03-01 Transfer Learning ferguson' does not exist. Creating a new experiment.


<Experiment: artifact_location='/workspaces/rumour-detection-gnn/mlruns/39', creation_time=1740848042136, experiment_id='39', last_update_time=1740848042136, lifecycle_stage='active', name='Xgboost SMOTE Filter Node 2025-03-01 Transfer Learning ferguson', tags={}>

In [4]:
# Hyperparameters shared by every model trained in the time-cut sweep.
best_params = dict(
    objective='binary:logistic',
    max_depth=5,
    learning_rate=0.05,
    n_estimators=100,
)

In [5]:
# Sweep the time cut-off for a charlie_hebdo -> ferguson transfer experiment.
train_dataset = 'charlie_hebdo'
test_dataset = 'ferguson'
test_size = 0.5

# time_cut runs from 260 up to (excluding) 24*6*60 == 8640 in steps of 60.
# NOTE: `time_cut` and `test_size` must remain module-level names — the MLflow-logging
# `modelfit` reads them when it records each run.
for time_cut in range(260, 24 * 6 * 60, 60):
    print(time_cut)

    processor = Load_Rumours_Dataset_filtering_since_first_post_Transfer_Learning(
        train_dataset,
        test_dataset,
        time_cut=time_cut,
        test_size=test_size,
    )
    processor.load_data()
    processor.process_data()
    train, test = processor.get_final_dataframes()

    # Build one dense matrix per split: the non-embedding columns followed by the
    # embedding vector expanded into one column per component.
    split_matrices = []
    for split_frame in (train, test):
        predictors = split_frame.drop(columns=['rumour'])
        tabular_part = predictors.drop(columns=['embeddings_avg']).values
        embedding_part = np.array(pd.DataFrame(predictors.embeddings_avg.tolist()))
        split_matrices.append(np.hstack([tabular_part, embedding_part]))
    X_train, X_test = split_matrices
    y_train = train['rumour']
    y_test = test['rumour']

    # Oversample the minority class of the training split only.
    smote = SMOTE(sampling_strategy='minority', random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

    final_model = xgb.XGBClassifier(eval_metric='logloss', **best_params)
    final_model.fit(X_resampled, y_resampled)

    # Evaluate on the resampled training rows and the untouched test split.
    modelfit(final_model, X_resampled, y_resampled, X_test, y_test)

260
rumour
0    38
1     1
Name: count, dtype: int64

Model Report Train
AUC Score : 0.997521
Accuracy : 0.9718
Precision : 0.9521
Recall : 0.993561

Model Report Test
AUC Score : 0.184211
Accuracy : 0.9231
Precision : 0
Recall : 0.000000
320
rumour
0    38
1     1
Name: count, dtype: int64

Model Report Train
AUC Score : 0.997521
Accuracy : 0.9718
Precision : 0.9521
Recall : 0.993561

Model Report Test
AUC Score : 0.184211
Accuracy : 0.9231
Precision : 0
Recall : 0.000000
380
rumour
0    38
1     1
Name: count, dtype: int64

Model Report Train
AUC Score : 0.997521
Accuracy : 0.9718
Precision : 0.9521
Recall : 0.993561

Model Report Test
AUC Score : 0.184211
Accuracy : 0.9231
Precision : 0
Recall : 0.000000
440
rumour
0    38
1     1
Name: count, dtype: int64

Model Report Train
AUC Score : 0.997521
Accuracy : 0.9718
Precision : 0.9521
Recall : 0.993561

Model Report Test
AUC Score : 0.184211
Accuracy : 0.9231
Precision : 0
Recall : 0.000000
500
rumour
0    38
1     1
Name: count, dtyp