In [1]:
%%capture
!pip install mlflow dagshub wandb lime

In [2]:
import os
import numpy as np

import pandas as pd
pd.set_option('display.max_colwidth', None)
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from wordcloud import WordCloud
from transformers import BertTokenizer, BertModel, AutoTokenizer
import nltk
import torch
from torch.utils.data import Dataset, DataLoader
import re
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix , classification_report
import dagshub
import wandb
import mlflow
import mlflow.sklearn
import mlflow.pytorch
from lime.lime_text import LimeTextExplainer

# import google drive
from google.colab import drive, files
drive.mount('/content/drive')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

"""Import the modules from google colab"""
import sys
PATH_helper = "/content/drive/MyDrive/turing_college_experiments/16/helper"
sys.path.append(PATH_helper)

# Custom imports
from helper_model import (CustomDataset, custom_collate_fn,
                          pytorch_metrics_calculations,
                          train_model, evaluate_model, TextChunker,
                          tokenize_chunk, predictions_model_value)

"""
Verify the cores and gpu in the colab
"""
num_cores = os.cpu_count()  # Get the number of CPU cores
print(f'Number of available CPU cores: {num_cores}')
NUM_WORKERS = num_cores - 1
print(f'Setting num_workers to: {NUM_WORKERS}')

num_gpus_colab = torch.cuda.device_count()
GPU_LIST = [torch.cuda.get_device_name(i) for i in range(num_gpus_colab)]
NUM_WORKERS, GPU_LIST

enable_code_paths = True
enable_block = False

# weight and biases
# wandb.login()

# ml versioning control
dagshub.init(repo_owner='auszed', repo_name='Experiments_fake_news', mlflow=True)

# tracking experiments
os.environ['MLFLOW_TRACKING_USERNAME'] = 'auszed'
os.environ['MLFLOW_TRACKING_PASSWORD'] = '4390ce1796189f46f62a5e643463dcaffede3ebf'
os.environ['MLFLOW_TRACKING_URI'] = 'https://dagshub.com/auszed/Experiments_fake_news.mlflow'

# Enable code block
enable_code = False

# Inline plotting for Jupyter Notebooks
%matplotlib inline
custom_colors = ['#36CE8A', "#7436F5","#3736F4",   "#36AEF5", "#B336F5", "#f8165e", "#36709A",  "#3672F5", "#7ACE5D"]
gradient_colors = [ "#36CE8A", '#7436F5']
color_palette_custom  = sns.set_palette(custom_colors)
theme_color = sns.color_palette(color_palette_custom, 9)
cmap_theme = LinearSegmentedColormap.from_list('custom_colormap', gradient_colors)

# File paths
URL_save_XAI = "/content/drive/MyDrive/turing_college_experiments/16/experiment_XAI/"
X_train = pd.read_csv("/content/drive/MyDrive/turing_college_experiments/16/dataset/X_train_cleaned_2.csv")
y_train = pd.read_csv("/content/drive/MyDrive/turing_college_experiments/16/dataset/y_train.csv")
X_val = pd.read_csv("/content/drive/MyDrive/turing_college_experiments/16/dataset/X_val_cleaned_2.csv")
y_val = pd.read_csv("/content/drive/MyDrive/turing_college_experiments/16/dataset/y_val.csv")
X_test = pd.read_csv("/content/drive/MyDrive/turing_college_experiments/16/dataset/X_test_cleaned_2.csv")
y_test = pd.read_csv("/content/drive/MyDrive/turing_college_experiments/16/dataset/y_test.csv")

X_train = X_train[["title_text_join"]]
X_val = X_val[["title_text_join"]]
X_test = X_test[["title_text_join"]]

print(X_train.shape,y_train.shape )
print(X_val.shape,y_val.shape )
print(X_test.shape,y_test.shape )

theme_color

Mounted at /content/drive
Number of available CPU cores: 2
Setting num_workers to: 1


Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=d3a8b2f3-ecba-4d85-b246-efd2ad87a8a0&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=243e1e5f6bdb333895e866c146eab9e200400556bc036f3153cebb592e06a046




(26759, 1) (26759, 1)
(5734, 1) (5734, 1)
(5735, 1) (5735, 1)


In [3]:
MODEL_NAME = "roberta_model"
model_uri = f"models:/{MODEL_NAME}@production_nn"

# Determine the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

try:
    # Attempt to load the model from MLflow
    print(f"Loading model from URI: {model_uri} on device: {device}")
    loaded_model = mlflow.pytorch.load_model(model_uri, map_location=device)
    print("Model successfully loaded.")
    print(loaded_model)
except Exception as e:
    print(f"Unexpected error while loading model: {e}")

# model_path = "/content/drive/MyDrive/turing_college_experiments/16/model_save/roberta-base_008.pth"
# loaded_model = torch.load(model_path, map_location=device)

# import wandb
# run = wandb.init()
# artifact = run.use_artifact('hannsflip-none/Fake news comments/roberta-base_artifact:v1', type='model')
# artifact_dir = artifact.download()
loaded_model

Loading model from URI: models:/roberta_model@production_nn on device: cuda


Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

Model successfully loaded.
ClassifierModel(
  (pretrained_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, b

ClassifierModel(
  (pretrained_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

In [4]:
import time
import torch
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

# Replace `CustomDataset`, `custom_collate_fn`, `evaluate_model`, `loaded_model`, `device`, `THRESHOLD_PROBABILITIES_MODEL`, and `NUM_WORKERS`
# with the actual imports or definitions in your script.

# Model configuration
MODEL_NAME = "roberta-base"
BATCH_SIZE = 8
MAX_TOKENS = 512
ITERATION_SAMPLES = 100
THRESHOLD_PROBABILITIES_MODEL = 0.5
X_val = X_val[0:1]
y_val = y_val[0:1]

# Load tokenizer
tokenizer_model = AutoTokenizer.from_pretrained(MODEL_NAME)

# Prepare validation dataset and dataloader
dataset_loader_X_val = CustomDataset(
    dataframe=X_val,
    comment_str_title='title_text_join',
    target_variable=y_val,
    tokenizer=tokenizer_model,
    max_token_len=MAX_TOKENS
)

val_loader = DataLoader(
    dataset=dataset_loader_X_val,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=custom_collate_fn,
    num_workers=NUM_WORKERS,
    pin_memory=True,
    drop_last=True
)

# Inference time calculation
total_time = 0
for _ in range(ITERATION_SAMPLES):
    start_time = time.time()
    test_accuracy, test_f1, all_preds, all_labels, all_predict_prob, inputs_ids = evaluate_model(
        loaded_model, val_loader, device, THRESHOLD_PROBABILITIES_MODEL, 1
    )
    end_time = time.time()
    total_time += (end_time - start_time)

average_time = total_time / ITERATION_SAMPLES
time_per_sample = average_time / len(dataset_loader_X_val)

print(f"In {ITERATION_SAMPLES} iterations, it takes {total_time:.4f} seconds.")
print(f"Average time per iteration: {average_time:.4f} seconds.")
print(f"Time per sample: {time_per_sample:.4f} seconds.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In 100 iterations, it takes 5.9273 seconds.
Average time per iteration: 0.0593 seconds.
Time per sample: 0.0593 seconds.
