In [1]:
!nvidia-smi

Sun Mar  2 08:07:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.08             Driver Version: 550.127.08     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          On  |   00000000:4E:00.0 Off |                    0 |
| N/A   30C    P0             55W /  400W |       1MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import os
import torch

import numpy as np
import pandas as pd
import pytorch_lightning as pl

from codealltag_data_processor_v2025 import CodealltagDataProcessor
from pandas import DataFrame
from tqdm import tqdm
from transformers import MT5ForConditionalGeneration, MT5TokenizerFast
from typing import Any, Dict, List, Tuple

  warn(f"Failed to load image Python extension: {e}")


In [3]:
cdp_2022 = CodealltagDataProcessor(data_version='20220513', config_path=['codealltag_data_processor.yml'])
cdp_2020 = CodealltagDataProcessor(data_version='20200518', config_path=['codealltag_data_processor.yml'])

In [4]:
sample_sizes: List[int] = [1000 * item for item in range(3, 10+1)]
used_file_paths: List[str] = list()
for sample_size in sample_sizes:
    dataset = cdp_2022.get_train_dev_test_datasetdict_for_sample_size(cdp_2020, sample_size, 1)
    train_df = dataset["train"].to_pandas()
    dev_df = dataset["dev"].to_pandas()
    test_df = dataset["test"].to_pandas()
    used_file_paths.extend(train_df.FilePath.tolist())
    used_file_paths.extend(dev_df.FilePath.tolist())
    used_file_paths.extend(test_df.FilePath.tolist())

In [5]:
len(set(used_file_paths))

35195

In [6]:
category_path_df = cdp_2022.get_category_path_df()
category_path_df.shape[0]

1468942

In [7]:
unused_category_path_df = category_path_df[
    ~category_path_df.FilePath.isin(used_file_paths)
]
unused_category_path_df.shape[0]

1433747

In [8]:
unused_category_path_df = unused_category_path_df[
    unused_category_path_df.FileSize <= cdp_2022.get_max_file_size()
]
unused_category_path_df.shape[0]

1243594

In [9]:
cdp_2022.get_category_or_label_wise_count_or_ratio(unused_category_path_df)

{'TRAVELS': 134127,
 'PHILOSOPHY': 137344,
 'FINANCE': 140041,
 'MOVIES': 173947,
 'EVENTS': 209331,
 'GERMAN': 223174,
 'TEENS': 225630}

In [10]:
annotation_df = cdp_2022.get_annotation_df()

In [11]:
unused_annotation_df = annotation_df[
    annotation_df.FilePath.isin(
        unused_category_path_df.FilePath.str.replace('.txt', '.ann', regex=True)
    )
]

In [12]:
cdp_2022.get_category_or_label_wise_count_or_ratio(unused_annotation_df, category_wise=False)

{'ZIP': 12047,
 'STREET': 12812,
 'STREETNO': 14103,
 'USER': 20475,
 'PHONE': 20640,
 'UFID': 33207,
 'FEMALE': 217537,
 'DATE': 244550,
 'CITY': 306691,
 'EMAIL': 383949,
 'ORG': 392836,
 'URL': 465147,
 'FAMILY': 855476,
 'MALE': 1474664}

In [13]:
female_fps = set(unused_annotation_df[unused_annotation_df.Label=='FEMALE'].FilePath.tolist())
phone_fps = set(unused_annotation_df[unused_annotation_df.Label=='PHONE'].FilePath.tolist())
street_fps = set(unused_annotation_df[unused_annotation_df.Label=='STREET'].FilePath.tolist())
streetno_fps = set(unused_annotation_df[unused_annotation_df.Label=='STREETNO'].FilePath.tolist())
ufid_fps = set(unused_annotation_df[unused_annotation_df.Label=='UFID'].FilePath.tolist())
user_fps = set(unused_annotation_df[unused_annotation_df.Label=='USER'].FilePath.tolist())
zip_fps = set(unused_annotation_df[unused_annotation_df.Label=='ZIP'].FilePath.tolist())

In [14]:
female_fps_5k = np.random.RandomState(
    cdp_2022.get_random_seed()
).choice(
    list(female_fps), size=5000, replace=False
).tolist()

phone_fps_2k = np.random.RandomState(
    cdp_2022.get_random_seed()
).choice(
    list(phone_fps), size=2000, replace=False
).tolist()

street_fps_2k = np.random.RandomState(
    cdp_2022.get_random_seed()
).choice(
    list(street_fps), size=2000, replace=False
).tolist()

streetno_fps_2k = np.random.RandomState(
    cdp_2022.get_random_seed()
).choice(
    list(streetno_fps), size=2000, replace=False
).tolist()

user_fps_2k = np.random.RandomState(
    cdp_2022.get_random_seed()
).choice(
    list(user_fps), size=2000, replace=False
).tolist()

ufid_fps_2k = np.random.RandomState(
    cdp_2022.get_random_seed()
).choice(
    list(ufid_fps), size=2000, replace=False
).tolist()

zip_fps_2k = np.random.RandomState(
    cdp_2022.get_random_seed()
).choice(
    list(zip_fps), size=2000, replace=False
).tolist()

In [15]:
aggregated_list = list(set(
    phone_fps_2k+street_fps_2k+streetno_fps_2k+user_fps_2k+ufid_fps_2k+zip_fps_2k+female_fps_5k
))

In [16]:
len(aggregated_list)

15705

In [17]:
aggregated_annotation_df = annotation_df[annotation_df.FilePath.isin(aggregated_list)]

In [18]:
len(set(aggregated_annotation_df.FilePath.tolist()))

15705

In [19]:
cdp_2022.get_category_or_label_wise_count_or_ratio(aggregated_annotation_df, category_wise=False)

{'USER': 3455,
 'DATE': 4330,
 'STREET': 5084,
 'ZIP': 5102,
 'UFID': 5144,
 'STREETNO': 5892,
 'PHONE': 6688,
 'FEMALE': 7391,
 'ORG': 10116,
 'EMAIL': 10240,
 'URL': 11603,
 'CITY': 11886,
 'FAMILY': 16773,
 'MALE': 24113}

In [20]:
selected_15K = np.random.RandomState(
    cdp_2022.get_random_seed()
).choice(
    aggregated_list, size=15000, replace=False
).tolist()

In [21]:
len(selected_15K)

15000

In [22]:
selected_annotation_df = annotation_df[annotation_df.FilePath.isin(selected_15K)]

In [23]:
cdp_2022.get_category_or_label_wise_count_or_ratio(selected_annotation_df, category_wise=False)

{'USER': 3304,
 'DATE': 4094,
 'STREET': 4875,
 'ZIP': 4877,
 'UFID': 4904,
 'STREETNO': 5650,
 'PHONE': 6388,
 'FEMALE': 7062,
 'ORG': 9614,
 'EMAIL': 9768,
 'URL': 11070,
 'CITY': 11362,
 'FAMILY': 15954,
 'MALE': 22974}

In [24]:
selected_category_path_df = category_path_df[
    category_path_df.FilePath.isin(
        selected_annotation_df.FilePath.str.replace('.ann', '.txt', regex=True)
    )
]

In [25]:
selected_category_path_df.shape[0]

15000

In [26]:
cdp_2022.get_category_or_label_wise_count_or_ratio(selected_category_path_df)

{'PHILOSOPHY': 1302,
 'GERMAN': 1754,
 'TRAVELS': 2057,
 'MOVIES': 2288,
 'TEENS': 2360,
 'FINANCE': 2469,
 'EVENTS': 2770}

In [27]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [28]:
cache_dir = os.path.join(*['/home', 's81481', '.huggingface'])

In [29]:
class LightningModel(pl.LightningModule):
    
    def __init__(self, hparam):
        super(LightningModel, self).__init__()
        self.hparam = hparam
        self.model = MT5ForConditionalGeneration.from_pretrained(hparam.model_name_or_path, cache_dir=cache_dir)
        self.tokenizer = MT5TokenizerFast.from_pretrained(hparam.model_name_or_path, cache_dir=cache_dir)

    def forward(self, 
                input_ids,
                attention_mask=None,
                decoder_input_ids=None,
                decoder_attention_mask=None,
                labels=None):
        
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels
        )

In [30]:
model_dir = f"logs/mT5/NER-PG/10K/k3/version_0/checkpoints/"
ckpt_name = next(iter(os.listdir(model_dir)), None)
model_path = os.path.join(model_dir, ckpt_name); model_path

'logs/mT5/NER-PG/10K/k3/version_0/checkpoints/epoch=03-step=00003-val_loss=1.3927.ckpt'

In [31]:
lightning_model = LightningModel.load_from_checkpoint(model_path)

/home/s81481/pseugc/lib/python3.9/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  return torch.load(checkpoint_file, map_location="cp

Downloading:   0%|          | 0.00/702 [00:00<?, ?B/s]

In [32]:
def predict(lightning_model: LightningModel, input_text: str):
    
    model = lightning_model.model
    tokenizer = lightning_model.tokenizer
    
    tokenized_outputs = tokenizer.batch_encode_plus(
        [input_text],
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    input_ids = tokenized_outputs["input_ids"]
    attention_mask = tokenized_outputs["attention_mask"]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model.eval()
    model.to(device)
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    outs = model.generate(input_ids=input_ids,
                          attention_mask=attention_mask,
                          max_length=512,
                          temperature=0.8,
                          do_sample=True,
                          top_k=100)
    dec = [
        tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip()
        for ids in outs
    ]

    return dec[0]

In [43]:
def create_predicted_text_df(df: DataFrame, lightning_model: LightningModel, repeat: int = 5) -> None:
    tuples: List[Tuple] = list()
    df_count: int = 0
    with tqdm(total=len(df), position=0, leave=True) as progress_bar:
        for idx, row in df.iterrows():
            row_items: List[str] = list()
            file_path = row.FilePath
            row_items.append(file_path)
            input_text = cdp_2022.read_email(file_path)[1]
            for repeat_num in range(0, repeat):
                generated_text = predict(lightning_model=lightning_model, input_text=input_text)
                row_items.append(generated_text)
            tuples.append(tuple(row_items))
            progress_bar.update(1)
            
            if len(tuples) % 3000 == 0:
                df_count += 1
                output_df = pd.DataFrame(tuples, columns=["FilePath", *[f'V{item+1}' for item in range(0, repeat)]])
                output_df.to_csv(f'PredictedText_DF_{cdp_2022.get_data_version()}_15K_{df_count}.csv')
                tuples = list()

In [None]:
create_predicted_text_df(df=selected_category_path_df, lightning_model=lightning_model)

 88%|████████████████████████████    | 13132/15000 [32:29:44<3:29:32,  6.73s/it]