In [3]:
from transformers import BertTokenizer
from transformers import MarianMTModel, MarianTokenizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import pandas as pd
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from rich import print
from rich.table import Table
from rich.console import Console
import shutil
import torch
import sacremoses
import sentencepiece
from tqdm.auto import tqdm

# **Preprocessing**

<ul>
<li>Stop Words Removal</li>
<li>Punctuation Removal</li>
<li>Case Folding</li>
</ul>


In [None]:
# Register tqdm's `progress_apply` on pandas objects (used when cleaning below).
tqdm.pandas()

# Load the translated tickets; `tiket_df` is the frame the later cells work on.
df = pd.read_csv("translated_data.csv")
tiket_df = pd.DataFrame(data=df)

# Lookup sets consulted by `clean_text`; sets keep each membership test cheap.
punctuation = set(string.punctuation)
stop_words = set(stopwords.words("english"))


def clean_text(text):
    """Lower-case *text* and drop English stop words and punctuation tokens.

    Args:
        text: Raw text to clean. Any value that is not a ``str`` (for example
            a missing cell) is treated as having no content.

    Returns:
        The remaining tokens joined by single spaces, or ``""`` when *text*
        is not a string.
    """
    if not isinstance(text, str):
        # Return an empty string instead of None: the column is later cast
        # with astype(str), which would turn None into the literal "None"
        # and feed that word to the tokenizer.
        return ""
    tokens = word_tokenize(text.lower())
    kept = [
        token
        for token in tokens
        if token not in stop_words and token not in punctuation
    ]
    return " ".join(kept)


# Clean every translated ticket body; progress_apply shows a tqdm progress bar.
translated_bodies = tiket_df["translated_body"]
tiket_df["cleaned_text"] = translated_bodies.progress_apply(clean_text)

# Preview the cleaned text next to its source text for a quick visual check.
preview_columns = ["cleaned_text", "translated_body"]
display(tiket_df[preview_columns].head(15))

 88%|████████▊ | 17643/20000 [00:03<00:00, 4505.04it/s]

100%|██████████| 20000/20000 [00:04<00:00, 4521.26it/s]


Unnamed: 0,cleaned_text,translated_body
0,data analysis platform unexpectedly broke memo...,The data analysis platform unexpectedly broke ...
1,seeking information digital strategies aid bra...,Seeking information on digital strategies that...
2,contacting request information data analytics ...,I am contacting you to request information on ...
3,media data blocking behavior occurred due unau...,A media data blocking behavior occurred due to...
4,dear customer support reaching inquire securit...,"Dear Customer Support, I am reaching out to in..."
5,inquiring best practices securing medical data...,Inquiring about best practices for securing me...
6,advice whether possible backup medical data hu...,Advice on whether it is possible to backup med...
7,integration stopped working unexpectedly causi...,"The integration stopped working unexpectedly, ..."
8,dear customer support writing regards recently...,"Dear Customer Support, I am writing in regards..."
9,latest data analysis reports inconsistent pote...,The latest data analysis reports are inconsist...


# **BERT CLS Tokenization**


In [None]:
# Load the tokenizer that belongs to the pretrained `bert-base-uncased` checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode every cleaned text in a single batch call. The values are cast to str
# first so the tokenizer only ever receives strings.
cleaned_texts = tiket_df["cleaned_text"].astype(str).tolist()
tokenized_output = tokenizer(
    cleaned_texts,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

print("Input IDs (first 5 samples):")
# Store the ids as plain Python lists, one list per row.
input_id_rows = tokenized_output["input_ids"].tolist()
tiket_df["tokenized_body"] = input_id_rows
display(tiket_df[["tokenized_body", "cleaned_text"]].head(5))

# NOTE(review): this overwrites the same CSV that was read as input, now with
# the extra derived columns - confirm that re-running the notebook on the
# modified file is intended.
tiket_df.to_csv("translated_data.csv", index=False)

Unnamed: 0,tokenized_body,cleaned_text
0,"[101, 2951, 4106, 4132, 14153, 3631, 3638, 827...",data analysis platform unexpectedly broke memo...
1,"[101, 6224, 2592, 3617, 9942, 4681, 4435, 3930...",seeking information digital strategies aid bra...
2,"[101, 3967, 2075, 5227, 2592, 2951, 25095, 590...",contacting request information data analytics ...
3,"[101, 2865, 2951, 10851, 5248, 4158, 2349, 246...",media data blocking behavior occurred due unau...
4,"[101, 6203, 8013, 2490, 4285, 1999, 15549, 289...",dear customer support reaching inquire securit...


['[CLS]',
 'seeking',
 'information',
 'digital',
 'strategies',
 'aid',
 'brand',
 'growth',
 'details',
 'available',
 'services',
 'looking',
 'forward',
 'learning',
 'help',
 'business',
 'grow',
 'thank',
 'look',
 'forward',
 'hearing',
 'soon',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]