# **Libraries & Dependencies**


In [None]:
from transformers import BertTokenizer
from transformers import MarianMTModel, MarianTokenizer
import nltk
import pandas as pd
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from rich import print
from rich.table import Table
from rich.console import Console
import shutil
import torch
import sacremoses
import sentencepiece
from tqdm.auto import tqdm

# Lift pandas' display limits so DataFrames are rendered with every column
# and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Column count of the current terminal, as reported by shutil.
terminal_width = shutil.get_terminal_size().columns

# **Data Information & Translation**

<li>Check the target (type, priority, queue) distributions</li>
<li>Translate German (de) to English (en)</li>


In [None]:
# Load the ticket CSV. `data` holds the frame returned by read_csv and
# `tiket_df` is a separate DataFrame object built from it, which the rest of
# this cell inspects.
data = pd.read_csv("multilang-ticket.csv")
tiket_df = pd.DataFrame(data)
console = Console(width=terminal_width)

# Preview of the first rows.
console.rule("[bold white]Data Information & Representation")
display(tiket_df.head())

console.rule("[bold white]Data Info")
print("data info:")
# DataFrame.info() prints its summary and returns None, so wrapping it in
# display() only added a stray "None" to the output. verbose=True keeps the
# full per-column summary that the previous truthy positional argument (5)
# was requesting.
tiket_df.info(verbose=True)

# Split the column names by dtype: numeric vs. string/object.
numeric_columns = tiket_df.select_dtypes(include=[np.number]).columns
categorical_columns = tiket_df.select_dtypes(include=["string", "object"]).columns
print(f"Data numerik mencakup: {numeric_columns.to_list()}")
print(f"Data kategorikal mencakup: {categorical_columns.to_list()}")

# Missing-value count per column.
console.rule("[bold white]Null Handling")
print(tiket_df.isnull().sum())

# console.rule("[bold white]Split Data Bahasa Inggris")
# en_df = tiket_df[tiket_df["language"] == "en"]
# en_df = pd.DataFrame(en_df)
# display(en_df[["language", "body"]].head())
# print(en_df["body"].count())
# console.rule("[bold white]Split Data Bahasa Jerman")
# de_df = tiket_df[tiket_df["language"] == "de"]
# display(de_df[["language", "body"]].head(5))
# print(de_df["body"].count())

# **Visualization of Target Data Distribution**


In [None]:
def _plot_distribution(counts, title):
    """Draw and show one pie chart for a ``value_counts()`` result.

    Args:
        counts: Series of counts; its index supplies the slice labels and
            its length the number of palette colors.
        title: Text placed above the chart.
    """
    plt.figure(figsize=(6, 6))
    plt.pie(
        counts,
        labels=counts.index,
        autopct="%1.1f%%",
        startangle=90,
        colors=sns.color_palette("mako", len(counts)),
    )
    plt.title(title)
    # Equal axis scaling keeps the pie circular.
    plt.axis("equal")
    plt.show()


# One chart per column. Each *_counts Series stays bound at module level so
# other cells can keep using it.
language_counts = tiket_df["language"].value_counts()
_plot_distribution(language_counts, "Language Distribution")

type_counts = tiket_df["type"].value_counts()
_plot_distribution(type_counts, "Type Distribution")

priority_counts = tiket_df["priority"].value_counts()
_plot_distribution(priority_counts, "Priority Distribution")

queue_counts = tiket_df["queue"].value_counts()
_plot_distribution(queue_counts, "Queue Distribution")

# **Terjemahkan Data "de" ke "en"**

Load the Helsinki-NLP/opus-mt-de-en model


In [None]:
# Checkpoint identifier for the German-to-English translation model. The same
# name is passed to both loaders so the tokenizer and the model come from the
# same checkpoint.
model_name = "Helsinki-NLP/opus-mt-de-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

In [None]:
# Register tqdm's progress_* methods on pandas objects; the progress_apply
# call at the end of this cell relies on this.
tqdm.pandas()


def translate_german_to_english(text):
    """Translate one German string to English using the module-level model.

    The function is best-effort and never raises: whenever a translation is
    not produced, the input is handed back unchanged so that a row-wise
    DataFrame apply can keep going.

    Args:
        text: Value to translate. Only non-blank ``str`` values are sent to
            the model.

    Returns:
        The English translation for a non-blank string. ``text`` itself is
        returned for non-string input, for empty/whitespace-only strings,
        and when any exception is raised during translation.
    """
    if not isinstance(text, str):
        print(f"Skipping non-string input: {text}")
        return text
    if not text.strip():
        # Nothing to translate; do not run the model on blank input.
        return text
    try:
        # truncation=True clips the input to the tokenizer's configured
        # maximum length, so an over-length body is translated up to that
        # limit instead of risking a failure inside the model and falling
        # back to the untranslated German text.
        tokenized_text = tokenizer(text, return_tensors="pt", truncation=True)
        translated = model.generate(**tokenized_text)
        # One input sequence goes in, so the first decoded entry is the result.
        return tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
    except Exception as e:
        # Deliberate catch-all: one bad row must not abort the whole run.
        print(f"Error translating text: '{text}' - {e}")
        return text


def process_body(row):
    """Return the row's body, translated to English when the row is German.

    Args:
        row: Mapping-like row exposing ``"language"`` and ``"body"`` keys.

    Returns:
        ``row["body"]`` unchanged unless ``row["language"]`` equals ``"de"``,
        in which case the result of ``translate_german_to_english`` is
        returned.
    """
    # Guard clause: anything that is not tagged German passes straight through.
    if row["language"] != "de":
        return row["body"]
    return translate_german_to_english(row["body"])


# Apply process_body to every row (axis=1) with a tqdm progress bar: rows
# tagged "de" go through the translator, every other row keeps its body.
tiket_df["translated_body"] = tiket_df.progress_apply(process_body, axis=1)
# Show the original and the resulting text side by side.
display(tiket_df[["body", "translated_body"]])
# Save the frame, including the new column, without the index.
# NOTE(review): "boy" in the file name looks like a typo for "body"; it is
# left untouched because other code may read this exact path. Confirm before
# renaming.
tiket_df.to_csv("translated_boy_multilang_ticket.csv", index=False)