# Libraries

In [9]:
import os
import json
import pandas as pd
from openai import OpenAI

# Show full text cells instead of truncating them. The fully-qualified option
# name is used because pandas only resolves abbreviated names while they stay
# unambiguous, which can change when new options are introduced.
pd.set_option('display.max_colwidth', None)

# Config

In [18]:
# Read the API key through a context manager so the secrets file handle is
# closed as soon as the key has been loaded (a bare `open(...)` leaves it open).
with open('secrets.json') as secrets_file:
    client = OpenAI(api_key=json.load(secrets_file)['OPENAPI_SECRET_KEY'])

# Dataset split to process; every path below is derived from it.
split = 'train'
data_path = f'data/public_data_dev/track_a/{split}/sun.csv'
back_translated_data_dir = 'data/back_translated_data_raw/track_a/sun'
data_sun2idn_path = os.path.join(back_translated_data_dir, f'{split}_sun2idn.csv')
data_sun2idn2sun_path = os.path.join(back_translated_data_dir, f'{split}_sun2idn2sun.csv')
final_data_path = os.path.join(back_translated_data_dir, f'{split}_final.csv')

print("Back-translated data directory:", back_translated_data_dir)
print("sun2idn data path:", data_sun2idn_path)
print("sun2idn2sun data path:", data_sun2idn2sun_path)
print("final data path:", final_data_path)

Back-translated data directory: data/back_translated_data_raw/track_a/sun
sun2idn data path: data/back_translated_data_raw/track_a/sun/train_sun2idn.csv
sun2idn2sun data path: data/back_translated_data_raw/track_a/sun/train_sun2idn2sun.csv
final data path: data/back_translated_data_raw/track_a/sun/train_final.csv


# Data

## Load Data

In [11]:
# Read the source split configured in `data_path`, report its row count,
# and preview the first rows.
df = pd.read_csv(filepath_or_buffer=data_path)
print("DF size:", df.shape[0])
df.head()

DF size: 924


Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise
0,sun_train_track_a_00001,Kumaha mang fiksi engke ka sabilulungan mang salam bandung soreang,0,0,0,1,0,1
1,sun_train_track_a_00002,tapi domba anakan namah lain ku kuring mreun. 😂 ku domba dai nyh,0,0,0,1,0,1
2,sun_train_track_a_00003,Aduh mang naha bet kudu penting di upload ma😂😂,0,0,0,1,0,0
3,sun_train_track_a_00004,"pokonamah nuhun sabandungeun , kita terus tingkatkeun skill babarengan mang :) KAPAN NIKAH ? :(",0,0,0,1,1,0
4,sun_train_track_a_00005,Kang eta teu isin?? Apa emng tukang ngisinkeun wkwk😅,0,0,0,1,0,1


## Translate Data from Sundanese to Indonesian

In [12]:
# Prompt template for the Sundanese -> Indonesian step; `{text_sun}` is filled
# in with the source sentence via `str.format`.
translate_sun2idn_prompt = (
    "Terjemahkan teks bahasa Sunda ke bahasa Indonesia untuk keperluan dataset NLP\n"
    "\n"
    "Teks bahasa Sunda: {text_sun}\n"
    "Teks bahasa Indonesia: "
)

def translate_sun2idn(text_sun, max_tokens=100):
    """Translate one Sundanese text to Indonesian with a single chat completion.

    The text is inserted into `translate_sun2idn_prompt` and sent as the only
    user message to the `gpt-4o` model through the module-level `client`.

    Args:
        text_sun: Sundanese source text.
        max_tokens: Upper bound on generated tokens (default 100, the value
            that was previously hard-coded).

    Returns:
        The message content of the first returned choice.

    Warns:
        UserWarning: if the completion stopped because it reached `max_tokens`,
            i.e. the returned translation is cut off.
    """
    import warnings

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": translate_sun2idn_prompt.format(text_sun=text_sun),
            },
        ],
        max_tokens=max_tokens,
    )
    choice = completion.choices[0]

    # A "length" finish reason means the model hit the token cap; surface it
    # so truncated translations do not silently end up in the dataset.
    if choice.finish_reason == "length":
        warnings.warn(
            f"Translation truncated at max_tokens={max_tokens} for input: {text_sun!r}"
        )
    return choice.message.content

# Translate every row of `df` and store the result in the `text_sun2idn` column.
# Rows that already hold a translation are skipped, so re-running this cell
# after an interruption (e.g. an API error) resumes where it stopped instead of
# sending every row again. Drop the `text_sun2idn` column to force a full re-run.
if 'text_sun2idn' not in df.columns:
    df['text_sun2idn'] = None

for i, row in df.iterrows():
    if pd.notna(df.at[i, 'text_sun2idn']):
        continue

    print("=" * 64, "\n", i+1, "\n", "=" * 64)
    print("Teks bahasa Sunda:", row['text'])

    text_sun2idn = translate_sun2idn(row['text'])
    df.at[i, 'text_sun2idn'] = text_sun2idn

    print("Teks bahasa Indonesia:", text_sun2idn)
    print()

 1 
Teks bahasa Sunda: Kumaha mang fiksi engke ka sabilulungan mang salam bandung soreang
Teks bahasa Indonesia: Bagaimana nanti cerita fiksi ke komunitas, salam dari Bandung Soreang.

 2 
Teks bahasa Sunda: tapi domba anakan namah lain ku kuring mreun. 😂 ku domba dai nyh
Teks bahasa Indonesia: tapi anak domba itu mungkin bukan punya saya. 😂 itu punya domba katanya

 3 
Teks bahasa Sunda: Aduh mang naha bet kudu penting di upload ma😂😂
Teks bahasa Indonesia: Aduh kang kenapa harus penting diupload mah😂😂

 4 
Teks bahasa Sunda: pokonamah nuhun sabandungeun , kita terus tingkatkeun skill babarengan mang :) KAPAN NIKAH ? :(
Teks bahasa Indonesia: Pokoknya terima kasih banyak, kita terus tingkatkan keterampilan bersama ya :) KAPAN NIKAH? :(

 5 
Teks bahasa Sunda: Kang eta teu isin?? Apa emng tukang ngisinkeun wkwk😅
Teks bahasa Indonesia: Apakah dia tidak malu?? Apa memang tukang membuat malu wkwk😅

 6 
Teks bahasa Sunda: Artis artis saha anu sok ditungguan setiap bulan puasa??? Afan magrib

In [14]:
# `to_csv` does not create missing parent directories; create the output
# directory first so the translated rows are not lost to an OSError on save.
sun2idn_out_dir = os.path.dirname(data_sun2idn_path)
if sun2idn_out_dir:
    os.makedirs(sun2idn_out_dir, exist_ok=True)

df.to_csv(data_sun2idn_path, index=False)
print("Saved to:", data_sun2idn_path)

Saved to: data/back_translated_data_raw/track_a/sun/train_sun2idn.csv


## Back-Translate Data from Indonesian to Sundanese

In [16]:
# Build the curated-file path from the configured directory and split instead
# of hard-coding `train`, so changing `split` in the config cell is honoured.
# With split == 'train' this resolves to the same file as before.
data_sun2idn_curated_path = os.path.join(back_translated_data_dir, f'{split}_sun2idn_curated.csv')
df_sun2idn = pd.read_csv(data_sun2idn_curated_path)
df_sun2idn.head()

Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise,text_sun2idn
0,sun_train_track_a_00001,Kumaha mang fiksi engke ka sabilulungan mang salam bandung soreang,0,0,0,1,0,1,"Bagaimana nanti cerita fiksi ke komunitas, salam dari Bandung Soreang."
1,sun_train_track_a_00002,tapi domba anakan namah lain ku kuring mreun. 😂 ku domba dai nyh,0,0,0,1,0,1,tapi anak domba itu mungkin bukan punya saya. 😂 itu punya domba katanya
2,sun_train_track_a_00003,Aduh mang naha bet kudu penting di upload ma😂😂,0,0,0,1,0,0,Aduh kang kenapa harus penting diupload mah😂😂
3,sun_train_track_a_00004,"pokonamah nuhun sabandungeun , kita terus tingkatkeun skill babarengan mang :) KAPAN NIKAH ? :(",0,0,0,1,1,0,"Pokoknya terima kasih banyak, kita terus tingkatkan keterampilan bersama ya :) KAPAN NIKAH? :("
4,sun_train_track_a_00005,Kang eta teu isin?? Apa emng tukang ngisinkeun wkwk😅,0,0,0,1,0,1,Apakah dia tidak malu?? Apa memang tukang membuat malu wkwk😅


In [17]:
# Prompt template for the Indonesian -> Sundanese back-translation step.
# The instruction line must name the same direction as the two field labels
# below it (Indonesian input, Sundanese output); it previously asked for the
# opposite direction (Sundanese -> Indonesian).
translate_sun2idn2sun_prompt = """Tarjamahkeun teks basa Indonesia ka basa Sunda pikeun kapentingan dataset NLP

Teks basa Indonesia: {text_sun2idn}
Teks basa Sunda: """

def translate_sun2idn2sun(text_sun2idn, max_tokens=100):
    """Back-translate one Indonesian text to Sundanese with a single chat completion.

    The text is inserted into `translate_sun2idn2sun_prompt` and sent as the
    only user message to the `gpt-4o` model through the module-level `client`.

    Args:
        text_sun2idn: Indonesian text to back-translate.
        max_tokens: Upper bound on generated tokens (default 100, the value
            that was previously hard-coded).

    Returns:
        The message content of the first returned choice.

    Warns:
        UserWarning: if the completion stopped because it reached `max_tokens`,
            i.e. the returned translation is cut off.
    """
    import warnings

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": translate_sun2idn2sun_prompt.format(text_sun2idn=text_sun2idn),
            },
        ],
        max_tokens=max_tokens,
    )
    choice = completion.choices[0]

    # A "length" finish reason means the model hit the token cap; surface it
    # so truncated back-translations do not silently end up in the dataset.
    if choice.finish_reason == "length":
        warnings.warn(
            f"Back-translation truncated at max_tokens={max_tokens} for input: {text_sun2idn!r}"
        )
    return choice.message.content

# Back-translate every row of `df_sun2idn` and store the result in the
# `text_sun2idn2sun` column. Rows that already hold a back-translation are
# skipped, so re-running this cell after an interruption (e.g. an API error)
# resumes where it stopped instead of sending every row again. Drop the
# `text_sun2idn2sun` column to force a full re-run.
if 'text_sun2idn2sun' not in df_sun2idn.columns:
    df_sun2idn['text_sun2idn2sun'] = None

for i, row in df_sun2idn.iterrows():
    if pd.notna(df_sun2idn.at[i, 'text_sun2idn2sun']):
        continue

    print("=" * 64, "\n", i+1, "\n", "=" * 64)
    print("Teks basa Indonesia:", row['text_sun2idn'])

    text_sun2idn2sun = translate_sun2idn2sun(row['text_sun2idn'])
    df_sun2idn.at[i, 'text_sun2idn2sun'] = text_sun2idn2sun

    print("Teks basa Sunda:", text_sun2idn2sun)
    print()

 1 
Teks basa Indonesia: Bagaimana nanti cerita fiksi ke komunitas, salam dari Bandung Soreang.
Teks basa Sunda: Kumaha engke carita fiksi ka komunitas, salam ti Bandung Soreang.

 2 
Teks basa Indonesia: tapi anak domba itu mungkin bukan punya saya. 😂 itu punya domba katanya
Teks basa Sunda: Tapi anak domba éta sigana lain aing nu boga. 😂 Éta mah kagungan domba cenah.

 3 
Teks basa Indonesia: Aduh kang kenapa harus penting diupload mah😂😂
Teks basa Sunda: Aduh kang kunaon kudu penting diunggah mah😂😂

 4 
Teks basa Indonesia: Pokoknya terima kasih banyak, kita terus tingkatkan keterampilan bersama ya :) KAPAN NIKAH? :(
Teks basa Sunda: Pokokna hapunten ageung, urang terus ningkatkeun kaahlian babarengan nya :) IRaha kawin? :(

 5 
Teks basa Indonesia: Apakah dia tidak malu?? Apa memang tukang membuat malu wkwk😅
Teks basa Sunda: Naha manehna teu era?? Na memang tukang nyieun era wkwk😅

 6 
Teks basa Indonesia: Artis artis siapa yang selalu dinantikan setiap bulan puasa??? Adzan magrib
T

In [20]:
# Write the back-translated frame (without the index column) to the configured
# path, then echo where it went.
df_sun2idn.to_csv(path_or_buf=data_sun2idn2sun_path, index=False)
print("Saved to:", data_sun2idn2sun_path)

Saved to: data/back_translated_data_raw/track_a/sun/train_sun2idn2sun.csv


## Postprocess Back-Translated Data

In [21]:
# Build the curated-file path from the configured directory and split instead
# of hard-coding `train`, so changing `split` in the config cell is honoured.
# With split == 'train' this resolves to the same file as before.
data_sun2idn2sun_curated_path = os.path.join(back_translated_data_dir, f'{split}_sun2idn2sun_curated.csv')
df_sun2idn2sun = pd.read_csv(data_sun2idn2sun_curated_path)
df_sun2idn2sun.head()

Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise,text_sun2idn,text_sun2idn2sun
0,sun_train_track_a_00001,Kumaha mang fiksi engke ka sabilulungan mang salam bandung soreang,0,0,0,1,0,1,"Bagaimana nanti cerita fiksi ke komunitas, salam dari Bandung Soreang.","Kumaha engke carita fiksi ka komunitas, salam ti Bandung Soreang."
1,sun_train_track_a_00002,tapi domba anakan namah lain ku kuring mreun. 😂 ku domba dai nyh,0,0,0,1,0,1,tapi anak domba itu mungkin bukan punya saya. 😂 itu punya domba katanya,Tapi anak domba éta sigana lain aing nu boga. 😂 Éta mah kagungan domba cenah.
2,sun_train_track_a_00003,Aduh mang naha bet kudu penting di upload ma😂😂,0,0,0,1,0,0,Aduh kang kenapa harus penting diupload mah😂😂,Aduh kang kunaon kudu penting diunggah mah😂😂
3,sun_train_track_a_00004,"pokonamah nuhun sabandungeun , kita terus tingkatkeun skill babarengan mang :) KAPAN NIKAH ? :(",0,0,0,1,1,0,"Pokoknya terima kasih banyak, kita terus tingkatkan keterampilan bersama ya :) KAPAN NIKAH? :(","Pokokna hapunten ageung, urang terus ningkatkeun kaahlian babarengan nya :) IRaha kawin? :("
4,sun_train_track_a_00005,Kang eta teu isin?? Apa emng tukang ngisinkeun wkwk😅,0,0,0,1,0,1,Apakah dia tidak malu?? Apa memang tukang membuat malu wkwk😅,Naha manehna teu era?? Na memang tukang nyieun era wkwk😅


In [22]:
# Build the final frame: `text` carries the back-translated Sundanese and the
# two intermediate translation columns are removed. `assign` returns a new
# frame, so the loaded `df_sun2idn2sun` keeps its original `text` column and
# this cell can be re-run or inspected without losing the source sentences.
final_df = (
    df_sun2idn2sun
    .assign(text=df_sun2idn2sun['text_sun2idn2sun'])
    .drop(columns=['text_sun2idn', 'text_sun2idn2sun'])
)
final_df.head()

Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise
0,sun_train_track_a_00001,"Kumaha engke carita fiksi ka komunitas, salam ti Bandung Soreang.",0,0,0,1,0,1
1,sun_train_track_a_00002,Tapi anak domba éta sigana lain aing nu boga. 😂 Éta mah kagungan domba cenah.,0,0,0,1,0,1
2,sun_train_track_a_00003,Aduh kang kunaon kudu penting diunggah mah😂😂,0,0,0,1,0,0
3,sun_train_track_a_00004,"Pokokna hapunten ageung, urang terus ningkatkeun kaahlian babarengan nya :) IRaha kawin? :(",0,0,0,1,1,0
4,sun_train_track_a_00005,Naha manehna teu era?? Na memang tukang nyieun era wkwk😅,0,0,0,1,0,1


In [23]:
# Write the final augmented frame (without the index column) to the configured
# path, then echo where it went.
final_df.to_csv(path_or_buf=final_data_path, index=False)
print("Saved to:", final_data_path)

Saved to: data/back_translated_data_raw/track_a/sun/train_final.csv
