In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import re
import string

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import torch
from bs4 import BeautifulSoup
from datasets import load_dataset, load_metric
from huggingface_hub import notebook_login
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Transformers library imports
from transformers import (
    AutoModelForSequenceClassification,
    BertTokenizerFast,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
    pipeline
)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# EDA

In [None]:
# Columns to keep from the PRDECT-ID CSV: the review text and its emotion label.
# The list is passed to `usecols` so the selection is defined in exactly one place.
col_names = ['Customer Review', 'Emotion']
df = pd.read_csv(
    '/kaggle/input/prdect-id-indonesian-emotion-classification/Product Reviews Dataset for Emotions Classification Tasks - Indonesian (PRDECT-ID) Dataset/PRDECT-ID Dataset.csv',
    usecols=col_names,
)
df.head()

In [None]:
# (rows, columns) of the loaded frame; only two columns were requested via `usecols`.
df.shape

In [None]:
# Summary of the string (object) columns only.
df.describe(include='object')

In [None]:
# Switch to the 'text' / 'label' column names that the rest of the notebook reads.
# Re-binding `df` (instead of mutating in place) keeps the cell safe to re-run.
df = df.rename(columns={'Customer Review': 'text', 'Emotion': 'label'})

In [None]:
# Is the data imbalanced? Percentage share of each emotion class.
# The column is called 'label' here because the previous cell renamed 'Emotion';
# `df.Emotion` would raise AttributeError.
df['label'].value_counts() / df.shape[0] * 100

In [None]:
# Bar chart of reviews per emotion; the column is 'label' after the rename cell above.
# The trailing ';' suppresses the Axes repr in the cell output.
sns.countplot(x='label', data=df);

In [None]:
# Stratified 80/10/10 split: first hold out 20%, then halve the hold-out into
# validation and test. Stratifying on 'label' keeps the class proportions in every
# split; the fixed random_state makes the split reproducible.
# `train_test_split` comes from sklearn.model_selection (imported in the top cell).
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42)

# Save the splits into separate CSV files
train_df.to_csv('reviews_train.csv', index=False)
val_df.to_csv('reviews_val.csv', index=False)
test_df.to_csv('reviews_test.csv', index=False)

# The message names the files that were actually written above.
print("Training, validation, and testing datasets have been saved to 'reviews_train.csv', 'reviews_val.csv', and 'reviews_test.csv'.")

In [None]:
# One CSV per split, read from the Kaggle input directory into a single DatasetDict
csv_by_split = {
    'train': '/kaggle/input/encoded-emotion-labels-prdect-id/reviews_train_er.csv',
    'eval': '/kaggle/input/encoded-emotion-labels-prdect-id/reviews_val_er.csv',
    'test': '/kaggle/input/encoded-emotion-labels-prdect-id/reviews_test_er.csv',
}
dataset = load_dataset('csv', data_files=csv_by_split)

In [None]:
# Drop the 'labels' column from every split; later cells read only 'text' and 'label'.
# NOTE(review): presumably 'labels' is an already-encoded copy of 'label' in these CSVs — confirm.
# NOTE(review): re-running this cell on the same `dataset` likely fails because the column is already gone.
dataset = dataset.remove_columns("labels")

In [None]:
# Peek at three training rows; the fixed shuffle seed shows the same rows on every run
sample = dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> Review: {row['text']}'", f"'>>> Label: {row['label']}'", sep="\n")

# Text Preprocessing

In [None]:
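# Emoticon lookup table. Each key is a regular-expression fragment (hence the
# backslash-escaped brackets) and each value is a short description of the
# emoticon; the keys are joined with '|' into one pattern by preprocess_text.
# Reference: https://github.com/NeelShah18/emot/blob/master/emot/emo_unicode.py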
EMOTICONS = {
    u":‑\)":"Happy face or smiley",
    u":\)":"Happy face or smiley",
    u":-\]":"Happy face or smiley",
    u":\]":"Happy face or smiley",
    u":-3":"Happy face smiley",
    u":3":"Happy face smiley",
    u":->":"Happy face smiley",
    u":>":"Happy face smiley",
    u"8-\)":"Happy face smiley",
    u":o\)":"Happy face smiley",
    u":-\}":"Happy face smiley",
    u":\}":"Happy face smiley",
    u":-\)":"Happy face smiley",
    u":c\)":"Happy face smiley",
    u":\^\)":"Happy face smiley",
    u"=\]":"Happy face smiley",
    u"=\)":"Happy face smiley",
    u":‑D":"Laughing, big grin or laugh with glasses",
    u":D":"Laughing, big grin or laugh with glasses",
    u"8‑D":"Laughing, big grin or laugh with glasses",
    u"8D":"Laughing, big grin or laugh with glasses",
    u"X‑D":"Laughing, big grin or laugh with glasses",
    u"XD":"Laughing, big grin or laugh with glasses",
    u"=D":"Laughing, big grin or laugh with glasses",
    u"=3":"Laughing, big grin or laugh with glasses",
    u"B\^D":"Laughing, big grin or laugh with glasses",
    u":-\)\)":"Very happy",
    u":‑\(":"Frown, sad, andry or pouting",
    u":-\(":"Frown, sad, andry or pouting",
    u":\(":"Frown, sad, andry or pouting",
    u":‑c":"Frown, sad, andry or pouting",
    u":c":"Frown, sad, andry or pouting",
    u":‑<":"Frown, sad, andry or pouting",
    u":<":"Frown, sad, andry or pouting",
    u":‑\[":"Frown, sad, andry or pouting",
    u":\[":"Frown, sad, andry or pouting",
    u":-\|\|":"Frown, sad, andry or pouting",
    u">:\[":"Frown, sad, andry or pouting",
    u":\{":"Frown, sad, andry or pouting",
    u":@":"Frown, sad, andry or pouting",
    u">:\(":"Frown, sad, andry or pouting",
    u":'‑\(":"Crying",
    u":'\(":"Crying",
    u":'‑\)":"Tears of happiness",
    u":'\)":"Tears of happiness",
    u"D‑':":"Horror",
    u"D:<":"Disgust",
    u"D:":"Sadness",
    u"D8":"Great dismay",
    u"D;":"Great dismay",
    u"D=":"Great dismay",
    u"DX":"Great dismay",
    u":‑O":"Surprise",
    u":O":"Surprise",
    u":‑o":"Surprise",
    u":o":"Surprise",
    u":-0":"Shock",
    u"8‑0":"Yawn",
    u">:O":"Yawn",
    u":-\*":"Kiss",
    u":\*":"Kiss",
    u":X":"Kiss",
    u";‑\)":"Wink or smirk",
    u";\)":"Wink or smirk",
    u"\*-\)":"Wink or smirk",
    u"\*\)":"Wink or smirk",
    u";‑\]":"Wink or smirk",
    u";\]":"Wink or smirk",
    u";\^\)":"Wink or smirk",
    u":‑,":"Wink or smirk",
    u";D":"Wink or smirk",
    u":‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"X‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"XP":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"d:":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"=p":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u">:P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":-[.]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":S":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":‑\|":"Straight face",
    u":\|":"Straight face",
    u":$":"Embarrassed or blushing",
    u":‑x":"Sealed lips or wearing braces or tongue-tied",
    u":x":"Sealed lips or wearing braces or tongue-tied",
    u":‑#":"Sealed lips or wearing braces or tongue-tied",
    u":#":"Sealed lips or wearing braces or tongue-tied",
    u":‑&":"Sealed lips or wearing braces or tongue-tied",
    u":&":"Sealed lips or wearing braces or tongue-tied",
    u"O:‑\)":"Angel, saint or innocent",
    u"O:\)":"Angel, saint or innocent",
    u"0:‑3":"Angel, saint or innocent",
    u"0:3":"Angel, saint or innocent",
    u"0:‑\)":"Angel, saint or innocent",
    u"0:\)":"Angel, saint or innocent",
    u":‑b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"0;\^\)":"Angel, saint or innocent",
    u">:‑\)":"Evil or devilish",
    u">:\)":"Evil or devilish",
    u"\}:‑\)":"Evil or devilish",
    u"\}:\)":"Evil or devilish",
    u"3:‑\)":"Evil or devilish",
    u"3:\)":"Evil or devilish",
    u">;\)":"Evil or devilish",
    u"\|;‑\)":"Cool",
    u"\|‑O":"Bored",
    u":‑J":"Tongue-in-cheek",
    u"#‑\)":"Party all night",
    u"%‑\)":"Drunk or confused",
    u"%\)":"Drunk or confused",
    u":-###..":"Being sick",
    u":###..":"Being sick",
    u"<:‑\|":"Dump",
    u"\(>_<\)":"Troubled",
    u"\(>_<\)>":"Troubled",
    u"\(';'\)":"Baby",
    u"\(\^\^>``":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(\^_\^;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(~_~;\) \(・\.・;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-\)zzz":"Sleeping",
    u"\(\^_-\)":"Wink",
    u"\(\(\+_\+\)\)":"Confused",
    u"\(\+o\+\)":"Confused",
    u"\(o\|o\)":"Ultraman",
    u"\^_\^":"Joyful",
    u"\(\^_\^\)/":"Joyful",
    u"\(\^O\^\)／":"Joyful",
    u"\(\^o\^\)／":"Joyful",
    u"\(__\)":"Kowtow as a sign of respect, or dogeza for apology",
    u"_\(\._\.\)_":"Kowtow as a sign of respect, or dogeza for apology",
    u"<\(_ _\)>":"Kowtow as a sign of respect, or dogeza for apology",
    u"<m\(__\)m>":"Kowtow as a sign of respect, or dogeza for apology",
    u"m\(__\)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"m\(_ _\)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"\('_'\)":"Sad or Crying",
    u"\(/_;\)":"Sad or Crying",
    u"\(T_T\) \(;_;\)":"Sad or Crying",
    u"\(;_;":"Sad of Crying",
    u"\(;_:\)":"Sad or Crying",
    u"\(;O;\)":"Sad or Crying",
    u"\(:_;\)":"Sad or Crying",
    u"\(ToT\)":"Sad or Crying",
    u";_;":"Sad or Crying",
    u";-;":"Sad or Crying",
    u";n;":"Sad or Crying",
    u";;":"Sad or Crying",
    u"Q\.Q":"Sad or Crying",
    u"T\.T":"Sad or Crying",
    u"QQ":"Sad or Crying",
    u"Q_Q":"Sad or Crying",
    u"\(-\.-\)":"Shame",
    u"\(-_-\)":"Shame",
    u"\(一一\)":"Shame",
    u"\(；一_一\)":"Shame",
    u"\(=_=\)":"Tired",
    u"\(=\^\·\^=\)":"cat",
    u"\(=\^\·\·\^=\)":"cat",
    u"=_\^=	":"cat",
    u"\(\.\.\)":"Looking down",
    u"\(\._\.\)":"Looking down",
    u"\^m\^":"Giggling with hand covering mouth",
    u"\(\・\・?":"Confusion",
    u"\(?_?\)":"Confusion",
    u">\^_\^<":"Normal Laugh",
    u"<\^!\^>":"Normal Laugh",
    u"\^/\^":"Normal Laugh",
    u"\（\*\^_\^\*）" :"Normal Laugh",
    u"\(\^<\^\) \(\^\.\^\)":"Normal Laugh",
    u"\(^\^\)":"Normal Laugh",
    u"\(\^\.\^\)":"Normal Laugh",
    u"\(\^_\^\.\)":"Normal Laugh",
    u"\(\^_\^\)":"Normal Laugh",
    u"\(\^\^\)":"Normal Laugh",
    u"\(\^J\^\)":"Normal Laugh",
    u"\(\*\^\.\^\*\)":"Normal Laugh",
    u"\(\^—\^\）":"Normal Laugh",
    u"\(#\^\.\^#\)":"Normal Laugh",
    u"\（\^—\^\）":"Waving",
    u"\(;_;\)/~~~":"Waving",
    u"\(\^\.\^\)/~~~":"Waving",
    u"\(-_-\)/~~~ \($\·\·\)/~~~":"Waving",
    u"\(T_T\)/~~~":"Waving",
    u"\(ToT\)/~~~":"Waving",
    u"\(\*\^0\^\*\)":"Excited",
    u"\(\*_\*\)":"Amazed",
    u"\(\*_\*;":"Amazed",
    u"\(\+_\+\) \(@_@\)":"Amazed",
    u"\(\*\^\^\)v":"Laughing,Cheerful",
    u"\(\^_\^\)v":"Laughing,Cheerful",
    u"\(\(d[-_-]b\)\)":"Headphones,Listening to music",
    u'\(-"-\)':"Worried",
    u"\(ーー;\)":"Worried",
    u"\(\^0_0\^\)":"Eyeglasses",
    u"\(\＾ｖ\＾\)":"Happy",
    u"\(\＾ｕ\＾\)":"Happy",
    u"\(\^\)o\(\^\)":"Happy",
    u"\(\^O\^\)":"Happy",
    u"\(\^o\^\)":"Happy",
    u"\)\^o\^\(":"Happy",
    u":O o_O":"Surprised",
    u"o_0":"Surprised",
    u"o\.O":"Surpised",
    u"\(o\.o\)":"Surprised",
    u"oO":"Surprised",
    u"\(\*￣m￣\)":"Dissatisfied",
    u"\(‘A`\)":"Snubbed or Deflated"
}

In [None]:
def load_chat_words_dict(filepath):
    """Read the slang lexicon CSV and return a ``{slang: formal}`` lookup.

    The CSV must contain a ``slang`` and a ``formal`` column. If a slang term
    appears on several rows, the value from the last row is kept.
    """
    lexicon = pd.read_csv(filepath)
    return {slang: formal for slang, formal in zip(lexicon['slang'], lexicon['formal'])}

# Pictographic emoji ranges, compiled once instead of on every call.
# NOTE(review): the last range (U+24C2-U+1F251) is very wide and also covers CJK
# and other non-emoji blocks; it is kept unchanged here.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)

# Filled in by _get_emoticon_pattern() on first use; re-running this cell resets it.
_EMOTICON_PATTERN = None


def _get_emoticon_pattern():
    """Return one compiled regex that matches any key of EMOTICONS (built once)."""
    global _EMOTICON_PATTERN
    if _EMOTICON_PATTERN is None:
        alternatives = []
        # Longest fragments first so ":-))" is consumed whole, not as ":-)" plus ")".
        for fragment in sorted(EMOTICONS, key=len, reverse=True):
            # An emoticon that starts/ends with a letter or digit must not touch a
            # neighbouring word character, otherwise "XP" would eat part of
            # "EXPRESS" and ":3" part of "10:30".
            before = r'(?<!\w)' if fragment[0].isalnum() else ''
            after = r'(?!\w)' if fragment[-1].isalnum() else ''
            alternatives.append(before + '(?:' + fragment + ')' + after)
        _EMOTICON_PATTERN = re.compile('|'.join(alternatives))
    return _EMOTICON_PATTERN


def preprocess_text(text, chat_words_dict):
    """Normalise one raw review for tokenization.

    Steps, in order: strip URLs, strip HTML tags, remove emoticons, lowercase,
    remove ASCII punctuation, remove emoji, then replace slang tokens using
    ``chat_words_dict`` and collapse whitespace.

    Emoticons are removed *before* lowercasing and punctuation stripping:
    afterwards ":(" no longer exists and ":D" would have become a stray "d".

    Args:
        text: The raw review string. ``None`` is treated as an empty review.
        chat_words_dict: Mapping of slang token -> formal replacement.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if text is None:
        return ''

    # Removal of URLs (first, so "://" is not mistaken for the ":/" emoticon).
    # IGNORECASE replaces the earlier reliance on the text being lowercased already.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE | re.IGNORECASE)

    # Removal of HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Removal of emoticons (case-sensitive, on the still-punctuated text)
    text = _get_emoticon_pattern().sub(r'', text)

    # Lower casing
    text = text.lower()

    # Removal of punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Removal of emojis
    text = _EMOJI_PATTERN.sub(r'', text)

    # Custom dictionary for chat words conversion; split() also drops the
    # extra whitespace left behind by the removals above.
    words = [chat_words_dict.get(word, word) for word in text.split()]

    # Convert list back to string
    return ' '.join(words)


# Build the slang -> formal lookup once; preprocess_batch reads it as a global
chat_words_dict = load_chat_words_dict('/kaggle/input/colloquial-indonesian-lexicon-csv/colloquial-indonesian-lexicon.csv')

# Smoke test on a single review: show the raw input, then the cleaned output
sample_text = "kutu msih hidup, sdh saya tes langsung. merasa ditipu nih, semoga dapat musibah dah lu :("
processed_text = preprocess_text(sample_text, chat_words_dict)
print(sample_text, processed_text, sep="\n")

In [None]:
# Adapter that lets Dataset.map apply preprocess_text to each example
def preprocess_batch(batch):
    """Clean the 'text' field of one example in place and return the example.

    Despite the name, this receives a single row when it is used with
    ``Dataset.map`` without ``batched=True``. The slang lexicon is taken from
    the module-level ``chat_words_dict``.
    """
    cleaned = preprocess_text(batch['text'], chat_words_dict)
    batch['text'] = cleaned
    return batch

In [None]:
# Apply the preprocessing function to the 'text' column of the dataset.
# map() is called without batched=True, so preprocess_batch receives one example at a time.
processed_dataset = dataset.map(preprocess_batch)

# Label Encoding

In [None]:
# Class names; each label's list position becomes its integer id in label2id / id2label.
emotion_labels = ['Happy', 'Sadness', 'Anger', 'Love', 'Fear']

In [None]:
# Number of classes; passed as num_labels when the classification model is created.
num_labels = len(emotion_labels)

In [None]:
# Two-way lookup between class names and integer ids (id == position in emotion_labels)
id2label = dict(enumerate(emotion_labels))
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
# Function to encode labels
def encode_labels(batch):
    """Replace one example's emotion name with its integer id, in place.

    The id is looked up in the module-level ``label2id``; a name that is not
    in that mapping raises ``KeyError``. Returns the same (mutated) example.
    """
    label_name = batch['label']
    batch['label'] = label2id[label_name]
    return batch

In [None]:
# Encode labels in the dataset: every split's 'label' value is replaced by its integer id.
# Runs on the already text-cleaned dataset, one example per call (no batched=True).
encoded_dataset = processed_dataset.map(encode_labels)

In [None]:
# Same three shuffled rows as the earlier peek (same seed), now cleaned and label-encoded
sample = encoded_dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> Review: {row['text']}'", f"'>>> Label: {row['label']}'", sep="\n")

# Model Training

In [None]:
# Authenticate with the Hugging Face Hub; required because the training arguments
# set push_to_hub=True and a later cell uploads files to a Hub repo.
notebook_login()

In [None]:
# Hub id of the pretrained checkpoint; used for both the tokenizer and the model below.
model_name = 'indolem/indobertweet-base-uncased'

In [None]:
# Initialize tokenizer from the same checkpoint as the model so the vocabularies match
tokenizer = BertTokenizerFast.from_pretrained(model_name)

In [None]:
# Tokenize function
def tokenize(batch):
    """Tokenize a batch of reviews, truncating inputs that are too long.

    No padding is applied here on purpose. The Trainer is given a
    DataCollatorWithPadding, which pads every training batch to the length of
    its own longest sequence. Padding inside ``map`` would instead pad to the
    longest review of each (much larger) map batch and store those pad tokens
    in the dataset.

    Args:
        batch: A mapping whose 'text' entry is a list of strings
            (``Dataset.map`` is called with ``batched=True``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    return tokenizer(batch['text'], truncation=True)

In [None]:
# Tokenize the datasets; batched=True hands tokenize() lists of texts instead of single rows
tokenized = encoded_dataset.map(tokenize, batched=True)

In [None]:
# Set the format to PyTorch (in place): only the three listed columns are returned when rows are read
tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [None]:
# Drop the raw 'text' column; training only needs the tokenized fields and 'label'
tokenized_datasets = tokenized.remove_columns(['text'])

In [None]:
# Collator handed to the Trainer; it pads the examples of each batch using this tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def compute_metrics(p):
    """Compute accuracy and support-weighted precision/recall/F1 for an evaluation run.

    Args:
        p: Object with ``predictions`` and ``label_ids`` attributes.
            ``predictions`` may be a logits array, a tuple whose first item is
            the logits, or any array-like; a 1-D input is taken to be class ids
            that are already decoded.

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    logits, labels = p.predictions, p.label_ids

    # Some models return (logits, extra outputs...); only the logits are scored.
    if isinstance(logits, tuple):
        logits = logits[0]

    # Every input type goes through the same decoding path, so non-ndarray
    # inputs are no longer passed to the metrics un-argmaxed.
    scores = np.asarray(logits)
    predictions = scores if scores.ndim == 1 else np.argmax(scores, axis=1)

    # zero_division=0 yields the same value as the default but without the
    # undefined-metric warning when a class is never predicted.
    return {
        'accuracy': accuracy_score(labels, predictions),
        'precision': precision_score(labels, predictions, average='weighted', zero_division=0),
        'recall': recall_score(labels, predictions, average='weighted', zero_division=0),
        'f1': f1_score(labels, predictions, average='weighted', zero_division=0),
    }

In [None]:
# Load the pretrained checkpoint with a sequence-classification head sized for our classes.
# The id<->name maps are passed along so the model config carries the emotion names.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=num_labels, 
    id2label=id2label, 
    label2id=label2id
)

In [None]:
# Define training arguments
training_args = TrainingArguments(
    # Checkpoints are written here; the upload cell later reads files from this folder.
    output_dir='./without-cq',
    # Evaluate and checkpoint on the same per-epoch schedule
    # (load_best_model_at_end needs the two strategies to match).
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    # NOTE(review): no metric_for_best_model is set, so "best" presumably
    # falls back to the evaluation loss — confirm that is intended.
    load_best_model_at_end=True,
    save_total_limit=3,
    # NOTE(review): fp16 presumably needs a GPU runtime — confirm the accelerator setting.
    fp16=True,
    max_grad_norm=1.0,
    # Requires the Hub login performed earlier in the notebook.
    push_to_hub=True
)

In [None]:
# Initialize Trainer: trains on the 'train' split and evaluates on the 'eval' split;
# the 'test' split is not passed here.
# NOTE(review): with num_train_epochs=3 and one evaluation per epoch, an
# early-stopping patience of 3 looks unreachable — confirm whether early
# stopping is meant to be active.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['eval'],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [None]:
from accelerate import Accelerator

# NOTE(review): `accelerator` is not referenced by any later cell visible in this
# notebook, and the Trainer was already constructed above — confirm this cell is needed.
accelerator = Accelerator()

In [None]:
# Train and evaluate: evaluation and checkpointing happen once per epoch,
# as configured by eval_strategy / save_strategy in training_args.
trainer.train()

# Model Evaluation

In [None]:
# Evaluate on the Trainer's eval_dataset (the 'eval' split) and print the result.
eval_result = trainer.evaluate()
print(f'Evaluation results: {eval_result}')

# Model Publishing

In [None]:
# # Push the model to the hub with a commit message
# trainer.push_to_hub()

# # Push the tokenizer to the hub with a commit message
# tokenizer.push_to_hub()

In [None]:
from huggingface_hub import HfApi

# Hub client and the repository that receives the files
api = HfApi()

repo_id = "albarpambagio/without-cq"

# Files from the training output directory to publish
# (config.json plus the tokenizer artifacts)
tokenizer_files = [
    "/kaggle/working/without-cq/config.json",
    "/kaggle/working/without-cq/special_tokens_map.json",
    "/kaggle/working/without-cq/tokenizer.json",
    "/kaggle/working/without-cq/tokenizer_config.json",
    "/kaggle/working/without-cq/vocab.txt"
]

# Upload each file to the repo root under its bare file name
for local_path in tokenizer_files:
    file_name = local_path.rsplit("/", 1)[-1]
    api.upload_file(path_or_fileobj=local_path, path_in_repo=file_name, repo_id=repo_id)

# Model Testing

In [None]:
# Load a sequence-classification checkpoint from the Hub by repo id, for inference.
# NOTE(review): this repo id differs from the "albarpambagio/without-cq" repo used by
# the upload cell — confirm which checkpoint is meant to be tested here.
inf_model = AutoModelForSequenceClassification.from_pretrained("albarpambagio/indobertweet-base-uncased-emotion-recognition")

In [None]:
# Define a pipeline for text classification.
# `pipeline` is the transformers factory function and must be imported with the
# other transformers names in the top import cell; otherwise this cell raises NameError.
emotion_classifier = pipeline(
    'text-classification',  # Specify the task as text classification
    model=inf_model,  # Pass the loaded model
    tokenizer='indolem/indobertweet-base-uncased'  # base checkpoint id; the pipeline loads the tokenizer from it
)

In [None]:
text = "barangnya jelek banget"

# Perform emotion recognition and keep the first (top) prediction
emotion_prediction = emotion_classifier(text)[0]
emotion_label, emotion_score = emotion_prediction['label'], emotion_prediction['score']

print(f"Predicted Emotion: {emotion_label}, Score: {emotion_score}")

In [None]:
# Score the same text again, this time asking for the five best labels
emotion_predictions = emotion_classifier(text, top_k=5)  # Get top 5 predictions

for prediction in emotion_predictions:
    emotion_label, emotion_score = prediction['label'], prediction['score']
    print(f"Emotion: {emotion_label}, Score: {emotion_score}")

In [None]:
# Hub repo that received the tokenizer files in the upload cell
model_name_x = "albarpambagio/without-cq"

# Initialize tokenizer from that repo. Loading from `model_name` would fetch the
# base checkpoint's tokenizer again and leave `model_name_x` unused.
tokenizer_x = BertTokenizerFast.from_pretrained(model_name_x)

In [None]:
# Sample text to tokenize
# NOTE: this rebinds `sample_text`, which the preprocessing demo cell also assigns.
sample_text = "Rasanya tidak sesuai ekspektasi, bener-bener kecewa. Yang emak hanya VALRHONA CHOCOLATE, sisanya gak enak semua"

# Tokenize the sample text with the tokenizer loaded in the previous cell
tokens = tokenizer_x(sample_text)

# Print the tokenized output
print(tokens)

# References
* https://medium.com/@ahmettsdmr1312/fine-tuning-distilbert-for-emotion-classification-84a4e038e90e 
* https://www.kaggle.com/code/sudalairajkumar/getting-started-with-text-preprocessing
* https://www.kaggle.com/code/mohamedabdelmohsen/emotion-analysis-and-classification-using-lstm-93
* https://www.kaggle.com/code/pashupatigupta/starter-notebook-a-to-z-emotion-detection
* https://huggingface.co/learn/nlp-course/
* https://www.kaggle.com/code/jhoward/getting-started-with-nlp-for-absolute-beginners 