## DATA LOADING AND PREPROCESSING


In [10]:
import pandas as pd

# Memuat dataset
df = pd.read_csv('cleaned-text-2.csv')

# Melihat beberapa baris pertama dari dataset
print(df.head())


   label                                       cleaned_text
0      2  aa hth here cheerful merry new year peep waana...
1      0  aaaa help keep trying talk boy sits close lunc...
2      1  aaaaaaaaa cant fucking cry want cry eye wont l...
3      1  aaaaaaaaaaaaaaaaaaaaaah wish didnt act fake en...
4      0  aaaaaaaaaahhhhhhh feel like fault feel like fa...


In [11]:
# Preprocess text
import re 
# Preprocess text
def preprocess_text(text):
    if isinstance(text, str):  # Check if text is a string
        # Lowercase text
        text = text.lower()
        # Remove special characters
        text = re.sub(r'[^a-zA-Z\s]', '', text)
    else:
        text = str(text)  # Convert non-string values to string
    return text

df['cleaned_text'] = df['cleaned_text'].apply(preprocess_text)

# Remove rows with empty or NaN text
df = df.dropna(subset=['cleaned_text'])

In [12]:
# Identify non-string values
non_string_values = df[df['cleaned_text'].apply(lambda x: not isinstance(x, str))]
print("Non-string values in 'object32':\n", non_string_values)

# Convert the entire column to string
df['cleaned_text'] = df['cleaned_text'].astype(str)

# Verify the conversion
print("\nDataFrame after converting 'object32' to string:\n", df)

Non-string values in 'object32':
 Empty DataFrame
Columns: [label, cleaned_text]
Index: []

DataFrame after converting 'object32' to string:
        label                                       cleaned_text
0          2  aa hth here cheerful merry new year peep waana...
1          0  aaaa help keep trying talk boy sits close lunc...
2          1  aaaaaaaaa cant fucking cry want cry eye wont l...
3          1  aaaaaaaaaaaaaaaaaaaaaah wish didnt act fake en...
4          0  aaaaaaaaaahhhhhhh feel like fault feel like fa...
...      ...                                                ...
52622      0  zoloft wellbutrin im feeling angry everything ...
52623      0  zoloft year ago started taking zoloft anxiety ...
52624      0  zoloftdid help anxiety im probably going get z...
52625      0  zoned cut self harm never far cutting never do...
52626      0  zoning social situation lot time im around lot...

[52627 rows x 2 columns]


In [13]:
df.dtypes

label            int64
cleaned_text    object
dtype: object

In [14]:
from sklearn.utils import resample

# Menentukan jumlah sampel untuk setiap label
n_samples = 3000

# Memisahkan dataset berdasarkan label
df_label_0 = df[df['label'] == 0]
df_label_1 = df[df['label'] == 1]
df_label_2 = df[df['label'] == 2]

# Melakukan downsampling atau upsampling jika perlu untuk mendapatkan 3000 sampel dari setiap label
df_label_0_sampled = resample(df_label_0, replace=True, n_samples=n_samples, random_state=123)
df_label_1_sampled = resample(df_label_1, replace=True, n_samples=n_samples, random_state=123)
df_label_2_sampled = resample(df_label_2, replace=True, n_samples=n_samples, random_state=123)

# Menggabungkan kembali dataset yang telah di-sample
df_downsampled = pd.concat([df_label_0_sampled, df_label_1_sampled, df_label_2_sampled])

# Mengacak ulang dataset
df_downsampled = df_downsampled.sample(frac=1, random_state=123).reset_index(drop=True)

print(df_downsampled['label'].value_counts())

0    3000
2    3000
1    3000
Name: label, dtype: int64


## MODELING

In [15]:
from transformers import BertTokenizer
import tensorflow as tf

# Menggunakan tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Fungsi untuk tokenisasi
def encode(text, label):
    encoded_text = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf'
    )
    return {
        'input_ids': encoded_text['input_ids'][0],
        'attention_mask': encoded_text['attention_mask'][0],
        'label': label
    }

# Menerapkan tokenisasi pada dataset
encoded_data = df_downsampled.apply(lambda x: encode(x['cleaned_text'], x['label']), axis=1)

# Membuat TensorFlow dataset
input_ids = tf.stack(encoded_data.apply(lambda x: x['input_ids']))
attention_masks = tf.stack(encoded_data.apply(lambda x: x['attention_mask']))
labels = tf.convert_to_tensor(df_downsampled['label'].values)

dataset = tf.data.Dataset.from_tensor_slices(({
    'input_ids': input_ids,
    'attention_mask': attention_masks
}, labels))

# Membagi dataset menjadi train dan validation
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

train_dataset = train_dataset.batch(32)
val_dataset = val_dataset.batch(32)




In [16]:
from transformers import TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam

# Load model
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df['label'].unique()))

# Kompilasi model
model.compile(optimizer=Adam(learning_rate=5e-5), loss=model.hf_compute_loss, metrics=['accuracy'])

# Melatih model
history = model.fit(train_dataset, validation_data=val_dataset, epochs=3)



All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [23]:
model.save_pretrained('modelbert')
tokenizer.save_pretrained('modelbert')

('modelbert\\tokenizer_config.json',
 'modelbert\\special_tokens_map.json',
 'modelbert\\vocab.txt',
 'modelbert\\added_tokens.json')

## EVALUATE MODEL

In [19]:
# Menghitung akurasi pada validation dataset
loss, accuracy = model.evaluate(val_dataset)

print(f'Validation Loss: {loss}')
print(f'Validation Accuracy: {accuracy}')


Validation Loss: 0.5365952849388123
Validation Accuracy: 0.8277778029441833


In [20]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Mendapatkan prediksi dari model
predictions = model.predict(val_dataset)
predicted_labels = np.argmax(predictions.logits, axis=1)

# Mendapatkan label sebenarnya dari validation dataset
true_labels = []
for _, labels in val_dataset:
    true_labels.extend(labels.numpy())

# Menghitung MAE dan MSE
mae = mean_absolute_error(true_labels, predicted_labels)
mse = mean_squared_error(true_labels, predicted_labels)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')

Mean Absolute Error (MAE): 0.20277777777777778
Mean Squared Error (MSE): 0.2638888888888889


========================