In [122]:
import pandas as pd

# Contoh dataframe

df = pd.read_csv('merged_output.csv')

# Distribusi label
df['label'].value_counts()

depression    21208
anxiety       19976
lonely        11545
Name: label, dtype: int64

In [123]:
df['post'] = df['post'].astype(str)
df['label'] = df['label'].astype(str)
print(df.dtypes)

post     object
label    object
dtype: object


In [124]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])
print(df.dtypes)

post     object
label     int32
dtype: object


In [125]:
df = df.drop(df[df['label'] == 3].index)
print(df['label'].value_counts())

1    21208
0    19976
2    11545
Name: label, dtype: int64


In [126]:
# Periksa distribusi label
print(df['label'].value_counts())

# Periksa panjang teks
df['text_length'] = df['post'].apply(len)
print(df['text_length'].describe())

# Periksa beberapa contoh teks
print(df['post'].head())

1    21208
0    19976
2    11545
Name: label, dtype: int64
count    52729.000000
mean       867.364998
std        876.954509
min          7.000000
25%        339.000000
50%        623.000000
75%       1099.000000
max      32765.000000
Name: text_length, dtype: float64
0    I can always feel my heartbeat 18M, physically...
1    My dad is hospitalized and had to be put in a ...
2    Feels like anxiety is turning my brain to stew...
3    What do you do when anxiety kicks in around ot...
4    Finding a job is a nightmare I want to rant ab...
Name: post, dtype: object


In [127]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Inisialisasi stemmer dan lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Inisialisasi stopwords
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Menghapus URL
    text = re.sub(r'http\S+', '', text)
    # Menghapus tanda baca
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Menghapus angka
    text = re.sub(r'\d+', '', text)
    # Mengubah teks menjadi huruf kecil
    text = text.lower()
    # Menghapus stopwords dan melakukan stemming atau lemmatization
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

# Terapkan pembersihan teks pada kolom 'text'
df['cleaned_text'] = df['post'].apply(clean_text)

# Pratinjau teks yang telah dibersihkan
print(df[['post', 'cleaned_text']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                                post  \
0  I can always feel my heartbeat 18M, physically...   
1  My dad is hospitalized and had to be put in a ...   
2  Feels like anxiety is turning my brain to stew...   
3  What do you do when anxiety kicks in around ot...   
4  Finding a job is a nightmare I want to rant ab...   

                                        cleaned_text  
0  always feel heartbeat physically fit im wonder...  
1  dad hospitalized put breathing machine help br...  
2  feel like anxiety turning brain stew lately iv...  
3  anxiety kick around people wave anxiety hit co...  
4  finding job nightmare want rant job hunting ma...  


In [133]:
df.head()

Unnamed: 0,label,cleaned_text
0,0,always feel heartbeat physically fit im wonder...
1,0,dad hospitalized put breathing machine help br...
2,0,feel like anxiety turning brain stew lately iv...
3,0,anxiety kick around people wave anxiety hit co...
4,0,finding job nightmare want rant job hunting ma...


In [132]:
df = df.drop(columns=['text_length','post'])
df.head()

KeyError: "['text_length', 'post'] not found in axis"

====================================

In [109]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import numpy as np

In [136]:
# Mengubah data teks menjadi representasi numerik (contoh menggunakan TF-IDF)
vectorizer = TfidfVectorizer(max_features=5000)
X_vectorized = vectorizer.fit_transform(X)

# Melakukan undersampling
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_vectorized, y)

# Membuat dataframe baru dari hasil resampling
df_resampled = pd.DataFrame(X_resampled.todense(), columns=vectorizer.get_feature_names_out())
df_resampled['label'] = y_resampled

In [137]:
# Pisahkan data menjadi train dan validation set
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_resampled.drop(columns=['label']), df_resampled['label'], test_size=0.2, random_state=42
)

# Mengubah data kembali ke format asli (teks)
train_texts = vectorizer.inverse_transform(train_texts)
val_texts = vectorizer.inverse_transform(val_texts)

train_texts = [' '.join(text) for text in train_texts]
val_texts = [' '.join(text) for text in val_texts]


In [141]:
# Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenisasi data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

# Pastikan tidak ada nilai None dalam encoding
def check_none(data_dict):
    for key, value in data_dict.items():
        if value is None:
            print(f"Found None in {key}")
            return False
    return True

assert check_none(train_encodings)
assert check_none(val_encodings)

# Konversi ke dataset TensorFlow
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    np.array(train_labels)  # Pastikan label dalam bentuk array numpy
)).batch(16)

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    np.array(val_labels)
)).batch(16)


In [142]:
# Load model
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df['label'].unique()))

# Kompilasi model
model.compile(optimizer='adam', loss=model.compute_loss, metrics=['accuracy'])

# Melatih model
model.fit(train_dataset, validation_data=val_dataset, epochs=3)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


AttributeError: in user code:

    File "c:\Users\acer\anaconda3\lib\site-packages\tf_keras\src\engine\training.py", line 1398, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\acer\anaconda3\lib\site-packages\tf_keras\src\engine\training.py", line 1370, in run_step  *
        outputs = model.train_step(data)
    File "c:\Users\acer\anaconda3\lib\site-packages\transformers\modeling_tf_utils.py", line 1706, in train_step  *
        loss = self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.losses)
    File "c:\Users\acer\anaconda3\lib\site-packages\tf_keras\src\engine\compile_utils.py", line 275, in __call__  *
        y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw)
    File "c:\Users\acer\anaconda3\lib\site-packages\tf_keras\src\losses.py", line 143, in __call__  *
        losses = call_fn(y_true, y_pred)
    File "c:\Users\acer\anaconda3\lib\site-packages\tf_keras\src\losses.py", line 270, in call  *
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "c:\Users\acer\anaconda3\lib\site-packages\transformers\modeling_tf_utils.py", line 1588, in compute_loss  *
        return super().compute_loss(*args, **kwargs)
    File "c:\Users\acer\anaconda3\lib\site-packages\tf_keras\src\engine\training.py", line 1207, in compute_loss  *
        y, y_pred, sample_weight, regularization_losses=self.losses
    File "c:\Users\acer\anaconda3\lib\site-packages\tf_keras\src\engine\compile_utils.py", line 275, in __call__  *
        y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw)
    File "c:\Users\acer\anaconda3\lib\site-packages\tf_keras\src\engine\compile_utils.py", line 854, in match_dtype_and_rank  *
        if (y_t.dtype.is_floating and y_p.dtype.is_floating) or (

    AttributeError: 'NoneType' object has no attribute 'dtype'


In [144]:
df.to_csv('cleaned-text.csv', index=False)