In [11]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [12]:
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")
bert_model = TFAutoModel.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")


Some layers from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [13]:
def get_bert_embeddings(texts, tokenizer, model, max_len=64):
    embeddings = []
    for text in tqdm(texts):
        inputs = tokenizer(text, return_tensors='tf', max_length=max_len, truncation=True, padding='max_length')
        outputs = model(inputs)[0]  # shape: (1, seq_len, hidden)
        cls_vector = outputs[:, 0, :]  # فقط بردار CLS
        embeddings.append(cls_vector.numpy()[0])
    return np.array(embeddings)

In [14]:
def create_model():
    # مدل ساده LSTM
    model = Sequential()
    model.add(LSTM(128, input_shape=(1, 768)))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [15]:
df = pd.read_csv("clean_snapp.csv")
texts = df['comment_cleaned'].astype(str).tolist()
labels = df['label'].values


In [None]:
X = get_bert_embeddings(texts, tokenizer, bert_model)
y = np.array(labels)


  2%|▏         | 438/18508 [01:31<58:50,  5.12it/s]

In [None]:
X = X.reshape((X.shape[0], 1, X.shape[1]))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
model = create_model()
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

# 5. ارزیابی
y_pred = (model.predict(X_test) > 0.5).astype(int)
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

model.save("model.h5")

In [None]:
digikala_df = pd.read_csv("clean_digikala.csv", encoding='utf-8', quotechar='"')
digikala_texts = digikala_df['comment_cleaned'].astype(str).tolist()
digikala_embeds = get_bert_embeddings(digikala_texts, tokenizer, bert_model)
digikala_embeds = digikala_embeds.reshape((digikala_embeds.shape[0], 1, 768))
predictions = (model.predict(digikala_embeds) > 0.5).astype(int)
digikala_df['predicted_label'] = predictions
digikala_df['predicted_label'] = digikala_df['predicted_label'].map({1: 'SAD', 0: 'HAPPY'})
digikala_df.to_csv("outputs/digikala_labeled.csv", index=False)
print("Saved predictions to outputs/digikala_labeled.csv")