In [15]:
import numpy as np
import pandas as pd
import tensorflow as tf
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [16]:
import pandas as pd

def read_data(path, frac=0.2, random_state=42):
    """Load the labelled CSV at ``path`` and return a random subsample of it.

    Rows with a missing ``Content`` or ``Label`` value are dropped before
    sampling, and the returned frame gets a fresh 0..n-1 index.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; it must contain ``Content`` and ``Label``
        columns.
    frac : float, default 0.2
        Fraction of the cleaned rows to keep (forwarded to ``DataFrame.sample``).
    random_state : int or None, default 42
        Seed for the sampling step. A fixed seed makes the subsample -- and
        therefore everything computed from it -- repeatable across kernel
        restarts. Pass ``None`` to draw a different subsample on every call.

    Returns
    -------
    pandas.DataFrame
        The sampled rows in shuffled order with a reset index.
    """
    df = pd.read_csv(path)
    # Discard rows that lack either the text or its label.
    df = df.dropna(subset=["Content", "Label"])
    # Seeded sampling: without a seed each run would train on different rows.
    df = df.sample(frac=frac, random_state=random_state)
    # Drop the original row labels left over from sampling.
    return df.reset_index(drop=True)


In [17]:
# Absolute location of the balanced hate-speech CSV on the author's machine.
path = '/home/abolfazl/Documents/CitizenJournal/citizen_journal/backend/fastapi_backend/HateSpeech/HateSpeechDatasetBalanced.csv'
# Load, clean and subsample the dataset via read_data().
df = read_data(path)

In [18]:
# Separate the text column (model input) from the label column (target).
X_text, y = df['Content'], df['Label']

In [19]:

# TF-IDF featurizer: word 1- to 3-grams, English stop words removed,
# vocabulary capped at 4000 features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    max_features=4000,
)

In [20]:
# Fit the vectorizer on the corpus and encode the same corpus in one call;
# the result is a sparse document-by-term matrix.
X_tfidf = vectorizer.fit_transform(raw_documents=X_text)

# Quick sanity check of (n_documents, n_features).
print("TF-IDF matrix shape:", X_tfidf.shape)


TF-IDF matrix shape: (145224, 4000)


In [21]:
import joblib

# Write the fitted vectorizer to disk (file is created in the current working directory).
joblib.dump(vectorizer, filename='tfidf_vectorizer_4000.pkl')


['tfidf_vectorizer_4000.pkl']

In [22]:
# Hold out 20% of the rows as a test set. The split is seeded so it is
# repeatable, and stratified on the label so both parts keep the same
# class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [23]:
from keras import layers, models, callbacks

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation,
# dropout masks and batch shuffling are repeatable across kernel restarts.
tf.keras.utils.set_random_seed(42)

# Take the input width from the data instead of repeating the vectorizer's
# max_features: that setting is only an upper bound, so a smaller vocabulary
# would otherwise produce a shape mismatch at fit time.
n_features = X_train.shape[1]

# Feed-forward binary classifier over the TF-IDF features:
# three ReLU hidden layers (512 -> 128 -> 16) with dropout after each,
# L2 weight decay on the two widest layers, and a single sigmoid output.
model = tf.keras.Sequential([
    layers.Input(shape=(n_features,)),

    layers.Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    layers.Dropout(0.5),

    layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    layers.Dropout(0.5),

    layers.Dense(16, activation='relu'),
    layers.Dropout(0.3),

    layers.Dense(1, activation='sigmoid')  # Binary classification
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Stop once validation loss has not improved for 3 consecutive epochs and
# roll the weights back to the best epoch seen.
early_stop = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

history = model.fit(
    # NOTE(review): X_train is the sparse TF-IDF slice produced by
    # train_test_split, not a dense array. The recorded run trained on it
    # as-is; confirm the installed Keras version accepts sparse input.
    X_train, y_train,
    validation_split=0.2,  # last 20% of the training rows are used for validation
    epochs=20,
    batch_size=512,
    callbacks=[early_stop]
)


Epoch 1/20
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 54ms/step - accuracy: 0.7437 - loss: 0.6822 - val_accuracy: 0.7867 - val_loss: 0.5390
Epoch 2/20
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 52ms/step - accuracy: 0.7881 - loss: 0.5418 - val_accuracy: 0.7907 - val_loss: 0.5253
Epoch 3/20
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 52ms/step - accuracy: 0.7941 - loss: 0.5285 - val_accuracy: 0.7946 - val_loss: 0.5195
Epoch 4/20
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 53ms/step - accuracy: 0.7989 - loss: 0.5222 - val_accuracy: 0.7954 - val_loss: 0.5168
Epoch 5/20
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 52ms/step - accuracy: 0.8046 - loss: 0.5166 - val_accuracy: 0.8008 - val_loss: 0.5172
Epoch 6/20
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 52ms/step - accuracy: 0.8088 - loss: 0.5124 - val_accuracy: 0.8013 - val_loss: 0.5186
Epoch 7/20
[1m1

In [24]:
# Score the trained model on the held-out split; with the compile settings
# above, evaluate() yields the loss followed by the accuracy metric.
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")


[1m908/908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.7947 - loss: 0.5154
Test Accuracy: 0.7947


In [25]:
from sklearn.metrics import classification_report

# Sigmoid outputs for the test rows (one probability per row).
y_pred_probs = model.predict(x=X_test)
# Convert probabilities to hard 0/1 labels at the conventional 0.5 cut-off.
y_pred = (y_pred_probs > 0.5).astype(int)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))


[1m908/908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step
              precision    recall  f1-score   support

           0       0.82      0.75      0.78     14397
           1       0.77      0.84      0.80     14648

    accuracy                           0.79     29045
   macro avg       0.80      0.79      0.79     29045
weighted avg       0.80      0.79      0.79     29045



In [26]:
# Save the model (currently disabled; uncomment the next line to write the trained model to disk)
# model.save('/home/abolfazl/Documents/CitizenJournal/citizen_journal/backend/fastapi_backend/HateSpeech/hate_speech_model.keras')


In [27]:
# Re-score the test rows and label a row positive only when the model is
# at least 95% confident; the result is flattened to a 1-D label vector.
y_pred_probs = model.predict(x=X_test)
y_pred = (y_pred_probs > 0.95).flatten().astype(int)


[1m908/908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step


In [28]:
from sklearn.metrics import accuracy_score, classification_report

# Metrics for whichever `y_pred` was computed most recently.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)


Accuracy: 0.5167843002237906

Classification Report:
               precision    recall  f1-score   support

           0       0.51      1.00      0.67     14397
           1       0.97      0.04      0.08     14648

    accuracy                           0.52     29045
   macro avg       0.74      0.52      0.38     29045
weighted avg       0.74      0.52      0.37     29045

