In [None]:
import re
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm



In [None]:
# Load the dataset
df = pd.read_csv('HateSpeechDataset.csv')

# Preview the first rows rather than rendering the whole frame
df.head()

In [None]:
@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, built only once.

    ``stopwords.words('english')`` is wrapped in a frozenset so membership
    tests are hash lookups, and the result is cached so the list is not
    rebuilt on every call (or, worse, for every token being filtered).
    """
    return frozenset(stopwords.words('english'))


# Define a function to preprocess the text
def preprocess_text(Content):
    """Clean one piece of text.

    Steps, in order: remove URLs (anything matching ``http\\S+``) and
    ``@mentions``, lowercase the text, tokenize it with ``word_tokenize``,
    drop English stopwords, and join the surviving tokens with single spaces.

    Parameters
    ----------
    Content : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining lowercase tokens joined by single spaces.
    """
    # Remove URLs and mentions
    Content = re.sub(r'http\S+', '', Content)
    Content = re.sub(r'@\w+', '', Content)

    # Tokenize the text
    tokens = word_tokenize(Content.lower())

    # Remove stopwords; the stopword set is fetched once, outside the filter
    stop_words = _english_stopwords()
    tokens = [token for token in tokens if token not in stop_words]

    # Join the tokens back into a string
    return ' '.join(tokens)

# Preprocess the text in the dataset (overwrites the 'Content' column with the cleaned text)
df['Content'] = df['Content'].apply(preprocess_text)



In [None]:
# Loader for one language-specific hate speech CSV
def load_multilingual_dataset(language):
    """Read ``HateSpeechDataset_{language}.csv`` and clean its 'Content' column.

    Parameters
    ----------
    language
        Value interpolated into the file name, e.g. ``'es'`` reads
        ``HateSpeechDataset_es.csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with every 'Content' value passed through
        ``preprocess_text``.
    """
    # NOTE(review): preprocess_text filters with the English stopword list
    # whatever `language` is — confirm that is intended for non-English data.
    csv_path = f'HateSpeechDataset_{language}.csv'
    raw_lang = pd.read_csv(csv_path)
    cleaned_content = raw_lang['Content'].apply(preprocess_text)
    return raw_lang.assign(Content=cleaned_content)

# Seed for the shuffle so the train/test split is the same on every run
RANDOM_STATE = 42

# Load the multilingual hate speech datasets
df_es = load_multilingual_dataset('es')
df_fr = load_multilingual_dataset('fr')
df_de = load_multilingual_dataset('de')

# Concatenate the datasets, then shuffle all rows with a fixed seed
df_all = (
    pd.concat([df, df_es, df_fr, df_de])
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)

# Split the dataset into training and testing sets (first 80% / remaining 20%)
train_size = int(0.8 * len(df_all))
train_df = df_all.iloc[:train_size]
test_df = df_all.iloc[train_size:]

# Define the input and output columns
X = train_df['Content']
y = train_df['Label']
X_test = test_df['Content']
y_test = test_df['Label']

# Define the model architecture
# TODO: `...` is a placeholder (Python's Ellipsis object); assign a real model
# here, otherwise the compile()/fit() calls below raise AttributeError.
model = ...

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
# NOTE(review): the test split is also used as validation data here, so the
# validation metrics are not an independent estimate — consider a separate split.
model.fit(X, y, validation_data=(X_test, y_test), epochs=10)