In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import warnings
import cupy as cp
from sklearnex import patch_sklearn

In [2]:
def mainBareBones():
    # Suppress warnings
    warnings.filterwarnings("ignore", category=FutureWarning)

    # Load the dataset
    data = pd.read_csv('final_hateXplain.csv', dtype={
        'comment': 'string',  
        'label': 'category', 
        'Race': 'category',
        'Religion': 'category',
        'Gender': 'category',
        'Sexual Orientation': 'category',
        'Miscellaneous': 'category'
    })

    # Preprocessing
    # Fill missing values in the 'comment' column
    data['comment'] = data['comment'].fillna('')

    # Encode the 'label' column (e.g., normal, offensive, hatespeech)
    label_encoder = LabelEncoder()
    data['label'] = label_encoder.fit_transform(data['label'])

    # TF-IDF Vectorization for the 'comment' column
    tfidf = TfidfVectorizer(max_features=5000) 
    X = tfidf.fit_transform(data['comment'])

    # Target variable
    y = data['label']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # MLP Model
    print("\nTraining the MLP model...")
    model = MLPClassifier(
        hidden_layer_sizes=(50, 30),  
        max_iter=1200,               
        learning_rate='adaptive',            
        early_stopping=True,        
        random_state=42              
    )
    model.fit(X_train, y_train)
    print("Model training completed!")

    # Predictions on test data
    print("\nMaking predictions on the test set...")
    y_pred = model.predict(X_test)

    # Evaluation
    print("\nEvaluating the model...")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

if __name__ == "__main__":
    mainBareBones()


Training the MLP model...
Model training completed!

Making predictions on the test set...

Evaluating the model...
              precision    recall  f1-score   support

  hatespeech       0.66      0.70      0.68      1325
      normal       0.62      0.72      0.67      1502
   offensive       0.53      0.38      0.44      1195

    accuracy                           0.61      4022
   macro avg       0.60      0.60      0.60      4022
weighted avg       0.61      0.61      0.60      4022



In [3]:
def mainCuPY():
    # Suppress warnings
    warnings.filterwarnings("ignore", category=FutureWarning)

    # Load the dataset
    data = pd.read_csv('final_hateXplain.csv', dtype={
        'comment': 'string',  
        'label': 'category', 
        'Race': 'category',
        'Religion': 'category',
        'Gender': 'category',
        'Sexual Orientation': 'category',
        'Miscellaneous': 'category'
    })

    # Preprocessing
    # Fill missing values in the 'comment' column
    data['comment'] = data['comment'].fillna('')

    # Encode the 'label' column (e.g., normal, offensive, hatespeech)
    label_encoder = LabelEncoder()
    data['label'] = label_encoder.fit_transform(data['label'])

    tfidf = TfidfVectorizer(max_features=5000)
    X = tfidf.fit_transform(data['comment'])
    X = cp.array(X.toarray(), dtype=cp.float32)  # Convert to CuPy array

    # Target variable
    y = cp.array(data['label'].values, dtype=cp.int32)

    # Train-test split
    print("\nSplitting data into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(
        X.get(), y.get(), test_size=0.2, random_state=42
    )  # Convert back to NumPy
    print(f"Training set shape: {X_train.shape}, Testing set shape: {X_test.shape}")

    # MLP Model
    print("\nTraining the MLP model...")
    model = MLPClassifier(
        hidden_layer_sizes=(50, 30),  
        max_iter=1200,               
        learning_rate='adaptive',            
        early_stopping=True,        
        random_state=42              
    )
    model.fit(X_train, y_train)
    print("Model training completed!")

    # Predictions on test data
    print("\nMaking predictions on the test set...")
    y_pred = model.predict(X_test)

    # Evaluation
    print("\nEvaluating the model...")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

if __name__ == "__main__":
    mainCuPY()



Vectorizing text data with TF-IDF...

Splitting data into training and testing sets...
Training set shape: (16087, 5000), Testing set shape: (4022, 5000)

Training the MLP model...
Model training completed!

Making predictions on the test set...

Evaluating the model...
              precision    recall  f1-score   support

  hatespeech       0.66      0.70      0.68      1325
      normal       0.62      0.72      0.67      1502
   offensive       0.53      0.38      0.44      1195

    accuracy                           0.61      4022
   macro avg       0.60      0.60      0.60      4022
weighted avg       0.61      0.61      0.60      4022



In [None]:
patch_sklearn() # Apply Intel optimizations

In [4]:
def mainIntelex():
    # Suppress warnings
    warnings.filterwarnings("ignore", category=FutureWarning)

    # Load the dataset
    data = pd.read_csv('final_hateXplain.csv', dtype={
        'comment': 'string',  
        'label': 'category', 
        'Race': 'category',
        'Religion': 'category',
        'Gender': 'category',
        'Sexual Orientation': 'category',
        'Miscellaneous': 'category'
    })

    # Preprocessing
    # Fill missing values in the 'comment' column
    data['comment'] = data['comment'].fillna('')

    # Encode the 'label' column (e.g., normal, offensive, hatespeech)
    label_encoder = LabelEncoder()
    data['label'] = label_encoder.fit_transform(data['label'])

    # TF-IDF Vectorization for the 'comment' column
    tfidf = TfidfVectorizer(max_features=5000) 
    X = tfidf.fit_transform(data['comment'])

    # Target variable
    y = data['label']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # MLP Model
    print("\nTraining the MLP model...")
    model = MLPClassifier(
        hidden_layer_sizes=(50, 30),  
        max_iter=1200,               
        learning_rate='adaptive',            
        early_stopping=True,        
        random_state=42              
    )
    model.fit(X_train, y_train)
    print("Model training completed!")

    # Predictions on test data
    print("\nMaking predictions on the test set...")
    y_pred = model.predict(X_test)

    # Evaluation
    print("\nEvaluating the model...")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

if __name__ == "__main__":
    mainIntelex()


Training the MLP model...
Model training completed!

Making predictions on the test set...

Evaluating the model...
              precision    recall  f1-score   support

  hatespeech       0.66      0.70      0.68      1325
      normal       0.62      0.72      0.67      1502
   offensive       0.53      0.38      0.44      1195

    accuracy                           0.61      4022
   macro avg       0.60      0.60      0.60      4022
weighted avg       0.61      0.61      0.60      4022

