In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearnex import patch_sklearn
import cupy as cp

In [2]:
def mainBareBones():
    # Load the dataset
    data = pd.read_csv('final_hateXplain.csv', dtype={
        'comment': 'string',  
        'label': 'category', 
        'Race': 'category',
        'Religion': 'category',
        'Gender': 'category',
        'Sexual Orientation': 'category',
        'Miscellaneous': 'category'
    })

    # Preprocessing
    # Fill missing values in the 'comment' column
    data['comment'] = data['comment'].fillna('')

    # Encode the 'label' column (e.g., normal, offensive, hatespeech)
    label_encoder = LabelEncoder()
    data['label'] = label_encoder.fit_transform(data['label'])

    # TF-IDF Vectorization for the 'comment' column
    tfidf = TfidfVectorizer(max_features=5000)
    X = tfidf.fit_transform(data['comment'])

    # Target variable
    y = data['label']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Random Forest Classifier
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Predictions on test data
    y_pred = model.predict(X_test)

    # Evaluation
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

if __name__ == "__main__":
    mainBareBones()


              precision    recall  f1-score   support

  hatespeech       0.72      0.66      0.69      1325
      normal       0.56      0.84      0.67      1502
   offensive       0.60      0.29      0.39      1195

    accuracy                           0.62      4022
   macro avg       0.63      0.60      0.58      4022
weighted avg       0.63      0.62      0.59      4022



In [3]:
def mainCuPY():
    # Load the dataset
    data = pd.read_csv('final_hateXplain.csv', dtype={
        'comment': 'string',  
        'label': 'category', 
        'Race': 'category',
        'Religion': 'category',
        'Gender': 'category',
        'Sexual Orientation': 'category',
        'Miscellaneous': 'category'
    })

    # Preprocessing
    # Fill missing values in the 'comment' column
    data['comment'] = data['comment'].fillna('')

    # Encode the 'label' column (e.g., normal, offensive, hatespeech)
    label_encoder = LabelEncoder()
    data['label'] = label_encoder.fit_transform(data['label'])

    # TF-IDF Vectorization for the 'comment' column
    tfidf = TfidfVectorizer(max_features=5000)
    X = tfidf.fit_transform(data['comment'])

    # Convert sparse matrix to CuPy
    X = cp.sparse.csr_matrix(X)

    # Target variable
    y = cp.asarray(data['label'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train.get(), y_train.get())

    # Predictions on test data
    y_pred = model.predict(X_test.get())

    # Evaluation
    print(classification_report(y_test.get(), y_pred, target_names=label_encoder.classes_))

if __name__ == "__main__":
    mainCuPY()


              precision    recall  f1-score   support

  hatespeech       0.72      0.66      0.69      1325
      normal       0.56      0.84      0.67      1502
   offensive       0.60      0.29      0.39      1195

    accuracy                           0.62      4022
   macro avg       0.63      0.60      0.58      4022
weighted avg       0.63      0.62      0.59      4022



In [4]:
patch_sklearn()  # Apply Intel optimizations

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [5]:
def mainIntelex():
    # Load the dataset
    data = pd.read_csv('final_hateXplain.csv', dtype={
        'comment': 'string',  
        'label': 'category', 
        'Race': 'category',
        'Religion': 'category',
        'Gender': 'category',
        'Sexual Orientation': 'category',
        'Miscellaneous': 'category'
    })

    # Preprocessing
    # Fill missing values in the 'comment' column
    data['comment'] = data['comment'].fillna('')

    # Encode the 'label' column (e.g., normal, offensive, hatespeech)
    label_encoder = LabelEncoder()
    data['label'] = label_encoder.fit_transform(data['label'])

    # TF-IDF Vectorization for the 'comment' column
    tfidf = TfidfVectorizer(max_features=5000)
    X = tfidf.fit_transform(data['comment'])

    # Target variable
    y = data['label']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Random Forest Classifier
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Predictions on test data
    y_pred = model.predict(X_test)

    # Evaluation
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

if __name__ == "__main__":
    mainIntelex()


              precision    recall  f1-score   support

  hatespeech       0.72      0.66      0.69      1325
      normal       0.56      0.84      0.67      1502
   offensive       0.60      0.29      0.39      1195

    accuracy                           0.62      4022
   macro avg       0.63      0.60      0.58      4022
weighted avg       0.63      0.62      0.59      4022

