In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

def mainBareBones():
    # Load the dataset
    data = pd.read_csv('final_hateXplain.csv', dtype={
        'comment': 'string',  
        'label': 'category', 
        'Race': 'category',
        'Religion': 'category',
        'Gender': 'category',
        'Sexual Orientation': 'category',
        'Miscellaneous': 'category'
    })

    # Preprocessing
    # Fill missing values in the 'comment' column
    data['comment'] = data['comment'].fillna('')

    # Encode the 'label' column (e.g., normal, offensive, hatespeech)
    label_encoder = LabelEncoder()
    data['label'] = label_encoder.fit_transform(data['label'])

    # TF-IDF Vectorization for the 'comment' column
    tfidf = TfidfVectorizer(max_features=5000) 
    X = tfidf.fit_transform(data['comment'])

    # Target variable
    y = data['label']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # XGBoost Model
    model = XGBClassifier(eval_metric='mlogloss')
    model.fit(X_train, y_train)

    # Predictions on test data
    y_pred = model.predict(X_test)

    # Evaluation
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

if __name__ == "__main__":
    mainBareBones()

KeyboardInterrupt: 

In [5]:
import cupy as cp
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

def mainCuPY():
    # Load the dataset with optimized datatypes
    data = pd.read_csv('final_hateXplain.csv', dtype={
        'comment': 'string',  
        'label': 'category', 
        'Race': 'category',
        'Religion': 'category',
        'Gender': 'category',
        'Sexual Orientation': 'category',
        'Miscellaneous': 'category'
    })

    # Fill missing values in the 'comment' column
    data['comment'] = data['comment'].fillna('')

    # Encode the 'label' column with LabelEncoder
    label_encoder = LabelEncoder()
    data['label'] = label_encoder.fit_transform(data['label']).astype('int8')

    # TF-IDF Vectorization
    tfidf = TfidfVectorizer(max_features=5000)
    X = tfidf.fit_transform(data['comment'])

    # Convert the sparse matrix to CuPy format
    X_cupy = cp.sparse.csr_matrix(X, dtype=cp.float32)

    y = cp.asarray(data['label'], dtype=cp.int32)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_cupy, y, test_size=0.2, random_state=42
    )

    # XGBoost parameters
    params = {
        'objective': 'binary:logistic',
        'tree_method': 'hist',  # Optimized for GPU
        'device': 'cuda',     # Use GPU
        'random_state': 42
    }

    # XGBoost Model
    model = XGBClassifier(**params)
    model.fit(X_train.get(), y_train.get())  # Convert CuPy arrays to NumPy

    # Predictions
    y_pred = model.predict(X_test.get())

    # Evaluation
    print(classification_report(y_test.get(), y_pred, target_names=label_encoder.classes_))

if __name__ == "__main__":
    mainCuPY()


              precision    recall  f1-score   support

  hatespeech       0.25      0.01      0.02      1325
      normal       0.39      0.91      0.55      1502
   offensive       0.15      0.07      0.09      1195

    accuracy                           0.36      4022
   macro avg       0.26      0.33      0.22      4022
weighted avg       0.27      0.36      0.24      4022



In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearnex import patch_sklearn
patch_sklearn()  # Apply Intel optimizations

def mainIntelex():
    # Load the dataset
    data = pd.read_csv('final_hateXplain.csv', dtype={
        'comment': 'string',  
        'label': 'category', 
        'Race': 'category',
        'Religion': 'category',
        'Gender': 'category',
        'Sexual Orientation': 'category',
        'Miscellaneous': 'category'
    })

    # Preprocessing
    # Fill missing values in the 'comment' column
    data['comment'] = data['comment'].fillna('')

    # Encode the 'label' column (e.g., normal, offensive, hatespeech)
    label_encoder = LabelEncoder()
    data['label'] = label_encoder.fit_transform(data['label'])

    # TF-IDF Vectorization for the 'comment' column
    tfidf = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
    X = tfidf.fit_transform(data['comment'])

    # Target variable
    y = data['label']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # XGBoost Model
    model = XGBClassifier(eval_metric='mlogloss')
    model.fit(X_train, y_train)

    # Predictions on test data
    y_pred = model.predict(X_test)

    # Evaluation
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

if __name__ == "__main__":
    mainIntelex()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


              precision    recall  f1-score   support

  hatespeech       0.75      0.66      0.71      1325
      normal       0.60      0.81      0.69      1502
   offensive       0.56      0.39      0.46      1195

    accuracy                           0.64      4022
   macro avg       0.64      0.62      0.62      4022
weighted avg       0.64      0.64      0.63      4022

