In [4]:
import os
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [5]:
# Text preprocessing
# Cache for the NLTK resources so they are built once instead of once per document.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return ``(stop_words, lemmatizer)``, creating them on first use only."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalize one document into a space-separated string of lemmatized tokens.

    Steps: lowercase, tokenize with ``word_tokenize``, drop tokens that are not
    purely alphanumeric or that appear in NLTK's English stopword list, then
    lemmatize the survivors with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str or None or float
        Raw document. ``None`` and NaN (what pandas uses for an empty CSV
        cell) are treated as an empty document; any other non-string value
        is converted with ``str()`` before processing.

    Returns
    -------
    str
        The kept tokens joined by single spaces; ``''`` for missing input or
        when no token survives filtering.

    Notes
    -----
    NOTE(review): the tokenizer, stopword list and lemmatizer read NLTK data
    packages; presumably these were fetched with ``nltk.download`` outside
    this notebook -- confirm on a fresh environment.
    """
    # Missing value guard. NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    stop_words, lemmatizer = _get_preprocess_resources()

    # Lowercase before tokenizing so stopword matching is case-insensitive.
    tokens = word_tokenize(text.lower())

    # Filter first, then lemmatize: stopwords are matched on the surface form.
    kept = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    ]

    return ' '.join(kept)

In [6]:
# Load datasets
# The data directory defaults to the original local folder, but can be
# redirected by setting the HATE_SPEECH_DATA_DIR environment variable so the
# notebook is not tied to one machine's directory layout.
DATA_DIR = Path(os.environ.get(
    'HATE_SPEECH_DATA_DIR',
    'C:\\Users\\aysen\\Desktop\\hate_speech\\data',
))
train_df = pd.read_csv(DATA_DIR / 'annotations_metadata_train.csv')
test_df = pd.read_csv(DATA_DIR / 'annotations_metadata_test.csv')

# Text preprocessing: add a cleaned copy of the 'text' column to each frame
train_df['preprocessed_text'] = train_df['text'].apply(preprocess_text)
test_df['preprocessed_text'] = test_df['text'].apply(preprocess_text)

In [7]:
# Binary targets: 1 where the label column equals 'hate', 0 otherwise
y_train = train_df['label'].eq('hate').astype(int)
y_test = test_df['label'].eq('hate').astype(int)

# TF-IDF features: the vectorizer is fitted on the training text only and
# then reused, unchanged, to transform the test text.
vectorizer = TfidfVectorizer(
    max_features=5000,  # vocabulary cap; adjust to the dataset size
)
X_train = vectorizer.fit_transform(train_df['preprocessed_text'])
X_test = vectorizer.transform(test_df['preprocessed_text'])

In [8]:
# Gradient-boosted trees trained on the TF-IDF matrix of the training split
gbdt_classifier = GradientBoostingClassifier(
    n_estimators=100,   # number of boosting stages
    learning_rate=0.1,  # shrinkage applied to each tree's contribution
    max_depth=3,        # depth limit of the individual trees
    random_state=42,    # fixed seed so repeated runs give the same model
)
gbdt_classifier.fit(X_train, y_train)

In [9]:
# Score the fitted model one split at a time: predict, then measure accuracy
y_pred_train = gbdt_classifier.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)

y_pred_test = gbdt_classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)

# Headline numbers; a large train/test gap would point to overfitting
print(f"Training Accuracy: {train_accuracy}")
print(f"Testing Accuracy: {test_accuracy}")

# Per-class precision / recall / F1 on the test split
print("Classification Report on Test Set:")
print(classification_report(y_test, y_pred_test))

Training Accuracy: 0.8155694879832811
Testing Accuracy: 0.7112970711297071
Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.66      0.88      0.75       239
           1       0.82      0.54      0.65       239

    accuracy                           0.71       478
   macro avg       0.74      0.71      0.70       478
weighted avg       0.74      0.71      0.70       478

