In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aysen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aysen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aysen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Function for text preprocessing
def preprocess_text(text):
    
    # Lowercasing
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Removing stopwords and non-alphanumeric characters
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

In [4]:
# Load datasets
train_df = pd.read_csv("C:\\Users\\aysen\\Documents\\GitHub\\hate_speech_models\\data\\annotations_metadata_train.csv")
test_df = pd.read_csv("C:\\Users\\aysen\\Documents\\GitHub\\hate_speech_models\\data\\annotations_metadata_test.csv")

# Text preprocessing
train_df['preprocessed_text'] = train_df['text'].apply(preprocess_text)
test_df['preprocessed_text'] = test_df['text'].apply(preprocess_text)

train_tokens = [token for tokens_list in train_df['text'] for token in tokens_list]
preprocessed_train_tokens = [token for tokens_list in train_df['preprocessed_text'] for token in tokens_list]

vocabulary_size = len(set(train_tokens))
vocabulary_size_preprocessed = len(set(preprocessed_train_tokens))

print(f'Vocabulary Size Preprocessed: {vocabulary_size_preprocessed}')
print(f'Vocabulary Size: {vocabulary_size}')

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

Vocabulary Size Preprocessed: 53
Vocabulary Size: 111


In [17]:
# Text vectorization using CountVectorizer
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['preprocessed_text'])
X_val = vectorizer.transform(val_df['preprocessed_text'])
X_test = vectorizer.transform(test_df['preprocessed_text'])

#X_train = vectorizer.fit_transform(train_df['text'])
#X_val = vectorizer.transform(val_df['text'])
#X_test = vectorizer.transform(test_df['text'])

In [18]:
# Labels
y_train = (train_df['label'] == 'hate').astype(int)
y_val = (val_df['label'] == 'hate').astype(int)
y_test = (test_df['label'] == 'hate').astype(int)

In [20]:
# Hyperparameter tuning
best_accuracy = 0
best_model = None

for max_depth in [None, 5, 10, 15]:
    for min_samples_split in [2, 5, 10]:
        for min_samples_leaf in [1, 2, 4]:
            
            # Decision tree
            tree_classifier = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
            tree_classifier.fit(X_train, y_train)

            # Evaluate on the validation set
            val_predictions = tree_classifier.predict(X_val)
            val_accuracy = accuracy_score(y_val, val_predictions)

            # Check if the current configuration is the best so far
            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy
                best_model = tree_classifier

print(f'Best Max Depth: {best_model.max_depth}')
print(f'Best Min Samples Split: {best_model.min_samples_split}')
print(f'Best Min Samples Leaf: {best_model.min_samples_leaf}')

# Use the best model for evaluation on the test set
test_predictions = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f'Best Validation Accuracy: {best_accuracy}')
print(f'Test Accuracy with Best Model: {test_accuracy}')

Best Max Depth: None
Best Min Samples Split: 10
Best Min Samples Leaf: 1
Best Validation Accuracy: 0.7310704960835509
Test Accuracy with Best Model: 0.6841004184100419
