# Hate & Offensive Speech Classification Pipeline
This notebook walks through loading the data, preprocessing the text, building a TF–IDF + Decision Tree pipeline, evaluating it, and tuning its hyperparameters.

In [1]:
import pandas as pd
import numpy as np
import os
from cleantext import clean
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nbformat

ModuleNotFoundError: No module named 'pandas'

## 1. Environment Setup
Make sure you have installed the required libraries before running the import cell above; a `ModuleNotFoundError` there means one of them is missing:
```bash
pip install pandas scikit-learn spacy clean-text nbformat
python -m spacy download en_core_web_sm
```

In [None]:
# 2. Load & Inspect Data
# Numeric class codes used in the CSV and the readable label each one stands for.
CLASS_LABELS = {0: 'hate speech', 1: 'offensive language', 2: 'neither'}

data = pd.read_csv('labeled_data.csv')
data['label'] = data['class'].map(CLASS_LABELS)

# Series.map turns any code that is missing from CLASS_LABELS into NaN. Fail here
# with a clear message instead of letting NaN labels flow into later cells.
unmapped = data.loc[data['label'].isna(), 'class'].unique()
if len(unmapped):
    raise ValueError(f"Unexpected values in 'class' column: {unmapped.tolist()}")

# display() renders each object with its rich notebook repr; a bare tuple as the
# last expression would show both objects as plain text inside parentheses.
display(data.head(), data['label'].value_counts())

In [None]:
# 3. Define spaCy-based tokenizer
# The tokenizer below only reads lemma_, is_alpha and is_stop from each token,
# so the dependency parser and the NER component are switched off at load time.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


def spacy_tokenizer(text):
    """Turn one raw text into a list of lemmas.

    The text is first normalised with clean-text (lower-casing plus the URL,
    e-mail, digit, punctuation and emoji options enabled), then run through the
    module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    text : the raw document to tokenize (passed straight to ``clean``).

    Returns
    -------
    list of str
        The whitespace-stripped lemma of every token that is purely alphabetic
        and is not a stop word, in document order.
    """
    # NOTE(review): clean-text appears to substitute placeholders (e.g. for
    # URLs / e-mails / digits) rather than delete them unless the
    # ``replace_with_*`` arguments are overridden; confirm whether placeholder
    # words are meant to survive into the feature vocabulary.
    normalized = clean(
        text,
        lower=True,
        no_urls=True,
        no_emails=True,
        no_digits=True,
        no_punct=True,
        no_emoji=True,
        lang='en',
    )

    lemmas = []
    for tok in nlp(normalized):
        # Skip anything that is not a plain word, and skip stop words.
        if not tok.is_alpha or tok.is_stop:
            continue
        lemmas.append(tok.lemma_.strip())
    return lemmas


In [None]:
# 4. Build Pipeline
# Step 'tfidf' turns each tweet into TF-IDF features using spacy_tokenizer;
# step 'clf' is a size-limited decision tree trained on those features.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        tokenizer=spacy_tokenizer,
        # A custom tokenizer replaces the regex-based tokenization, so the
        # default token_pattern is never used. Setting it to None stops
        # scikit-learn from warning about the unused pattern on every fit.
        token_pattern=None,
        # spacy_tokenizer already lower-cases the text via clean(..., lower=True).
        lowercase=False,
        norm='l2',
        max_df=0.9,
        min_df=5,
        ngram_range=(1, 2)
    )),
    # random_state is fixed so repeated runs build the same tree.
    ('clf', DecisionTreeClassifier(max_depth=15, min_samples_leaf=5, random_state=42))
])

In [None]:
# 5. Train/Test Split & Train
# Hold out 33% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both partitions; the seed makes the split
# repeatable.
tweets = data['tweet']
labels = data['label']

X_train, X_test, y_train, y_test = train_test_split(
    tweets,
    labels,
    test_size=0.33,
    stratify=labels,
    random_state=42,
)

# Fit the vectorizer and the classifier on the training partition only.
pipeline.fit(X_train, y_train)

In [None]:
# 6. Evaluation
# Score the fitted pipeline on the held-out tweets.
y_pred = pipeline.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
# NOTE(review): no `labels=` argument is passed, so the row/column order of the
# matrix is scikit-learn's default (sorted labels, per its docs). Confirm the
# order before reading individual cells.
conf_mat = confusion_matrix(y_test, y_pred)

print('Accuracy:', test_accuracy)
print('\nClassification Report:\n', report_text)
print('\nConfusion Matrix:\n', conf_mat)

In [None]:
# 7. Hyperparameter Tuning with GridSearchCV
# Keys follow the '<step name>__<parameter>' convention of the pipeline steps.
# The grid has 3 * 2 * 3 * 3 = 54 combinations; with 5 folds that is 270
# pipeline fits, and every fit runs the spaCy tokenizer again, so expect this
# cell to be slow.
param_grid = dict(
    tfidf__max_df=[0.8, 0.9, 1.0],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    clf__max_depth=[10, 15, 20],
    clf__min_samples_leaf=[1, 5, 10],
)

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

print('Best params:', grid.best_params_)
print('Best CV score:', grid.best_score_)