In [None]:
#  HATE SPEECH ANALYSIS

In [None]:
# ==============================
# Import Required Libraries
# ==============================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_curve,
    auc
)

In [None]:
# ==============================
# Load Dataset
# ==============================

# The CSV is referenced by a bare relative filename, so it is looked up
# relative to the directory the kernel is running in.
DATASET_CSV = "hate_speech_dataset.csv"
df = pd.read_csv(DATASET_CSV)

# Preview the first five rows (shown as the cell's output)
df.head()

In [None]:
# ==============================
# Basic Data Exploration
# ==============================

# Structural summary of the frame, printed by pandas itself
df.info()

# Dimensions as a (rows, columns) tuple
dataset_shape = df.shape
print("Dataset Shape:", dataset_shape)

# Number of missing entries in each column
missing_per_column = df.isna().sum()
print("\nMissing Values:\n", missing_per_column)

# Column names as a plain Python list
column_names = df.columns.to_list()
print("\nColumns:", column_names)

In [None]:
# ==============================
# Data Cleaning
# ==============================

# Columns containing boolean values (one flag per hate-speech category)
bool_columns = [
    'IsToxic', 'IsAbusive', 'IsThreat', 'IsProvocative',
    'IsObscene', 'IsHatespeech', 'IsRacist', 'IsNationalist',
    'IsSexist', 'IsHomophobic', 'IsReligiousHate', 'IsRadicalism'
]

# Convert TRUE/FALSE to 1/0.
# An explicit integer cast is used instead of replace({True: 1, False: 0}):
# replace() on boolean columns only produced integers through implicit
# downcasting of the result, which pandas has deprecated. astype(int) states
# the target dtype directly and leaves the columns unchanged when the cell is
# re-run on already-converted data.
# NOTE(review): astype(int) raises if a flag column contains missing values;
# the missing-value report in the exploration step should show zeros for
# these columns — confirm before relying on this cell.
df[bool_columns] = df[bool_columns].astype(int)

print("Boolean columns converted successfully!")

In [None]:
## Exploratory Data Analysis

In [None]:
# Count of hate categories

# Column-wise sum of the flag columns = number of comments carrying each flag
hate_counts = df[bool_columns].sum()

# Draw on an explicitly created Axes rather than the implicit pyplot state
fig, ax = plt.subplots(figsize=(10, 5))
hate_counts.plot(kind='bar', ax=ax)
ax.set_title("Count of Hate Categories")
ax.set_ylabel("Number of Comments")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Bivariate Analysis: Toxic vs Abusive Relationship

# Violin plot of the IsAbusive values, grouped by the value of IsToxic.
# NOTE(review): both columns are listed among the boolean flag columns, so a
# grouped count plot or a crosstab may be easier to read — chart type left
# unchanged here.
fig, ax = plt.subplots(figsize=(8, 5))
sns.violinplot(data=df, x="IsToxic", y="IsAbusive", ax=ax)
ax.set_title("Relationship between Toxic and Abusive Comments")
ax.set_xlabel("IsToxic")
ax.set_ylabel("IsAbusive")
plt.show()

In [None]:
# Racist vs Religious Hate Relationship

# Box plot of the IsReligiousHate values, grouped by the value of IsRacist.
# NOTE(review): both columns are listed among the boolean flag columns, so the
# boxes carry little detail — chart type left unchanged here.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x="IsRacist", y="IsReligiousHate", ax=ax)
ax.set_title("Racist vs Religious Hate Distribution")
plt.show()

In [None]:
# Correlation Heatmap

# Pairwise correlation between the category flag columns
category_corr = df[bool_columns].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(category_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Between Hate Categories")
plt.show()

In [None]:
##  Linear Regression: Predict Hate Severity Score

In [None]:
# Linear regression on the category flags, evaluated on a held-out 20% split.
# LinearRegression, train_test_split, mean_squared_error and r2_score all come
# from the notebook's import cell, so they are not re-imported here.

# Predictor columns. They are spelled out in this cell because the notebook
# only assigns `features` further down (Logistic Regression section); on a
# fresh kernel that name does not exist yet when this cell runs.
features = [
    'IsAbusive', 'IsThreat', 'IsProvocative', 'IsObscene',
    'IsRacist', 'IsNationalist', 'IsSexist',
    'IsHomophobic', 'IsReligiousHate', 'IsRadicalism'
]

# NOTE(review): no cell in this notebook creates 'Total_Hate_Score'. When the
# column is absent it is derived here as the number of category flags set on
# each comment. That definition is an assumption — confirm it against the
# intended scoring rule. An existing column is never overwritten.
if 'Total_Hate_Score' not in df.columns:
    df['Total_Hate_Score'] = df[bool_columns].sum(axis=1)
    print("Total_Hate_Score was missing; derived as the per-comment sum of the flag columns.")

X_lr = df[features]
y_lr = df['Total_Hate_Score']

# Fixed random_state keeps the split reproducible across runs
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(
    X_lr, y_lr, test_size=0.2, random_state=42
)

lr_model = LinearRegression()
lr_model.fit(X_train_lr, y_train_lr)

# Predictions for the held-out rows
y_pred_lr = lr_model.predict(X_test_lr)

print("MSE:", mean_squared_error(y_test_lr, y_pred_lr))
print("R2 Score:", r2_score(y_test_lr, y_pred_lr))

In [None]:
## Logistic Regression

In [None]:
# Feature columns
features = [
    'IsAbusive','IsThreat','IsProvocative','IsObscene',
    'IsRacist','IsNationalist','IsSexist',
    'IsHomophobic','IsReligiousHate','IsRadicalism'
]

# NOTE(review): no cell in this notebook creates 'Hate_Label'. When the column
# is absent it is derived here as 1 if at least one category flag is set on
# the comment and 0 otherwise. That definition is an assumption — confirm it
# against the intended labelling rule. An existing column is never overwritten.
if 'Hate_Label' not in df.columns:
    df['Hate_Label'] = (df[bool_columns].sum(axis=1) > 0).astype(int)
    print("Hate_Label was missing; derived as 1 when any flag column is set.")

X = df[features]
y = df['Hate_Label']

# Train-test split (30% held out; fixed random_state for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Train Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
# ROC Curve for Logistic Regression
# roc_curve and auc come from the notebook's import cell; the in-cell import
# (which also pulled in the unused RocCurveDisplay) has been dropped.

# Predicted probability for the class in column 1 of predict_proba
# (columns follow the order of model.classes_)
y_prob = model.predict_proba(X_test)[:, 1]

# Compute ROC values and the area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Plot ROC Curve on an explicitly created Axes
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})")
ax.plot([0, 1], [0, 1], linestyle='--', label="Random baseline")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve - Hate Speech Classification")
ax.legend()
plt.show()

In [None]:
## MySQL Database Integration (Demonstration)

# Imported in this cell because only this optional demonstration needs them;
# the original cell used mysql.connector without importing it anywhere.
import os

import mysql.connector

# Connection settings are read from environment variables so that no
# credential is stored in the notebook. Host, user and database fall back to
# the values the demonstration originally used; the password has no default
# baked in and must be supplied through MYSQL_PASSWORD.
mysql_settings = {
    "host": os.environ.get("MYSQL_HOST", "localhost"),
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": os.environ.get("MYSQL_PASSWORD", ""),
    "database": os.environ.get("MYSQL_DATABASE", "Automated_hate_speech"),
}

db = None
try:
    db = mysql.connector.connect(**mysql_settings)
    if db.is_connected():
        print("Connected to MySQL successfully!")
except mysql.connector.Error as exc:
    # This cell is only a connectivity demonstration: report the failure
    # instead of aborting the rest of the notebook.
    print(f"Could not connect to MySQL: {exc}")
finally:
    # Release the connection whether or not the check above succeeded
    if db is not None:
        db.close()

In [None]:
## Key Business Insights

- Automated hate speech detection helps social media platforms proactively filter toxic comments, improving user safety and engagement.
- High co-occurrence of abusive, racist, and religious hate content indicates the need for multi-category moderation strategies.
- The Total Hate Score enables prioritization of highly offensive comments for faster review and action.
- Binary classification of comments into hate vs non-hate supports real-time automated content moderation systems.
- Data-driven hate speech analysis assists organizations in strengthening community guidelines and compliance monitoring.