In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import joblib

# Download required NLTK data

In [2]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading wordnet: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading omw-1.4: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

# Load and preprocess data

In [3]:
data = pd.read_csv(r"P:\PROG\miniproject\Hate-speech-model\data\HateSpeechData.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [4]:
data["labels"] = data["class"].map(
    {0: "Hate Speech", 1: "Offensive Language", 2: "Normal"}
)
data = data[["tweet", "labels"]]

# Text cleaning function

In [5]:
def clean_text(text):
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    text = str(text).lower()
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)

    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return " ".join(tokens)

In [6]:
data["cleaned_tweet"] = data["tweet"].apply(clean_text)

# Split the data

In [7]:
X = data["cleaned_tweet"]
y = data["labels"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create a pipeline

In [8]:
pipeline = Pipeline(
    [
        ("tfidf", TfidfVectorizer(max_features=5000)),
        ("clf", RandomForestClassifier(random_state=42)),
    ]
)

# Define hyperparameters for grid search

In [9]:
param_grid = {
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "clf__n_estimators": [100, 200],
    "clf__max_depth": [None, 10, 20],
    "clf__min_samples_split": [2, 5],
    "clf__min_samples_leaf": [1, 2],
}

In [10]:
# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [None]:
# Get the best model
best_model = grid_search.best_estimator_
print("\nBest Parameters:", grid_search.best_params_)


Best Parameters: {'clf__max_depth': None, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 5, 'clf__n_estimators': 200, 'tfidf__ngram_range': (1, 2)}


In [None]:
# Perform cross-validation
cv_scores = cross_val_score(best_model, X, y, cv=5)
print("\nCross-validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())


Cross-validation Scores: [0.88642324 0.89550131 0.91022796 0.90617433 0.90536723]
Mean CV Score: 0.9007388154757223


In [None]:
joblib.dump(best_model, "best_hate_speech_model.pkl")

['best_hate_speech_model.pkl']

In [None]:
# Function to classify text
def classify_text(text):
    cleaned_text = clean_text(text)
    return best_model.predict([cleaned_text])[0]

In [None]:
# Test the classifier
print("\nClassification Results:")
print("'You are so kind and helpful':", classify_text("You are so kind and helpful"))
print("'I hate you, you're stupid':", classify_text("I hate you, you're stupid"))
print("'This movie sucks':", classify_text("This movie sucks"))
print("'your are bitch':", classify_text("your are bitch"))


Classification Results:
'You are so kind and helpful': Normal
'I hate you, you're stupid': Offensive Language
'This movie sucks': Offensive Language
'your are bitch': Offensive Language


# Predict on the test set

In [None]:
y_pred = best_model.predict(X_test)

# Calculate accuracy

In [None]:
from sklearn.metrics import classification_report, accuracy_score

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.8997377446035909


# Generate classification report

In [None]:
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Classification Report:
                     precision    recall  f1-score   support

       Hate Speech       0.53      0.14      0.23       290
            Normal       0.82      0.90      0.86       835
Offensive Language       0.93      0.96      0.94      3832

          accuracy                           0.90      4957
         macro avg       0.76      0.67      0.68      4957
      weighted avg       0.88      0.90      0.89      4957

