In [55]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression  # Add Logistic Regression
from sklearn.metrics import accuracy_score, classification_report

In [56]:
import re
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
stopword = set(stopwords.words("english"))
import joblib
from google.colab import files

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [57]:
df = pd.read_csv("/content/twitter_data.csv")

In [58]:
df['labels'] = df['class'].map({0: "Hate Speech Detected", 1: "Offensive language detected", 2: "No hate and offensive speech"})

In [59]:
df = df[['tweet', 'labels']]

In [60]:
lemmatizer = WordNetLemmatizer()

def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\..S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text = " ".join(text)
    text = [lemmatizer.lemmatize(word) for word in text.split(' ')]
    text = " ".join(text)
    return text

df["tweet"] = df["tweet"].apply(clean)

In [61]:
tfidf = TfidfVectorizer(max_features=5000) # Add max_features for better performance
x = tfidf.fit_transform(df["tweet"])
y = np.array(df["labels"])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

# Define the parameter grid for SVM
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Initialize GridSearchCV for SVM
grid_svc = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)
grid_svc.fit(X_train, y_train)

# Print the best parameters for SVM
print(f"Best Parameters for SVM: {grid_svc.best_params_}")

# Use the best estimator for SVM to make predictions
best_svc = grid_svc.best_estimator_

# Train a RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)

# Train a Logistic Regression model
logreg_clf = LogisticRegression(max_iter=1000, random_state=42)  # Add Logistic Regression
logreg_clf.fit(X_train, y_train)

# Create an ensemble model with Logistic Regression
ensemble_clf = VotingClassifier(estimators=[
    ('svc', best_svc), ('rf', rf_clf), ('lr', logreg_clf)], voting='hard')  # Add lr to the ensemble
ensemble_clf.fit(X_train, y_train)

# Evaluate the ensemble model
y_pred = ensemble_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Ensemble Model Accuracy: {accuracy:.2f}")

# Print classification report for more detailed evaluation
print(classification_report(y_test, y_pred))

# Use cross-validation to validate the model
cv_scores = cross_val_score(ensemble_clf, x, y, cv=5, scoring='accuracy')
print(f"Cross-validated Accuracy: {cv_scores.mean():.2f}")

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  11.1s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  10.7s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  11.3s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  12.1s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  11.2s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  18.1s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  21.4s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  18.1s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  17.9s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  18.6s
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time=  10.9s
[CV] END ...................C=0.1, gamma=auto, k

In [62]:
test_data = ["I will kill you", "you are awesome", "you are bad i don't like you",
             "I hate people from that country." , "You are such a useless person." , "Go back to where you came from" ,
             "ou are so stupid and annoying." , "Shut up, you idiot!"]
for text in test_data:
    df = tfidf.transform([text]).toarray()
    print(f"Text: {text} -> Prediction: {ensemble_clf.predict(df)[0]}")

Text: I will kill you -> Prediction: Hate Speech Detected
Text: you are awesome -> Prediction: No hate and offensive speech
Text: you are bad i don't like you -> Prediction: Offensive language detected
Text: I hate people from that country. -> Prediction: Offensive language detected
Text: You are such a useless person. -> Prediction: Offensive language detected
Text: Go back to where you came from -> Prediction: Offensive language detected
Text: ou are so stupid and annoying. -> Prediction: Offensive language detected
Text: Shut up, you idiot! -> Prediction: Offensive language detected


In [63]:
joblib_file = "hate_speech_model.h5"
joblib.dump(ensemble_clf, joblib_file)
print(f"Model saved to {joblib_file}")

# Save the TF-IDF Vectorizer
tfidf_file = "tfidf_vectorizer.pkl"
joblib.dump(tfidf, tfidf_file)
print(f"TF-IDF Vectorizer saved to {tfidf_file}")

files.download(joblib_file)
files.download(tfidf_file)

Model saved to hate_speech_model.h5
TF-IDF Vectorizer saved to tfidf_vectorizer.pkl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>