In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.linear_model import LogisticRegression

# Download NLTK stopwords
stopword = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")


In [None]:

# Load dataset
data = pd.read_csv("data.csv")
data["labels"] = data["class"].map({0: "Hate Speech", 1: "Offensive Speech", 2: "No Hate"})
data = data[["tweet", "labels"]]


In [None]:

# Data cleaning function
def clean(text):
    text = str(text).lower()
    text = re.sub('[.?]', '', text)
    text = re.sub('https?://\S+|www.\S+', '', text)
    text = re.sub('<.?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split() if word not in stopword]
    text = " ".join(text)
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text)


In [None]:

# Clean text data
data["tweet"] = data["tweet"].apply(clean)

# Prepare features and labels
x = np.array(data["tweet"])
y = np.array(data["labels"])

In [None]:

# Use TfidfVectorizer to extract features with max_features=5000
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2), max_df=0.9, min_df=2)
X = vectorizer.fit_transform(x)


In [None]:

# Split dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [None]:

# Define class weights (higher weight to underrepresented class 'Hate Speech')
class_weights = {
    "Hate Speech": 2,  # Higher weight for 'Hate Speech'
    "Offensive Speech": 1,
    "No Hate": 1
}


In [None]:

# Train Logistic Regression model with class weights
model_lr = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
model_lr.fit(X_train, y_train)


In [None]:

# Make predictions on the test data
y_pred_lr = model_lr.predict(X_test)


In [None]:
print("\nClassification Report for Logistic Regression:\n", classification_report(y_test, y_pred_lr))


Classification Report for Logistic Regression:
                   precision    recall  f1-score   support

     Hate Speech       0.31      0.59      0.41       465
         No Hate       0.74      0.92      0.82      1379
Offensive Speech       0.97      0.85      0.91      6335

        accuracy                           0.85      8179
       macro avg       0.67      0.79      0.71      8179
    weighted avg       0.89      0.85      0.86      8179



In [None]:
# Evaluate the Logistic Regression model
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.8474141093043159


In [None]:
from collections import Counter
# Count predictions
pred_counts = Counter(y_pred_lr)
print("Prediction Counts:", pred_counts)


Prediction Counts: Counter({'Offensive Speech': 5580, 'No Hate': 1707, 'Hate Speech': 892})


In [None]:
# Example prediction
example = "You’re less than human because of your race"
example_cleaned = clean(example)
# Example prediction with Logistic Regression
example_transformed_lr = vectorizer.transform([example_cleaned])
print("Logistic Regression Prediction for example text:", model_lr.predict(example_transformed_lr))

Logistic Regression Prediction for example text: ['Hate Speech']


In [16]:
# Example prediction
example = "Go back to your country"
example_cleaned = clean(example)
# Example prediction with Logistic Regression
example_transformed_lr = vectorizer.transform([example_cleaned])
print("Logistic Regression Prediction for example text:", model_lr.predict(example_transformed_lr))

Logistic Regression Prediction for example text: ['Hate Speech']


In [20]:
# Example prediction
example = "hoe"
example_cleaned = clean(example)
# Example prediction with Logistic Regression
example_transformed_lr = vectorizer.transform([example_cleaned])
print("Logistic Regression Prediction for example text:", model_lr.predict(example_transformed_lr))

Logistic Regression Prediction for example text: ['Offensive Speech']


In [26]:
# Example prediction
example = "Let’s work together to solve this issue"
example_cleaned = clean(example)
# Example prediction with Logistic Regression
example_transformed_lr = vectorizer.transform([example_cleaned])
print("Logistic Regression Prediction for example text:", model_lr.predict(example_transformed_lr))

Logistic Regression Prediction for example text: ['No Hate']


In [None]:
# Example prediction
example = "I think you're doing a great job"
example_cleaned = clean(example)
# Example prediction with Logistic Regression
example_transformed_lr = vectorizer.transform([example_cleaned])
print("Logistic Regression Prediction for example text:", model_lr.predict(example_transformed_lr))

Logistic Regression Prediction for example text: ['No Hate']


In [33]:
import joblib

# Sauvegardez le TfidfVectorizer et le modèle dans des fichiers séparés
joblib.dump(model_lr, 'logistic_regression_model.pkl')


['logistic_regression_model.pkl']

In [34]:
joblib.dump(vectorizer, 'TfidfVectorizer.pkl')

['TfidfVectorizer.pkl']