# Hate Speech and Offensive Language Detection
- This script uses a hate speech detection model to classify a given text as hate speech, offensive speech or neither.
- The model is trained using the dataset provided by the authors; I'm using LogisticRegression and DecisionTreeClassifier to train the model.
- There is some class imbalance in the dataset, so I tried to use SMOTE, ADASYN and SMOTENC to balance the dataset.
- The output of this model is still not good enough.

## Library

In [None]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import pickle

import warnings
warnings.filterwarnings('ignore')

## Download the Data

In [None]:
# Download the dataset archive from Kaggle using the Kaggle CLI.
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.
!kaggle datasets download -d thedevastator/hate-speech-and-offensive-language-detection

In [None]:
!unzip hate-speech-and-offensive-language-detection.zip -d data_input

## Loading the Data

In [None]:
# Load the extracted training CSV into a DataFrame and preview the first rows
data = pd.read_csv('data_input/train.csv')
data.head()

In [None]:
# Column names, dtypes and non-null counts of the freshly loaded frame
data.info()

In [None]:
# check for missing values (null count per column)
data.isnull().sum()

In [None]:
# check for duplicates (number of rows that repeat an earlier row)
data.duplicated().sum()

In [None]:
# Look at one raw tweet (row label 100) to get a feel for the text
data.loc[100, 'tweet']

## Data Preprocessing

In [None]:
import nltk
from nltk.corpus import stopwords

In [None]:
# clean the data

# Make sure the NLTK stop-word corpus is present; on a fresh environment
# stopwords.words() raises LookupError when it has not been downloaded yet.
nltk.download('stopwords', quiet=True)

stop_words = set(stopwords.words('english'))
# add some more stop words ('rt' is presumably the retweet marker — it carries no signal)
stop_words.add('rt')

## remove special characters
def remove_special_char(text):
    """Drop every character that is not an ASCII letter, a digit or whitespace."""
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

## remove urls
def remove_urls(text):
    """Remove each run of non-whitespace characters that starts with 'http'."""
    return re.sub(r'http\S+', '', text)

## remove usernames and hashtags
def remove_usernames_hashtags(text):
    """Remove @mentions and #hashtags (the marker plus the word characters after it)."""
    return re.sub(r'@\w+|#\w+', '', text)

## remove extra spaces
def remove_extra_spaces(text):
    """Collapse every run of whitespace into a single space."""
    return re.sub(r'\s+', ' ', text)

##  clean the text
def clean_text(text):
    """Normalise one tweet for vectorisation.

    Steps, in order: lowercase, strip URLs, strip @mentions/#hashtags, strip
    remaining special characters, collapse whitespace, trim the ends.

    The URL and mention/hashtag steps must run BEFORE special characters are
    removed: they rely on '@', '#', ':' and '/' still being present. With the
    previous order the '@' and '#' markers were deleted first, so usernames
    and hashtag words were left in the text.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    text = text.lower()
    text = remove_urls(text)
    text = remove_usernames_hashtags(text)
    text = remove_special_char(text)
    text = remove_extra_spaces(text)
    # Removing leading tokens (e.g. a mention) can leave a stray edge space
    return text.strip()

In [None]:
# apply the cleaning function to the dataset
# (the result goes into a new column; the raw 'tweet' column is left untouched)

data['cleaned_tweet'] = data['tweet'].apply(clean_text)

data.head()

In [None]:
# compare the cleaned tweets with the original tweets (first 10 rows)

data[['tweet', 'cleaned_tweet']].head(10)


In [None]:
# remove stop words

def remove_stop_words(text):
    """Return `text` without the whitespace-separated tokens found in `stop_words`."""
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept_tokens)

# Second cleaning stage: stop words removed from the already-cleaned text
data['cleaned_tweet_two'] = data['cleaned_tweet'].apply(remove_stop_words)

data.head()


In [None]:
# compare both cleaning stages with the original tweet (last 10 rows)

data[['tweet', 'cleaned_tweet', 'cleaned_tweet_two']].tail(10)


In [None]:
# Re-inspect the frame now that the cleaned-text columns have been added
data.info()

In [None]:
# remove short words

def remove_short_words(text):
    """Keep only the whitespace-separated tokens that are longer than two characters."""
    long_tokens = filter(lambda token: len(token) > 2, text.split())
    return ' '.join(long_tokens)

# Third cleaning stage: tokens of one or two characters dropped from the stop-word-free text
data['cleaned_tweet_three'] = data['cleaned_tweet_two'].apply(remove_short_words)

In [None]:
# Side-by-side view of the raw tweet and all three cleaning stages (last 10 rows)
data[['tweet', 'cleaned_tweet', 'cleaned_tweet_two', 'cleaned_tweet_three']].tail(10)


In [None]:
# Re-inspect the frame after the last cleaning stage
data.info()

In [None]:
# Null count per column after cleaning
data.isnull().sum()

In [None]:
# find the most common words in the dataset

from collections import Counter

import matplotlib.pyplot as plt

# Flatten all cleaned tweets into one list of tokens. `words` is bound once,
# directly to the flat list, because the word-cloud cell below joins it.
words = [word for text in data['cleaned_tweet_three'] for word in text.split()]

# most common words in the dataset (general): top 20 as a frame for plotting
word_count = Counter(words)
word_count_df = pd.DataFrame(word_count.most_common(20), columns=['word', 'count'])

# visualize the most common words
plt.figure(figsize=(10, 5))
sns.barplot(x='word', y='count', data=word_count_df)
plt.xticks(rotation=45)
plt.title('20 most common words (all classes)')
plt.show()

In [None]:
# Same token list as the bar chart above, rendered as a word cloud

from wordcloud import WordCloud

wordcloud = WordCloud(
    width=800,
    height=400,
    random_state=21,
    max_font_size=110,
).generate(' '.join(words))

# Show the rendered cloud as an image, without axis ticks
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Now I'm curious: what are the most common words for the neutral class only?
# NOTE(review): class 2 is assumed to be the neutral label from eyeballing the
# data — confirm against the dataset description.

neutral_tweets = data[data['class'] == 2]['cleaned_tweet_three']

# Flat list of every token that appears in a neutral tweet
neutral_words = [word for text in neutral_tweets for word in text.split()]
neutral_word_count = Counter(neutral_words)

wordcloud = WordCloud(width=800, height=400, random_state=21, max_font_size=110).generate(' '.join(neutral_words))

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
# Match the all-classes word cloud: hide the axes and end with plt.show() so
# the cell does not echo the AxesImage repr.
plt.axis('off')
plt.title('Most common words in neutral tweets')
plt.show()

In [None]:
# Bar chart of the 20 most frequent words in neutral tweets

neutral_word_count_df = pd.DataFrame(
    neutral_word_count.most_common(20),
    columns=['word', 'count'],
)

plt.figure(figsize=(10, 5))
sns.barplot(data=neutral_word_count_df, x='word', y='count')
plt.xticks(rotation=45)
plt.show()

Well, that's quite unexpected: why does "trash" belong to the neutral label? Let's take a deeper look.

In [None]:
# show how many tweets per class include the word "trash"

# Build the boolean mask once. The original cell filtered the frame twice and
# discarded the first result (only a cell's last expression is displayed).
contains_trash = data['cleaned_tweet_three'].str.contains('trash')
data.loc[contains_trash, 'class'].value_counts()



In [None]:
# show five random cleaned tweets, untruncated, that include the word "trash"

# Seeded so the same five tweets come back on every run
full_tweets_with_word_trash = data[data['cleaned_tweet_three'].str.contains('trash')]['cleaned_tweet_three'].sample(5, random_state=42)

# Lift the column-width limit only while printing, so the display setting
# does not leak into every later cell.
with pd.option_context('display.max_colwidth', None):
    print(full_tweets_with_word_trash)

After taking a quick look, that wraps up the exploratory data analysis. I believe the neutral class containing the word "trash" is a bit of a bias, but I think it's okay for now.

## Model

In [None]:
# classify the tweets between hate speech, offensive language and neither
# Inputs are the fully cleaned texts, targets are the class ids, both as plain lists

tweet = data['cleaned_tweet_three'].tolist()
label = data['class'].tolist()

In [None]:
# split the data into training and testing set

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified although the classes are imbalanced — consider stratify=label.
X_train, X_test, y_train, y_test = train_test_split(
    tweet,
    label,
    test_size=0.2,
    random_state=42,
)

In [None]:
# vectorize the cleaned_tweet_three column (vocabulary capped at 5000 terms)
tfidf = TfidfVectorizer(max_features=5000)

# Fit the vocabulary on the training texts only, then reuse it for the test texts
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# see the shape of the vectorized data (train, test)

X_train_tfidf.shape, X_test_tfidf.shape

In [None]:
# Baseline: logistic regression with default settings on the TF-IDF features

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# fit() returns the estimator, so construction and training can be chained
model = LogisticRegression().fit(X_train_tfidf, y_train)

# predict the held-out tweets
y_pred = model.predict(X_test_tfidf)

# per-class precision / recall / F1 on the test set
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Same classifier, but fed raw term counts through a single Pipeline object

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

# The pipeline takes the raw text lists; vectorizing happens inside it
pipe = Pipeline(steps=[
    ('vectorizer', CountVectorizer(max_features=5000)),
    ('model', LogisticRegression()),
]).fit(X_train, y_train)

# predict the held-out tweets
y_pred_pipe = pipe.predict(X_test)

# per-class precision / recall / F1 on the test set
print(classification_report(y_true=y_test, y_pred=y_pred_pipe))

In [None]:
# test the model with a hate speech tweet

# Use a dedicated name: rebinding `tweet` would overwrite the training corpus
# list defined earlier and break a re-run of the train/test split cell.
sample_tweet = ['I hate you']

# Apply the same cleaning steps the training text went through before vectorizing
sample_clean = [remove_short_words(remove_stop_words(clean_text(text))) for text in sample_tweet]

tweet_tfidf = tfidf.transform(sample_clean)
model.predict(tweet_tfidf)

In [None]:
# test the model with an offensive speech tweet

# Use a dedicated name: rebinding `tweet` would overwrite the training corpus
# list defined earlier and break a re-run of the train/test split cell.
sample_tweet = ['You are so stupid']

# Apply the same cleaning steps the training text went through before vectorizing
sample_clean = [remove_short_words(remove_stop_words(clean_text(text))) for text in sample_tweet]

tweet_tfidf = tfidf.transform(sample_clean)
model.predict(tweet_tfidf)


In [None]:
# test the model with a neutral tweet

# Use a dedicated name: rebinding `tweet` would overwrite the training corpus
# list defined earlier and break a re-run of the train/test split cell.
sample_tweet = ['I am learning data science']

# Apply the same cleaning steps the training text went through before vectorizing
sample_clean = [remove_short_words(remove_stop_words(clean_text(text))) for text in sample_tweet]

tweet_tfidf = tfidf.transform(sample_clean)
model.predict(tweet_tfidf)


Based on the results, I'm assuming that 0 is hate speech, 1 is offensive language and 2 is neutral.

In [None]:
# Judging by the class 0 scores of the two previous models, the classes are clearly imbalanced, so I will try SMOTE to balance them

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

# Only the training matrix is resampled; the test features are not passed to SMOTE
X_train_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)


In [None]:
# Logistic regression again, this time trained on the SMOTE-balanced data

model_smote = LogisticRegression().fit(X_train_smote, y_train_smote)

# predict the original (non-resampled) test features
y_pred_smote = model_smote.predict(X_test_tfidf)

# per-class precision / recall / F1 on the test set
print(classification_report(y_true=y_test, y_pred=y_pred_smote))


So... the model did improve on recall, but the value is still not good enough. I will try hyperparameter tuning to improve the model.

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}

# LogisticRegression's default 'lbfgs' solver does not support the l1 penalty,
# so every 'l1' candidate failed to fit and was scored NaN (silently, because
# warnings are suppressed in this notebook). 'saga' supports both penalties
# and multiclass targets. It is stochastic, hence the seed, and it needs more
# iterations than the default to converge.
# NOTE(review): cross-validating on SMOTE-resampled data lets synthetic
# samples appear in both the training and validation folds — confirm whether
# resampling should happen inside each fold instead.
grid_search = GridSearchCV(
    LogisticRegression(solver='saga', max_iter=1000, random_state=42),
    param_grid,
    cv=5,
    n_jobs=-1,  # fit the candidates in parallel; saga is slower than lbfgs
)
grid_search.fit(X_train_smote, y_train_smote)

# best parameters
print('Best parameters', grid_search.best_params_)

# best score (mean cross-validated score of the best candidate)
print('Best score', grid_search.best_score_)

# make predictions with the refitted best estimator
y_pred_grid = grid_search.predict(X_test_tfidf)


In [None]:
# Per-class test metrics for the best estimator found by the grid search
print(classification_report(y_test, y_pred_grid))

Never mind, the model got worse on the class 0 score. I will try a different model.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the SMOTE-balanced training data.
# Seeded (like the split and the resamplers) so the reported scores are
# reproducible from run to run.
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train_smote, y_train_smote)

# make predictions on the original (non-resampled) test features
y_pred_rf = model_rf.predict(X_test_tfidf)

# evaluate the model
print(classification_report(y_test, y_pred_rf))

Still not good enough. I will try another model.


In [None]:
# address the class imbalance with ADASYN instead of SMOTE

from imblearn.over_sampling import ADASYN

adasyn = ADASYN(random_state=42)

# Only the training matrix is resampled
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train_tfidf, y_train)

# train the model with the balanced data
# Seeded so the reported scores are reproducible from run to run
model_adasyn = RandomForestClassifier(random_state=42)
model_adasyn.fit(X_train_adasyn, y_train_adasyn)

# make predictions on the original (non-resampled) test features
y_pred_adasyn = model_adasyn.predict(X_test_tfidf)

# evaluate the model
print(classification_report(y_test, y_pred_adasyn))


In summary, the main difference between ADASYN and SMOTE is in their approach to oversampling:

- SMOTE generates synthetic samples by interpolating between minority-class samples and their nearest minority-class neighbours, treating every minority sample equally,
- while ADASYN places more emphasis on generating synthetic samples in areas where the classification is difficult, thereby adapting the oversampling based on the dataset’s specific challenges. However, the result doesn't change much for the class 0 scores overall.

Next, **SMOTENC** (Synthetic Minority Over-sampling Technique for Nominal and Continuous features) is an extension of the SMOTE algorithm specifically designed to handle datasets with both categorical and numerical features.

In [None]:
# generate more data so the model can learn more on the minority class

from imblearn.over_sampling import SMOTENC

# NOTE(review): categorical_features=[0] declares the first TF-IDF column as
# categorical, but X_train_tfidf comes from TfidfVectorizer, whose columns are
# all continuous weights. Left unchanged so the experiment stays comparable —
# confirm whether plain SMOTE was intended.
smotenc = SMOTENC(categorical_features=[0], random_state=42)

X_train_smotenc, y_train_smotenc = smotenc.fit_resample(X_train_tfidf, y_train)

# train the model with the balanced data
# Seeded so the reported scores (and the model pickled at the end of the
# notebook) are reproducible from run to run
model_smotenc = RandomForestClassifier(random_state=42)
model_smotenc.fit(X_train_smotenc, y_train_smotenc)

# make predictions on the original (non-resampled) test features
y_pred_smotenc = model_smotenc.predict(X_test_tfidf)

# evaluate the model
print(classification_report(y_test, y_pred_smotenc))


## Summary

### First Model (Logistic Regression):
|           | Precision | Recall | F1-Score | Support |
|-----------|-----------|--------|----------|---------|
| Class 0   | 0.49      | 0.16   | 0.24     | 290     |
| Class 1   | 0.91      | 0.96   | 0.94     | 3832    |
| Class 2   | 0.83      | 0.81   | 0.82     | 835     |
|-----------|-----------|--------|----------|---------|
| Accuracy  |           |        | 0.89     | 4957    |
| Macro Avg | 0.75      | 0.64   | 0.66     | 4957    |
| Weighted Avg | 0.87   | 0.89   | 0.88     | 4957    |


### Model After SMOTENC (Random Forest):
|           | Precision | Recall | F1-Score | Support |
|-----------|-----------|--------|----------|---------|
| Class 0   | 0.38      | 0.39   | 0.39     | 290     |
| Class 1   | 0.94      | 0.92   | 0.93     | 3832    |
| Class 2   | 0.78      | 0.85   | 0.81     | 835     |
|-----------|-----------|--------|----------|---------|
| Accuracy  |           |        | 0.87     | 4957    |
| Macro Avg | 0.70      | 0.72   | 0.71     | 4957    |
| Weighted Avg | 0.88   | 0.88   | 0.88     | 4957    |

- Precision: The precision values for class 0 decreased from 0.49 to 0.38 after applying SMOTENC. However, precision values for classes 1 and 2 remained relatively high in both models.

- Recall: The recall for class 0 increased from 0.16 to 0.39 after applying SMOTENC, indicating an improvement in capturing true positive instances for this class. Recall values for classes 1 and 2 remained broadly stable in both models.

- F1-score: The F1-score for class 0 increased from 0.24 to 0.39 after applying SMOTENC, because the gain in recall outweighed the drop in precision for this class. F1-scores for classes 1 and 2 remained consistent in both models.

- Accuracy: The overall accuracy decreased slightly from 0.89 to 0.87 after applying SMOTENC.

In [None]:
## class weighting to adjust for the class imbalance
# No resampling here: class_weight='balanced' reweights the classes on the
# original training data. Seeded so the reported scores are reproducible.
model_weighted = RandomForestClassifier(class_weight='balanced', random_state=42)

model_weighted.fit(X_train_tfidf, y_train)

# make predictions
y_pred_weighted = model_weighted.predict(X_test_tfidf)

# evaluate the model
print(classification_report(y_test, y_pred_weighted))

Currently, none of the improvements I made help the score for class 0. ChatGPT recommends using ensemble methods, including bagging, boosting or stacking, to combine multiple classifiers.

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble of 100 entropy-criterion decision trees (depth capped at 100),
# trained on the original, non-resampled TF-IDF features
bagging_classifier = BaggingClassifier(
    DecisionTreeClassifier(criterion='entropy', max_depth=100, random_state=42),
    n_estimators=100,
    random_state=42,
)
bagging_classifier.fit(X_train_tfidf, y_train)

# Predict the held-out tweets
y_pred_bagging = bagging_classifier.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the test set
print(classification_report(y_true=y_test, y_pred=y_pred_bagging))

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over the same base tree configuration as the bagging experiment,
# trained on the original, non-resampled TF-IDF features
adaboost_classifier = AdaBoostClassifier(
    DecisionTreeClassifier(criterion='entropy', max_depth=100, random_state=42),
    n_estimators=100,
    random_state=42,
)
adaboost_classifier.fit(X_train_tfidf, y_train)

# Predict the held-out tweets
y_pred_adaboost = adaboost_classifier.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the test set
print(classification_report(y_true=y_test, y_pred=y_pred_adaboost))

In [None]:
# visualize the best model performance

plt.figure(figsize=(10, 5))
# .iloc[:-1] drops the last metric row ('support') before transposing,
# presumably so the raw counts do not dominate the colour scale.
sns.heatmap(pd.DataFrame(classification_report(y_test, y_pred_smotenc, output_dict=True)).iloc[:-1, :].T, annot=True)
# y_pred_smotenc comes from the RandomForestClassifier trained on SMOTENC
# data, so the title names that model (it previously said DecisionTreeClassifier).
plt.title('RandomForestClassifier with SMOTENC')
plt.show()


## Saving model

In [None]:
# Saving both the model and tfidf in the model folder

from pathlib import Path

model_dir = Path('model')
# Create the target folder if it is missing; open() would otherwise raise FileNotFoundError
model_dir.mkdir(parents=True, exist_ok=True)

# Context managers guarantee each file is flushed and closed after writing
with open(model_dir / 'model_smotenc.pkl', 'wb') as model_file:
    pickle.dump(model_smotenc, model_file)
with open(model_dir / 'tfidf.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)