In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import re
import nltk
import string
from nltk.corpus import stopwords

In [2]:
# Download NLTK data
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yashp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read the dataset (Make sure the dataset.csv is in the same directory)
hate = pd.read_csv("dataset.csv") # hate is variable

In [4]:
hate.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [5]:
hate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


In [6]:
hate.describe()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class
count,24783.0,24783.0,24783.0,24783.0,24783.0,24783.0
mean,12681.192027,3.243473,0.280515,2.413711,0.549247,1.110277
std,7299.553863,0.88306,0.631851,1.399459,1.113299,0.462089
min,0.0,3.0,0.0,0.0,0.0,0.0
25%,6372.5,3.0,0.0,2.0,0.0,1.0
50%,12703.0,3.0,0.0,3.0,0.0,1.0
75%,18995.5,3.0,0.0,3.0,0.0,1.0
max,25296.0,9.0,7.0,9.0,9.0,2.0


In [7]:
hate["labels"] = hate["class"].map({0: "Hate Speech", 1: "Offensive Language", 2: "No Hate and Offensive"})


In [8]:
hate = hate[["tweet", "labels"]]

In [9]:
# installing necessary components for text cleaning
stemmer = nltk.SnowballStemmer("english")
stopword = set(stopwords.words('english'))

In [10]:
# Define a function to clean the text
def clean(text):
    text = str(text).lower()  # Lowercase the text
    text = re.sub('\[.*?\]', '', text)  # Remove text in square brackets
    text = re.sub('https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub('<.*?>+', '', text)  # Remove HTML tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    text = re.sub('\n', '', text)  # Remove newlines
    text = re.sub('\w*\d\w*', '', text)  # Remove words containing numbers
    text = [word for word in text.split() if word not in stopword]  # Remove stopwords
    text = " ".join(text)
    text = [stemmer.stem(word) for word in text.split()]  # Stemming the words
    text = " ".join(text)
    return text

  text = re.sub('\[.*?\]', '', text)  # Remove text in square brackets
  text = re.sub('https?://\S+|www\.\S+', '', text)  # Remove URLs
  text = re.sub('\w*\d\w*', '', text)  # Remove words containing numbers


In [11]:
# Applying the cleaning function to the tweet column
hate["tweet"] = hate["tweet"].apply(clean)

In [12]:
# Define feature (X) and label (y)
x = np.array(hate["tweet"])
y = np.array(hate["labels"])


In [13]:
# Vectorize the text data using CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(x)


In [14]:
# Spliting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2529)

### Decision Tree Classifer

In [15]:
# train a Decision Tree Classifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

In [16]:
# accuracy
print("Model Accuracy: ", clf.score(X_test, y_test))

Model Accuracy:  0.8753194351042367


### Accuracy = 87.50

In [17]:

# function to detect hate speech
def hate_speech_detection(tweet):
    sample = tweet
    data = cv.transform([sample]).toarray()
    prediction = clf.predict(data)
    print("Prediction for the tweet: ", prediction[0])

##### Testing the function with an example tweet

In [18]:
hate_speech_detection("can you please help me")

Prediction for the tweet:  No Hate and Offensive


In [19]:
hate_speech_detection(" I got too much shit going on")

Prediction for the tweet:  Offensive Language
