In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
import re
import string

import joblib
import nltk
from google.colab import files
from nltk.corpus import stopwords
from nltk.util import pr

# English Snowball stemmer used by the text-cleaning step.
stemmer = nltk.SnowballStemmer("english")

# English stop words as a set for fast membership tests.
# The corpus is not bundled with the nltk package: on a runtime where it has
# not been fetched yet, `stopwords.words` raises LookupError, so download it
# once and retry instead of failing on a fresh kernel.
try:
  stopword = set(stopwords.words("english"))
except LookupError:
  nltk.download("stopwords")
  stopword = set(stopwords.words("english"))

In [None]:
# Read the tweet dataset from the Colab filesystem and preview its first rows.
df = pd.read_csv(filepath_or_buffer="/content/twitter_data.csv")
print(df.head(n=5))

In [None]:
# Translate the numeric codes in the `class` column into readable label strings,
# stored in a new `labels` column.
class_to_label = {
  0: "Hate Speech Detected",
  1: "Offensive language detected",
  2: "No hate and offensive speech",
}
df['labels'] = df['class'].map(class_to_label)
print(df.head())

In [None]:
# Narrow the frame to the two columns the model uses: the tweet text and its label.
df = df.loc[:, ['tweet', 'labels']]
df.head()

In [None]:
def clean(text):
  """Normalize a raw tweet into a space-separated string of stemmed tokens.

  Steps, in order: lowercase; remove [bracketed] spans, URLs, <tag>-like
  spans, ASCII punctuation, newlines and any token containing a digit; drop
  English stop words; stem what is left.

  Args:
    text: The tweet. Any object is accepted; it is coerced with ``str()``.

  Returns:
    The cleaned text as a single string. Tokens are split and re-joined on a
    single space, so runs of spaces in the input survive as empty tokens.
  """
  text = str(text).lower()
  text = re.sub(r'\[.*?\]', '', text)
  # URLs must go before punctuation is stripped, otherwise "://" and "." are
  # already gone. The second alternative previously read `www\..S+` (any
  # character followed by literal "S"), which never matched lowercase text.
  text = re.sub(r'https?://\S+|www\.\S+', '', text)
  text = re.sub(r'<.*?>+', '', text)
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
  # NOTE: newlines are deleted, not replaced by a space, so two words
  # separated only by a newline end up merged into one token.
  text = re.sub(r'\n', '', text)
  text = re.sub(r'\w*\d\w*', '', text)
  kept_words = [word for word in text.split(' ') if word not in stopword]
  return " ".join(stemmer.stem(word) for word in kept_words)

# Run every tweet through `clean`, overwriting the raw text column with the
# normalized version, then preview the result.
df["tweet"] = df["tweet"].map(clean)
print(df.head())

In [None]:
# Inputs are the cleaned tweet strings; targets are the label strings.
x = np.array(df["tweet"])
y = np.array(df["labels"])

# Bag-of-words encoding: `x` is rebound from the raw strings to the
# token-count matrix produced by the fitted vectorizer.
cv = CountVectorizer()
x = cv.fit_transform(x)
X_train, X_test, y_train, y_test = train_test_split(x , y , test_size = 0.33 , random_state=42)
# Seed the tree as well as the split: the tree builder is randomized, so
# without a fixed random_state the fitted model and its accuracy can change
# from one run to the next.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Score the fitted classifier on the held-out split and report the accuracy
# rounded to two decimals.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Persist the fitted classifier with joblib. Despite the ".h5" extension the
# file is written by joblib.dump, not in HDF5 format, so it has to be read
# back with joblib.load.
joblib_file = "hate_speech_model.h5"
joblib.dump(value=clf, filename=joblib_file)
print(f"Model saved to {joblib_file}")

# Persist the fitted CountVectorizer next to the model; its vocabulary is
# needed to encode new text the same way as the training data.
cv_file = "count_vectorizer.pkl"
joblib.dump(value=cv, filename=cv_file)
print(f"CountVectorizer saved to {cv_file}")

# Hand both artifacts to the google.colab `files` helper for download,
# model first, then vectorizer.
for artifact_path in (joblib_file, cv_file):
  files.download(artifact_path)


In [None]:
# Preprocess the sample with `clean`, exactly as the training tweets were,
# so its tokens match the stemmed, stop-word-free vocabulary held by `cv`.
# The features go into their own variable rather than `df`, which would
# overwrite the tweet DataFrame.
test_data = "I will kill you"
sample_features = cv.transform([clean(test_data)]).toarray()
print(clf.predict(sample_features))

In [None]:
# Preprocess the sample with `clean`, exactly as the training tweets were:
# without stemming, a word like "awesome" cannot match its stemmed form in
# the vocabulary held by `cv`. The features go into their own variable
# rather than `df`, which would overwrite the tweet DataFrame.
test_data = "you are awesome"
sample_features = cv.transform([clean(test_data)]).toarray()
print(clf.predict(sample_features))

In [None]:
# Preprocess the sample with `clean`, exactly as the training tweets were,
# so its tokens match the stemmed, stop-word-free vocabulary held by `cv`.
# The features go into their own variable rather than `df`, which would
# overwrite the tweet DataFrame.
test_data = "you are bad i don't like you"
sample_features = cv.transform([clean(test_data)]).toarray()
print(clf.predict(sample_features))