In [15]:
#doing with logistic regression
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


In [14]:
# Load and preview the data
data = pd.read_csv("labeled_data.csv")
data["labels"] = data["class"].map({0: "Hate Speech", 1: "Offensive Speech", 2: "No Hate and Offensive Speech"})
data = data[["tweet", "labels"]]
print(data.head())

                                               tweet  \
0  !!! RT @mayasolovely: As a woman you shouldn't...   
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...   
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...   
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...   
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...   
5  !!!!!!!!!!!!!!!!!!"@T_Madison_x: The shit just...   
6  !!!!!!"@__BrighterDays: I can not just sit up ...   

                         labels  
0  No Hate and Offensive Speech  
1              Offensive Speech  
2              Offensive Speech  
3              Offensive Speech  
4              Offensive Speech  
5              Offensive Speech  
6              Offensive Speech  


In [4]:
# Initialize stopwords and stemmer
nltk.download('stopwords')
stopwords_set = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yuvib\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Define the clean function
def clean(text):
    text = str(text).lower()
    text = re.sub('[.?]', '', text)
    text = re.sub('https?://\S+|www.\S+', '', text)
    text = re.sub('<.?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w\d\w', '', text)
    text = [word for word in text.split(' ') if word not in stopwords_set]
    text = " ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text = " ".join(text)
    return text

In [6]:
data["tweet"] = data["tweet"].apply(clean)

In [7]:
x = np.array(data["tweet"])
y = np.array(data["labels"])


cv = CountVectorizer()
X = cv.fit_transform(x)

from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizing text data using TfidfVectorizer
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(X)


In [8]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [9]:
# Train and evaluate the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

Accuracy: 0.8954639931531972


In [10]:
# Testing the model with example inputs
test_inputs = [
    "You are too bad and I dont like your attitude",
    "It is really awesome",
    "fuck you",
    "you are an idiot",
    "you are killing me",
    "go to hell",
    "you are looking fucking awesome",
    "good morning"
]

In [11]:
for inp in test_inputs:
    inp_transformed = tfidf.transform([inp]).toarray()
    print(f"Input: {inp}")
    print(f"Logistic Regression Prediction: {model.predict(inp_transformed)}")
    print()

Input: You are too bad and I dont like your attitude
Logistic Regression Prediction: ['Offensive Speech']

Input: It is really awesome
Logistic Regression Prediction: ['No Hate and Offensive Speech']

Input: fuck you
Logistic Regression Prediction: ['Offensive Speech']

Input: you are an idiot
Logistic Regression Prediction: ['Offensive Speech']

Input: you are killing me
Logistic Regression Prediction: ['No Hate and Offensive Speech']

Input: go to hell
Logistic Regression Prediction: ['Offensive Speech']

Input: you are looking fucking awesome
Logistic Regression Prediction: ['No Hate and Offensive Speech']

Input: good morning
Logistic Regression Prediction: ['No Hate and Offensive Speech']



In [12]:
# Save the model
import pickle
pickle.dump(model, open('LR_hate_speech.pkl', 'wb'))

# Save the CountVectorizer
with open('count_vectorizer_LR.pkl', 'wb') as file:
    pickle.dump(cv, file)