In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.naive_bayes import GaussianNB,ComplementNB,MultinomialNB
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.neighbors import KNeighborsClassifier,LocalOutlierFactor
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,accuracy_score, f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
import re

In [None]:
# Folder that holds the dataset files. The original absolute path is kept as the
# default so existing runs are unaffected; set HATESPEECH_DATA_DIR to run the
# notebook on another machine without editing code.
DATA_DIR = os.environ.get('HATESPEECH_DATA_DIR', r'C:\Users\Bhavya\Desktop\HateSpeechDataset')
df=pd.read_csv(os.path.join(DATA_DIR, 'labeled_data.csv'))
df.head()

In [None]:
def remove_and_extract_rt_username(text):
    """Strip a retweet marker such as ``RT @user:`` from ``text``.

    Returns a ``(text_without_marker, marker)`` tuple. The marker is the first
    match of ``RT\\s+@[\\w]+:``; every occurrence of that exact substring is
    removed. When no marker is present the text is returned unchanged together
    with ``None``.
    """
    match = re.search(r'RT\s+@[\w]+:', text)
    if match is None:
        return text, None
    marker = match.group()
    return text.replace(marker, ''), marker

def extract_hashtags(text):
    """Return every ``#word`` token found in ``text``, in order of appearance."""
    return [found.group() for found in re.finditer(r'#\w+', text)]

def remove_non_ascii(text):
    """Return ``text`` with every character outside the ASCII range dropped."""
    # Characters that cannot be encoded as ASCII are skipped by errors='ignore'.
    ascii_bytes = text.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')

# Single definition of what counts as a URL, shared by extract_urls() and
# remove_urls() so the two can never drift apart. The pattern itself is the
# one both functions used before: "http://" or "https://" followed by one or
# more letters, digits, characters in the range '$'..'_' (which covers '/',
# ':', '?', '=' among others), '@', '.', '&', '+', '!', '*', '\', '(', ')',
# ',' or %XX escapes. Compiled once instead of on every call.
_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)

def extract_urls(text):
    """Return the list of URLs found in ``text`` (empty list when there are none)."""
    return _URL_RE.findall(text)

def remove_urls(text):
    """Return ``text`` with every URL deleted; surrounding whitespace is left as is."""
    return _URL_RE.sub('', text)

def remove_punctuation(s):
    """Delete every character of ``s`` that is neither a word character nor whitespace."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', s)

def remove_stopwords(s, stop_words=None):
    """Drop stop words from a whitespace-separated string.

    Parameters
    ----------
    s : str
        Text to filter. It is split on whitespace; matching is exact and
        case-sensitive.
    stop_words : collection of str, optional
        Words to remove. When omitted, a notebook-level ``stop_words`` variable
        is used if one exists; otherwise the NLTK English stop-word list is
        loaded (this needs the NLTK ``stopwords`` corpus to be available) and
        stored as the notebook-level ``stop_words`` so it is read only once.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        # Previously this function read a global `stop_words` that no cell in
        # the notebook defines, so calling it raised NameError. Keep honouring
        # such a global when present, and create it on first use otherwise.
        stop_words = globals().get('stop_words')
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
        globals()['stop_words'] = stop_words
    return ' '.join(word for word in s.split() if word not in stop_words)

def remove_symbols(s):
    """Keep only ASCII letters and whitespace in ``s``; digits, underscores and symbols are removed."""
    not_letter_or_space = re.compile(r'[^a-zA-Z\s]')
    return not_letter_or_space.sub('', s)

# Shared NLTK normalisers, created once and reused by stem_text() and
# lemmatize_text() below.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
def stem_text(s):
    """Stem each whitespace-separated word of ``s`` with the shared ``stemmer`` and re-join with single spaces."""
    return ' '.join(map(stemmer.stem, s.split()))
def lemmatize_text(s):
    """Lemmatize each whitespace-separated word of ``s`` with the shared ``lemmatizer`` and re-join with single spaces."""
    return ' '.join(map(lemmatizer.lemmatize, s.split()))


In [None]:
# --- Tweet preprocessing -----------------------------------------------------
# Each step writes a new column so intermediate results stay inspectable.

# 1. Pull the hashtags out into their own column, then delete them from the text.
df['Hashtags'] = df['tweet'].apply(extract_hashtags)
df['text_without_hashtag'] = df['tweet'].apply(lambda x: re.sub(r'#\w+', '', x))

# 2. Split off the "RT @user:" marker (None where the tweet has none).
df['text_without_hashtag_rt'], df['rt_username'] = zip(*df['text_without_hashtag'].apply(remove_and_extract_rt_username))

# 3. Record the URLs, then remove them. 'cleaned_text' has the same content as
#    'text_without_urls', so it is copied rather than running the URL regex a
#    second time over the same input.
df['urls'] = df['text_without_hashtag_rt'].apply(extract_urls)
df['text_without_urls'] = df['text_without_hashtag_rt'].apply(remove_urls)
df['cleaned_text'] = df['text_without_urls']

# 4. Strip punctuation. regex=True is required: since pandas 2.0 the default is
#    a literal replace, which would look for the text "[^\w\s]" and change
#    nothing. The raw string avoids the invalid-escape warning for "\w".
df['cleaned_text2']=df['cleaned_text'].str.replace(r'[^\w\s]', '', regex=True)

# 5. Lower-case, then keep letters and whitespace only.
df['cleaned_text3']=df['cleaned_text2'].str.lower()
df['cleaned_text4']=df['cleaned_text3'].apply(remove_symbols)

# 6. Two alternative normalisations of the cleaned text.
df['stemmed']=df['cleaned_text4'].apply(stem_text)
df['lemmatized']=df['cleaned_text4'].apply(lemmatize_text)

In [None]:
# Preview the first rows again, now including the columns added by preprocessing.
df.head()

In [None]:
# --- Bag-of-words experiment ---------------------------------------------------
# Token-count features from the fully cleaned text, stratified 70/30 split.
# NOTE(review): the vectorizer is fitted on all rows before the split, so test
# rows contribute to the vocabulary. Fit on the training rows only if a strict
# hold-out estimate is needed.
x=df['cleaned_text4'].values
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(x)
y=df['class']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=51,stratify=y)

# Candidate models, keyed by the label used in the printed reports and the CSV.
models = dict()
# Fitted estimator -> test accuracy, for models eligible for the voting ensemble.
results=dict()
models['Logistic Regression'] = LogisticRegression(max_iter=5000)
models['Multinomial Naive Bayes'] = MultinomialNB()
models['Complement Naive Bayes'] = ComplementNB(force_alpha=True)
models['KMeans'] = KMeans(n_clusters=2, n_init=10, random_state=59)
models['XGB']= XGBClassifier(n_estimators=500)
models['Support Vector Machine'] = SVC(kernel = 'sigmoid', gamma='scale',probability=True)
models['Decision Tree'] = DecisionTreeClassifier(max_depth=100)
models['kNN'] = KNeighborsClassifier()
models['SGD']=SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=100, tol=None)
models['adaboost']=AdaBoostClassifier()
models['Random Forest'] = RandomForestClassifier(n_estimators=100)

# Models that must never be handed to the soft-voting ensemble: hinge-loss SGD
# has no predict_proba, and KMeans is a clusterer rather than a classifier (its
# fit ignores the labels and its predictions are cluster ids). Either one would
# make the VotingClassifier fail if it landed in the top three. Both are still
# trained and reported below.
SOFT_VOTE_EXCLUDED = {'SGD', 'KMeans'}

print("Training Models:\n")
for name, model in models.items():
    model.fit(x_train,y_train)
    print("model "+str(name)+" trained")

print("Test Set Prediction:\n")
model_names,accuracies,precisions_w,recalls_w,f1_scores_w,precisions_m,recalls_m,f1_scores_m = [],[],[],[],[],[],[],[]
for name, model in models.items():
    print(name)
    y_pred=model.predict(x_test)
    print(classification_report(y_test,y_pred))
    accuracy = accuracy_score(y_test, y_pred)
    if name not in SOFT_VOTE_EXCLUDED:
        results[model]=accuracy
    precision_w = precision_score(y_test, y_pred,average='weighted')
    recall_w = recall_score(y_test, y_pred,average='weighted')
    f1_w = f1_score(y_test, y_pred,average='weighted')
    precision_m = precision_score(y_test, y_pred,average='macro')
    recall_m = recall_score(y_test, y_pred,average='macro')
    f1_m = f1_score(y_test, y_pred,average='macro')
    model_names.append(name)
    accuracies.append(accuracy)
    precisions_w.append(precision_w)
    recalls_w.append(recall_w)
    f1_scores_w.append(f1_w)
    precisions_m.append(precision_m)
    recalls_m.append(recall_m)
    f1_scores_m.append(f1_m)

# Soft-voting ensemble over the three most accurate eligible models. The
# VotingClassifier re-fits its own copies of the estimators on the training set.
top_3_keys_bow = sorted(results, key=lambda k: results[k], reverse=True)[:3]
c1=top_3_keys_bow[0]
c2=top_3_keys_bow[1]
c3=top_3_keys_bow[2]
models['ensemble voting'] = VotingClassifier (estimators=[('clf1',c1), ('clf2',c2),('clf3',c3)], voting='soft')
models['ensemble voting'].fit(x_train, y_train)
y_pred = models['ensemble voting'].predict(x_test)
print('-'*20+'VotingClassifier'+'-'*20)
print(classification_report(y_test, y_pred,digits=5))
cf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision_w = precision_score(y_test, y_pred,average='weighted')
recall_w = recall_score(y_test, y_pred,average='weighted')
f1_w = f1_score(y_test, y_pred,average='weighted')
precision_m = precision_score(y_test, y_pred,average='macro')
recall_m = recall_score(y_test, y_pred,average='macro')
f1_m = f1_score(y_test, y_pred,average='macro')
model_names.append('ensemble voting')
accuracies.append(accuracy)
precisions_w.append(precision_w)
recalls_w.append(recall_w)
f1_scores_w.append(f1_w)
precisions_m.append(precision_m)
recalls_m.append(recall_m)
f1_scores_m.append(f1_m)

# One row per model (ensemble included), best accuracy first, saved to Bow.csv.
metrics_df = pd.DataFrame({
    'Model Name': model_names,
    'Accuracy': accuracies,
    'Precision Weighted': precisions_w,
    'Recall Weighted': recalls_w,
    'F1-Score Weighted': f1_scores_w,
    'Precision Macro': precisions_m,
    'Recall Macro': recalls_m,
    'F1-Score Macro': f1_scores_m
})
metrics_df.sort_values(by=['Accuracy'], ascending=False, inplace=True)
metrics_df.to_csv("Bow.csv")

In [None]:
# --- TF-IDF experiment ---------------------------------------------------------
# Same protocol as the bag-of-words run, with TF-IDF weighted features.
# NOTE(review): the vectorizer is fitted on all rows before the split, so test
# rows contribute to the vocabulary and IDF weights. Fit on the training rows
# only if a strict hold-out estimate is needed.
x=df['cleaned_text4'].values
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(x)
y=df['class']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=51,stratify=y)

# Candidate models, keyed by the label used in the printed reports and the CSV.
models = dict()
# Fitted estimator -> test accuracy, for models eligible for the voting ensemble.
results=dict()
# The estimator is LogisticRegression; it was previously labelled
# "Linear Regression", which put the wrong model name into TFIDF.csv.
models['Logistic Regression'] = LogisticRegression(max_iter=5000)
models['Multinomial Naive Bayes'] = MultinomialNB()
models['Complement Naive Bayes'] = ComplementNB(force_alpha=True)
models['KMeans'] = KMeans(n_clusters=2, n_init=10, random_state=59)
models['XGB']= XGBClassifier(n_estimators=500)
models['Support Vector Machine'] = SVC(kernel = 'sigmoid', gamma='scale',probability=True)
models['Decision Tree'] = DecisionTreeClassifier(max_depth=100)
models['kNN'] = KNeighborsClassifier()
models['SGD']=SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=100, tol=None)
models['adaboost']=AdaBoostClassifier()
models['Random Forest'] = RandomForestClassifier(n_estimators=100)

# Models that must never be handed to the soft-voting ensemble: hinge-loss SGD
# has no predict_proba, and KMeans is a clusterer rather than a classifier (its
# fit ignores the labels and its predictions are cluster ids). Either one would
# make the VotingClassifier fail if it landed in the top three. Both are still
# trained and reported below.
SOFT_VOTE_EXCLUDED = {'SGD', 'KMeans'}

print("Training Models:\n")
for name, model in models.items():
    model.fit(x_train,y_train)
    print("model "+str(name)+" trained")

print("Test Set Prediction:\n")
model_names,accuracies,precisions_w,recalls_w,f1_scores_w,precisions_m,recalls_m,f1_scores_m = [],[],[],[],[],[],[],[]
for name, model in models.items():
    print(name)
    y_pred=model.predict(x_test)
    print(classification_report(y_test,y_pred))
    accuracy = accuracy_score(y_test, y_pred)
    if name not in SOFT_VOTE_EXCLUDED:
        results[model]=accuracy
    precision_w = precision_score(y_test, y_pred,average='weighted')
    recall_w = recall_score(y_test, y_pred,average='weighted')
    f1_w = f1_score(y_test, y_pred,average='weighted')
    precision_m = precision_score(y_test, y_pred,average='macro')
    recall_m = recall_score(y_test, y_pred,average='macro')
    f1_m = f1_score(y_test, y_pred,average='macro')
    model_names.append(name)
    accuracies.append(accuracy)
    precisions_w.append(precision_w)
    recalls_w.append(recall_w)
    f1_scores_w.append(f1_w)
    precisions_m.append(precision_m)
    recalls_m.append(recall_m)
    f1_scores_m.append(f1_m)

# Soft-voting ensemble over the three most accurate eligible models. The
# VotingClassifier re-fits its own copies of the estimators on the training set.
top_3_keys_tfidf = sorted(results, key=lambda k: results[k], reverse=True)[:3]
c1=top_3_keys_tfidf[0]
c2=top_3_keys_tfidf[1]
c3=top_3_keys_tfidf[2]
models['ensemble voting'] = VotingClassifier (estimators=[('clf1',c1), ('clf2',c2),('clf3',c3)], voting='soft')
models['ensemble voting'].fit(x_train, y_train)
y_pred = models['ensemble voting'].predict(x_test)
print('-'*20+'VotingClassifier'+'-'*20)
print(classification_report(y_test, y_pred,digits=5))
cf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision_w = precision_score(y_test, y_pred,average='weighted')
recall_w = recall_score(y_test, y_pred,average='weighted')
f1_w = f1_score(y_test, y_pred,average='weighted')
precision_m = precision_score(y_test, y_pred,average='macro')
recall_m = recall_score(y_test, y_pred,average='macro')
f1_m = f1_score(y_test, y_pred,average='macro')
model_names.append('ensemble voting')
accuracies.append(accuracy)
precisions_w.append(precision_w)
recalls_w.append(recall_w)
f1_scores_w.append(f1_w)
precisions_m.append(precision_m)
recalls_m.append(recall_m)
f1_scores_m.append(f1_m)

# One row per model (ensemble included), best accuracy first, saved to TFIDF.csv.
metrics_df = pd.DataFrame({
    'Model Name': model_names,
    'Accuracy': accuracies,
    'Precision Weighted': precisions_w,
    'Recall Weighted': recalls_w,
    'F1-Score Weighted': f1_scores_w,
    'Precision Macro': precisions_m,
    'Recall Macro': recalls_m,
    'F1-Score Macro': f1_scores_m
})
metrics_df.sort_values(by=['Accuracy'], ascending=False, inplace=True)
metrics_df.to_csv("TFIDF.csv")

In [None]:
# Pre-trained word vectors: one entry per line, "<word> <v1> <v2> ...",
# separated by single spaces. The original absolute location stays the default;
# set HATESPEECH_DATA_DIR to point the notebook at another folder.
GLOVE_PATH = os.path.join(
    os.environ.get('HATESPEECH_DATA_DIR', r'C:\Users\Bhavya\Desktop\HateSpeechDataset'),
    'glove2.csv',
)

# word -> float32 vector
embeddings_index = {}
with open(GLOVE_PATH, "r", encoding="utf-8") as f:
    for line in f:
        # Drop the line terminator so it does not stay glued to the last number.
        values = line.rstrip().split(' ')
        if len(values) < 2:
            # Blank line or a token without a vector: nothing usable to store.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# --- Tweet embeddings ----------------------------------------------------------
# Each tweet is represented by the mean of the pre-trained vectors of its words.
texts=df['cleaned_text4'].astype(str)

# Take the vector size from the loaded embeddings instead of assuming 300, so
# the all-zero fallback row always has the same width as the real rows.
embedding_dim = len(next(iter(embeddings_index.values()))) if embeddings_index else 300

embeddings = []
for text in texts:
    # Split into words: iterating over the string itself yields single
    # characters, which made every tweet the average of its letters' vectors.
    text_embedding = [embeddings_index[word] for word in text.split() if word in embeddings_index]
    if len(text_embedding) > 0:
        embeddings.append(np.mean(text_embedding, axis=0))
    else:
        # No known word (or empty tweet): fall back to a zero vector.
        embeddings.append(np.zeros(embedding_dim))
embeddings = np.array(embeddings)

# --- GloVe experiment ----------------------------------------------------------
# Train and score the candidate models on the averaged word-vector features.
y=df['class']
x_train,x_test,y_train,y_test=train_test_split(embeddings,y,test_size=0.3,stratify=y,random_state=51)

# Candidate models, keyed by the label used in the printed reports and the CSV.
models = dict()
# Fitted estimator -> test accuracy, for models eligible for the voting ensemble.
results=dict()
model_names,accuracies,precisions_w,recalls_w,f1_scores_w,precisions_m,recalls_m,f1_scores_m = [],[],[],[],[],[],[],[]
# The estimator is LogisticRegression; it was previously labelled
# "Linear Regression", which put the wrong model name into glove.csv.
models['Logistic Regression'] = LogisticRegression(max_iter=5000)
models['KMeans'] = KMeans(n_clusters=2, n_init=10, random_state=59)
# NOTE(review): 5000 trees here versus 500 in the BoW/TF-IDF runs - confirm
# this is intentional.
models['XGB']= XGBClassifier(n_estimators=5000)
models['Support Vector Machine'] = SVC(kernel = 'sigmoid', gamma='scale',probability=True)
models['Decision Tree'] = DecisionTreeClassifier(max_depth=100)
models['kNN'] = KNeighborsClassifier()
models['SGD']=SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=100, tol=None)

# Not eligible for soft voting: hinge-loss SGD has no predict_proba and KMeans
# is a clusterer, not a classifier. Both are still trained and reported.
SOFT_VOTE_EXCLUDED = {'SGD', 'KMeans'}

for name, model in models.items():
    model.fit(x_train,y_train)
    print('-'*20+name+'-'*20)
    y_pred=model.predict(x_test)
    if name not in SOFT_VOTE_EXCLUDED:
        results[model]=accuracy_score(y_test, y_pred)
    print(classification_report(y_test,y_pred,digits=5))
    model_names.append(name)
    accuracies.append(accuracy_score(y_test, y_pred))
    precisions_w.append(precision_score(y_test, y_pred,average='weighted'))
    recalls_w.append(recall_score(y_test, y_pred,average='weighted'))
    f1_scores_w.append(f1_score(y_test, y_pred,average='weighted'))
    precisions_m.append(precision_score(y_test, y_pred,average='macro'))
    recalls_m.append(recall_score(y_test, y_pred,average='macro'))
    f1_scores_m.append(f1_score(y_test, y_pred,average='macro'))

# Soft-voting ensemble over the three most accurate eligible models. The
# VotingClassifier re-fits its own copies of the estimators on the training set.
top_3_keys_tf = sorted(results, key=lambda k: results[k], reverse=True)[:3]
print("Top 3 keys with the highest values:")
print(top_3_keys_tf)
c1=top_3_keys_tf[0]
c2=top_3_keys_tf[1]
c3=top_3_keys_tf[2]
i='ensemble voting'
models[i] = VotingClassifier (estimators=[('clf1',c1), ('clf2',c2),('clf3',c3)], voting='soft')
models[i].fit(x_train, y_train)
# Predict once; the ensemble was previously run over the test set twice in a row.
y_pred = models[i].predict(x_test)
print(classification_report(y_test,y_pred,digits=5))
model_names.append(i)
accuracies.append(accuracy_score(y_test, y_pred))
precisions_w.append(precision_score(y_test, y_pred,average='weighted'))
recalls_w.append(recall_score(y_test, y_pred,average='weighted'))
f1_scores_w.append(f1_score(y_test, y_pred,average='weighted'))
precisions_m.append(precision_score(y_test, y_pred,average='macro'))
recalls_m.append(recall_score(y_test, y_pred,average='macro'))
f1_scores_m.append(f1_score(y_test, y_pred,average='macro'))

# One row per model (ensemble included), best accuracy first, saved to glove.csv.
metrics_df = pd.DataFrame({
    'Model Name': model_names,
    'Accuracy': accuracies,
    'Precision Weighted': precisions_w,
    'Recall Weighted': recalls_w,
    'F1-Score Weighted': f1_scores_w,
    'Precision Macro': precisions_m,
    'Recall Macro': recalls_m,
    'F1-Score Macro': f1_scores_m
})
metrics_df.sort_values(by=['Accuracy'], ascending=False, inplace=True)
metrics_df.to_csv("glove.csv", index=False)