In [1]:
%%capture
import html
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
nltk.download('stopwords')

In [2]:
%%capture
# Get the original dataset: clone the GitHub repository that ships the hate-speech CSV
# into the current working directory. The %%capture cell magic above silences the
# output of both shell commands.
# NOTE(review): the `data` directory created below is not referenced by any other
# visible cell — confirm it is still needed.
!mkdir -p data
!git clone 'https://github.com/shariqfz/Fake-News-Detection-Text-Classification.git'

In [3]:
# The path is relative to the working directory, which is where the repository is
# cloned; a hard-coded "/content/..." prefix only resolves on Google Colab.
hate_speech_path = "Fake-News-Detection-Text-Classification/data/HateSpeech/hate_speech_data.csv"
data = pd.read_csv(hate_speech_path)
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [4]:
"""
class meanings:

0 - hate speech
1 - offensive language
2 - neither

"""

'\nclass meanings:\n  \n0 - hate speech\n1 - offensive language\n2 - neither\n\n'

In [5]:
# Number of rows in the raw dataset.
data.shape[0]

24783

In [6]:
# Tally how many rows carry each class label; position k holds the count for class k.
arr = [0, 0, 0]
for label in data['class'].to_numpy():
  arr[label] = arr[label] + 1
arr

[1430, 19190, 4163]

In [7]:
# Build a class-balanced subset: keep the first 1400 rows of every class (in file
# order) and only the tweet text plus its label.
class_0_df = data.loc[data['class'] == 0, ["tweet", "class"]].head(1400)
class_1_df = data.loc[data['class'] == 1, ["tweet", "class"]].head(1400)
class_2_df = data.loc[data['class'] == 2, ["tweet", "class"]].head(1400)

# ignore_index=True renumbers the stacked rows 0..n-1 and discards the original row labels.
combined_data = pd.concat([class_0_df, class_1_df, class_2_df], ignore_index=True)
print(len(combined_data))
combined_data.head()

4200


Unnamed: 0,tweet,class
0,"""@Blackman38Tide: @WhaleLookyHere @HowdyDowdy1...",0
1,"""@CB_Baby24: @white_thunduh alsarabsss"" hes a ...",0
2,"""@DevilGrimz: @VigxRArts you're fucking gay, b...",0
3,"""@MarkRoundtreeJr: LMFAOOOO I HATE BLACK PEOPL...",0
4,"""@NoChillPaz: ""At least I'm not a nigger"" http...",0


In [8]:
# Data cleaning functions
def remove_twitter_handles(text):
    """Return ``text`` with every ``@`` mention deleted.

    A mention is an ``@`` followed by one or more word characters. Only the
    mention itself is removed; surrounding whitespace and punctuation stay.
    """
    handle_pattern = re.compile(r'@\w+')
    return handle_pattern.sub('', text)

# Strip @mentions from every tweet.
combined_data["tweet"] = combined_data["tweet"].apply(remove_twitter_handles)

def remove_URLs(text):
  """Return ``text`` with web links deleted.

  The patterns are applied one after another: first ``http://``, ``https://`` and
  ``www.`` links (up to the next whitespace), then ``pic.twitter.com/...`` links.
  """
  link_patterns = (
      r'https?://\S+|www\.\S+',            # http(s):// and www. links
      r"pic\.twitter\.com/[a-zA-Z0-9_]+",  # pic.twitter.com image links
  )
  for pattern in link_patterns:
    text = re.sub(pattern, '', text)
  return text

# Strip links from every tweet.
combined_data["tweet"] = combined_data["tweet"].apply(remove_URLs)

def remove_html(text):
  """Strip HTML markup from ``text``.

  Tags (everything from ``<`` to the next ``>`` on the same line) are deleted, then
  HTML character references such as ``&amp;`` or ``&#8220;`` are decoded to the
  characters they stand for. Without the decoding step, a later filter that drops
  punctuation and digits turns ``&amp;`` into the junk token ``amp``.

  Args:
    text: Raw tweet text.

  Returns:
    The text without tags and with character references decoded.
  """
  without_tags = re.sub(r'<.*?>', '', text)
  # Decode only after stripping tags so an escaped "&lt;b&gt;" is not mistaken for markup.
  return html.unescape(without_tags)

# Strip HTML tags from every tweet.
combined_data["tweet"] = combined_data["tweet"].apply(remove_html)

def remove_symbols_and_numerals(text):
  """Delete punctuation, symbols (including ``#``, ``@`` and ``_``) and digits.

  Matched characters are removed, not replaced by a space, so ``"don't"`` becomes
  ``"dont"``. Letters and whitespace are kept unchanged.

  Args:
    text: Text to clean.

  Returns:
    The text containing only letters and whitespace.
  """
  # `\w` treats "_" and digits as word characters, so `[^\w\s]` alone leaves them
  # in place; they are listed explicitly in the second alternative.
  return re.sub(r"[^\w\s]|[\d_]", '', text)

# Strip punctuation, symbols and digits from every tweet.
combined_data["tweet"] = combined_data["tweet"].apply(remove_symbols_and_numerals)

# Compiled once at definition time instead of on every call.
# Only symbol blocks are listed. A single wide range such as U+24C2-U+1F251 must not
# be used here: it also spans CJK ideographs, kana, Hangul and other scripts, so it
# would delete ordinary non-Latin text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters, regional-indicator flags
    "\u24C2"                 # circled M
    "\u25A0-\u25FF"          # geometric shapes
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2702-\u27B0"          # dingbats
    "\u2B00-\u2BFF"          # miscellaneous symbols and arrows
    "\uFE0F"                 # variation selector-16 (emoji presentation)
    "]+"
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols deleted.

    Letters of every script (Latin, CJK, Hangul, ...) are left untouched; only
    code points inside the symbol ranges of ``_EMOJI_PATTERN`` are removed.

    Args:
        text: Text to clean.

    Returns:
        The text without the matched symbols; surrounding whitespace is kept.
    """
    return _EMOJI_PATTERN.sub("", text)

# Strip emoji from every tweet.
combined_data["tweet"] = combined_data["tweet"].apply(remove_emoji)

# Remove stopwords
# English stop-word list from NLTK, held in a set for fast membership checks.
nltkstopwords = set(nltk.corpus.stopwords.words("english"))


def remove_stopwords(text):
    """Lower-case ``text`` and drop every token found in ``nltkstopwords``.

    The text is split on whitespace; the surviving tokens are returned in lower
    case, joined by single spaces.
    """
    kept_tokens = []
    for token in text.split():
        lowered = token.lower()
        if lowered not in nltkstopwords:
            kept_tokens.append(lowered)
    return " ".join(kept_tokens)

# Lower-case every tweet and drop its stop words.
combined_data["tweet"] = combined_data["tweet"].apply(remove_stopwords)

# Shuffle the rows so the classes are interleaved. The fixed seed makes the row
# order, and therefore the saved file, identical on every run.
combined_data = combined_data.sample(frac=1, random_state=42).reset_index(drop=True)
combined_data.head()

Unnamed: 0,tweet,class
0,niggas always yelling fuck bitches get money m...,1
1,heard looking candy man bitch,1
2,awww meat meat feelinsi ainn n feelings bitch,1
3,well gahdamn rt wooooow good job mommy cutest ...,0
4,rt black guy school asked colored printers lib...,0


In [9]:
# Save the cleaned DataFrame to a CSV file.
# index=False: the row index is just 0..n-1 and carries no information; writing it
# adds an unnamed first column that comes back as "Unnamed: 0" when the file is read.
combined_data.to_csv("hate_speech_cleaned_and_stopword_removed_combined_data.csv", index=False)