In [None]:
%%capture
import re
import pandas as pd
import nltk
nltk.download('stopwords')

In [None]:
import tensorflow as tf
# Report how many GPU devices TensorFlow detects.
# `tf.config.list_physical_devices` is the stable name of the former
# `tf.config.experimental.list_physical_devices` alias.
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))

Num GPUs Available:  0


In [None]:
%%capture
# Get the original dataset.
# NOTE(review): the absolute /content/... paths below assume the notebook's working
# directory is /content — confirm before running outside that environment.
# Create the target folder for the raw CSV files (no error if it already exists).
!mkdir -p data
# Clone the repository that ships the compressed dataset.
!git clone 'https://github.com/shariqfz/Fake-News-Detection-Text-Classification.git'
# Unpack the archive into the current working directory (presumably creating ./FakeNewsData).
!tar -zxvf '/content/Fake-News-Detection-Text-Classification/data/FakeNewsData.tar.gz'
# Move the two CSV files into ./data.
!mv ./FakeNewsData/Fake.csv ./FakeNewsData/True.csv ./data
# Delete the extracted folder and the /content/sample_data directory.
!rm -r /content/FakeNewsData /content/sample_data

In [None]:
# Load the two raw CSV files: one frame of real articles, one of fake articles.
real_set = pd.read_csv('/content/data/True.csv')
fake_set = pd.read_csv('/content/data/Fake.csv')

In [None]:
fake_set.head(3)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"


In [None]:
# Class names, positioned so that the list index equals the integer label.
CLASS_NAMES = ["Fake", "Real"]
# Integer label -> class name: {0: "Fake", 1: "Real"}.
MAP_LABLE_TO_CLASS = dict(enumerate(CLASS_NAMES))

# Tag every row of each frame with its integer class label.
fake_set["Label"] = 0
real_set["Label"] = 1

# Stack the real articles first and the fake ones after them, under a fresh 0..n-1 index.
combined_set = pd.concat([real_set, fake_set], ignore_index=True)
# Show the rows at positions 21416 and 21417.
combined_set.iloc[21416:21418]

Unnamed: 0,title,text,subject,date,Label
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",1
21417,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0


In [None]:
# Texts in real_set start with a dateline such as 'WASHINGTON (Reuters) - ', whereas texts in fake_set do not.
# A classifier could use that prefix as a shortcut to the label (a data leak), so it is stripped from the 'text' column.

# Matches only a short leading dateline such as 'WASHINGTON (Reuters) - ':
#   - at most 80 non-parenthesis characters (the city part, may be empty),
#   - one parenthesised group of at most 40 characters (the agency name),
#   - then whitespace, a hyphen and whitespace.
# The bounds keep the match at the very start of the sample. The previous unbounded
# pattern (r'^.*?\(.*?\)\s-\s') also matched a '(...) - ' sequence appearing anywhere
# later on the first line and deleted all the text in front of it.
_DATELINE_PATTERN = re.compile(r'^[^()\n]{0,80}\([^()\n]{0,40}\)\s-\s')


def remove_city_and_media_house_name(sample):
  """Strip a leading '<CITY> (<AGENCY>) - ' dateline from one text sample.

  Args:
    sample: the article text as a string.

  Returns:
    The text without the leading dateline; the input is returned unchanged when it
    does not start with one.
  """
  return _DATELINE_PATTERN.sub('', sample, count=1)

# Run the dateline removal over every article body.
combined_set['text'] = combined_set['text'].map(remove_city_and_media_house_name)

def merge_title_and_text(df):
  """Prefix every article body with its title.

  The 'text' column of `df` is overwritten in place with
  '<title>. <text>'; the same DataFrame object is returned.
  """
  titled_prefix = df['title'] + '. '
  merged_text = titled_prefix + df['text']
  df['text'] = merged_text
  return df

# Fold each title into its article body.
combined_set = merge_title_and_text(combined_set)

# Discard the columns that are no longer needed.
combined_set = combined_set.drop(columns=["title", "subject", "date"])
combined_set.head(3)

Unnamed: 0,text,Label
0,"As U.S. budget fight looms, Republicans flip t...",1
1,U.S. military to accept transgender recruits o...,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,1


In [None]:
def remove_URLs(text):
  """Delete web links from `text`.

  Two passes: first anything starting with http://, https:// or www.
  (up to the next whitespace), then pic.twitter.com/<id> links.
  """
  without_web_links = re.sub(r'https?://\S+|www\.\S+', '', text)
  return re.sub(r"pic\.twitter\.com/[a-zA-Z0-9_]+", '', without_web_links)

# Strip links from every article.
combined_set['text'] = combined_set['text'].map(remove_URLs)

def remove_html(text):
  """Return `text` with every '<...>' tag-like span removed (shortest match, single line)."""
  return re.sub(r'<.*?>', '', text)

# Strip HTML tags from every article.
combined_set['text'] = combined_set['text'].map(remove_html)

def remove_symbols_and_numerals(text):
  """Delete punctuation/symbols and digits from `text`.

  Pass 1 drops every character that is neither a word character nor
  whitespace (so '#', '@', ',' ... go, while '_' stays); pass 2 drops digits.
  Characters are deleted, not replaced by a space.
  """
  for pattern in (r"[^\w\s]", r"\d"):
    text = re.sub(pattern, '', text)
  return text

# Strip symbols and digits from every article.
combined_set['text'] = combined_set['text'].map(remove_symbols_and_numerals)


In [None]:
# Before stemming and stop-word removal:
combined_set.loc[21419, 'text']

' Sheriff David Clarke Becomes An Internet Joke For Threatening To Poke People In The Eye On Friday it was revealed that former Milwaukee Sheriff David Clarke who was being considered for Homeland Security Secretary in Donald Trump s administration has an email scandal of his ownIn January there was a brief runin on a plane between Clarke and fellow passenger Dan Black who he later had detained by the police for no reason whatsoever except that maybe his feelings were hurt Clarke messaged the police to stop Black after he deplaned and now a search warrant has been executed by the FBI to see the exchangesClarke is calling it fake news even though copies of the search warrant are on the Internet I am UNINTIMIDATED by lib media attempts to smear and discredit me with their FAKE NEWS reports designed to silence me  the former sheriff tweeted  I will continue to poke them in the eye with a sharp stick and bitch slap these scum bags til they get it I have been attacked by better people than 

In [None]:
# Stemming text using nltk
def stemming_words(text):
    """Porter-stem every whitespace-separated token of `text` and re-join with single spaces."""
    stemmer = nltk.porter.PorterStemmer()
    stemmed_tokens = (stemmer.stem(token) for token in text.split())
    return ' '.join(stemmed_tokens)

# Stem every article.
combined_set['text'] = combined_set['text'].map(stemming_words)

In [None]:
# Remove stopwords
# By the time this set is used, the corpus has already had its punctuation deleted
# and its tokens Porter-stemmed. The raw NLTK list alone therefore misses many stop
# words: "was", "has", "his" appear in the text as "wa", "ha", "hi", and "don't"
# appears as "dont". The list is put through the same normalisation so those
# variants are matched as well; the original entries are kept.
_stopword_stemmer = nltk.stem.PorterStemmer()
_raw_stopwords = nltk.corpus.stopwords.words("english")

nltkstopwords = set(_raw_stopwords)
for _stopword in _raw_stopwords:
    # Same punctuation pattern as remove_symbols_and_numerals.
    _unpunctuated = re.sub(r"[^\w\s]", '', _stopword)
    nltkstopwords.add(_unpunctuated)
    nltkstopwords.add(_stopword_stemmer.stem(_unpunctuated))

def remove_stopwords(text):
    """Lower-case every token of `text` and drop those found in `nltkstopwords`.

    Returns the surviving lower-cased tokens joined by single spaces.
    """
    kept_tokens = []
    for token in text.split():
        lowered = token.lower()
        if lowered in nltkstopwords:
            continue
        kept_tokens.append(lowered)
    return " ".join(kept_tokens)

# Drop stop words from every article.
combined_set['text'] = combined_set['text'].map(remove_stopwords)

In [None]:
# Persist the preprocessed frame as CSV in the current working directory.
# The row index is written too (pandas default), as the first, unnamed column.
combined_set.to_csv(path_or_buf="stemmed_and_stopword_removed_combined_set.csv")

In [None]:
# Read the preprocessed data.
# NOTE(review): this loads the copy stored in the cloned repository, not the file
# written by the previous cell — confirm that this is intended.
combined_set = pd.read_csv(
    filepath_or_buffer="/content/Fake-News-Detection-Text-Classification/preprocessed_data/stemmed_and_stopword_removed_combined_set.csv"
)

In [None]:
# After stemming and stop-word removal
combined_set.loc[21419, 'text']

'sheriff david clark becom internet joke threaten poke peopl eye friday wa reveal former milwauke sheriff david clark wa consid homeland secur secretari donald trump administr ha email scandal hi ownin januari wa brief runin plane clark fellow passeng dan black later detain polic reason whatsoev except mayb hi feel hurt clark messag polic stop black deplan search warrant ha execut fbi see exchangesclark call fake news even though copi search warrant internet unintimid lib media attempt smear discredit fake new report design silenc former sheriff tweet continu poke eye sharp stick bitch slap scum bag til get attack better peopl maga unintimid lib media attempt smear discredit fake new report design silenc continu poke eye sharp stick bitch slap scum bag til get attack better peopl maga david clark jr sheriffclark decemb stop therebreak new ly lib media make fake new smear antidot go right punch nose make tast blood noth get bulli like ly lib media attent better give tast blood neverback

## BERT HATE SPEECH DETECTOR

In [None]:
# Model Callbacks
model_name = "BERTHateSpeechDetector"
# ModelCheckpoint is reached through `tf.keras.callbacks` because the bare name is
# not imported in the visible cells; `tf` is imported near the top of the notebook.
# save_best_only=True: only the best model so far (by the monitored metric) is kept.
# NOTE(review): Keras 3 rejects checkpoint paths without a `.keras`/`.h5` suffix —
# confirm the installed TensorFlow/Keras version accepts `model_name` as a path.
MODEL_CALLBACKS = [tf.keras.callbacks.ModelCheckpoint(model_name, save_best_only=True)]

NameError: ignored