## Initializations

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,confusion_matrix,accuracy_score
import numpy as np
import pandas as pd

In [None]:
from bs4 import BeautifulSoup
import requests
import string

In [None]:
# HTTP headers passed as `headers=` to the requests.get calls in this notebook.
# The only entry is a desktop Chrome user-agent string.
header = {
  "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36"
}

In [None]:
from google.colab import auth
# Authenticate the notebook user (google.colab helper).
auth.authenticate_user()

# Project id interpolated into the Cloud console URL printed by uploadFile.
project_id = 'proven-signal-345720'

## Common Functions

In [None]:
# Upload Function
from googleapiclient.http import MediaFileUpload
from googleapiclient.discovery import build
# Client for the 'storage' API, version 'v1'; used by uploadFile.
gcs_service = build('storage', 'v1')

# Bucket that uploadFile inserts objects into.
bucket_name = 'website_crawling'

def uploadFile(fileName):
  """Upload ``/content/<fileName>`` to the ``bucket_name`` bucket.

  The file is sent as ``text/plain`` through a resumable upload and stored
  under the object name ``fileName``. When the upload finishes, a confirmation
  and the Cloud console URL for ``project_id`` are printed.

  Args:
    fileName: name of the file inside ``/content/``; also the object name.
  """
  upload_body = MediaFileUpload('/content/' + fileName,
                                mimetype='text/plain',
                                resumable=True)
  insert_request = gcs_service.objects().insert(bucket=bucket_name,
                                                name=fileName,
                                                media_body=upload_body)

  # next_chunk() yields (progress, response); the response stays None until
  # the last chunk has been sent. Progress is ignored because files are small.
  while True:
    _progress, result = insert_request.next_chunk()
    if result is not None:
      break

  print('Upload complete')
  print('https://console.cloud.google.com/storage/browser?project={}'.format(project_id))

In [None]:
# Sample news paragraph used to try out the text-cleaning helpers below.
txt = "AI will completely automate the network within five years, Juniper CEO Rami Rahim boasted during the company’s Global Summit this week. “I truly believe that just as there is this need today for a self-driving automobile, the future is around a self-driving network where humans literally have to do nothing,” he said. “It's probably weird for people to hear the CEO of a networking company say that… but that's exactly what we should be wishing for.” Rahim believes AI-driven automation is the latest phase in computer networking’s evolution, which began with the rise of TCP/IP and the internet, was accelerated by faster and more efficient silicon, and then made manageable by advances in software. “Collectively, we as an industry are sitting on a goldmine of information, and yet it is untapped,” Rahim said. “We as a planet are accumulating data at an unprecedented rate. Something like 90 percent of all the data in the world today has been accumulated in just the last two years alone.”"


In [None]:
import nltk
# NLTK resources needed by word_tokenize and the stop-word list.
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from nltk.tokenize import sent_tokenize, word_tokenize
# from nltk.stem import PorterStemmer
# porter=PorterStemmer()

# English Snowball stemmer shared by stemSentence.
from nltk.stem.snowball import SnowballStemmer
englishStemmer=SnowballStemmer("english")

# WordNet data is required by the lemmatizer used in stemSentence.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Keras tokenizer used by tokenize() to build word counts.
from keras.preprocessing.text import Tokenizer

# remove stopwords
def removeStopWords(sentence):
  """Lower-case *sentence*, drop English stop words and strip punctuation/digits.

  The text is split with ``word_tokenize``. Tokens whose lower-cased form is
  in NLTK's English stop-word list are discarded. Every remaining token is
  emitted lower-cased and followed by one space (the last one included), and
  finally all ``string.punctuation`` characters, the curly quotes ``“”’`` and
  the digits 0-9 are deleted from the result.

  Args:
    sentence: text to clean.

  Returns:
    The cleaned, space-separated string.
  """
  english_stops = set(stopwords.words('english'))

  kept = []
  for token in word_tokenize(sentence):
    lowered = token.lower()
    if lowered not in english_stops:
      kept.append(lowered)

  # Each kept token carries its own trailing space.
  spaced = "".join(tok + " " for tok in kept)

  strip_table = str.maketrans("", "", string.punctuation + "“”’0123456789")
  return spaced.translate(strip_table)


# tokenize the sentence
def tokenize(sentence):
  """Build a word -> occurrence-count mapping for the first text in *sentence*.

  A fresh Keras ``Tokenizer`` is fitted on *sentence*; every word in its
  ``word_index`` is then counted in the integer sequence of the FIRST text
  only, so callers are expected to pass a single-element list.

  Args:
    sentence: list of texts (callers in this notebook pass ``[text]``).

  Returns:
    dict mapping each word to how often it occurs in the first text.
  """
  keras_tokenizer = Tokenizer()
  keras_tokenizer.fit_on_texts(sentence)
  encoded = keras_tokenizer.texts_to_sequences(sentence)

  # encoded[0] is only touched when word_index is non-empty.
  return {
      word: encoded[0].count(index)
      for word, index in keras_tokenizer.word_index.items()
  }

# print(tokenize())

def stemSentence(sentence):
  """Lemmatize (as nouns) and then Snowball-stem every token of *sentence*.

  Args:
    sentence: text to normalise; it is split with ``word_tokenize``.

  Returns:
    The stems concatenated into one string, each followed by a single space.
  """
  def _normalize(token):
    lemma = wordnet_lemmatizer.lemmatize(token, pos="n")
    return englishStemmer.stem(lemma)

  return "".join(_normalize(tok) + " " for tok in word_tokenize(sentence))

def mostFrequent(List):
  """Return the element that occurs most often in *List*.

  Ties are resolved in favour of the element whose first occurrence comes
  earliest in *List*.

  Args:
    List: non-empty list (or tuple) of items.

  Returns:
    The most frequent item.

  Raises:
    IndexError: if *List* is empty.
  """
  from collections import Counter

  if not List:
    raise IndexError("mostFrequent() called with an empty sequence")

  try:
    # One pass instead of calling List.count() for every element.
    counts = Counter(List)
  except TypeError:
    # Unhashable items (e.g. lists) cannot be keyed in a Counter; fall back to
    # the quadratic scan. max() keeps the first maximal element.
    return max(List, key=List.count)

  # Counter preserves first-insertion order and max() returns the first
  # maximal key, which gives the documented tie-breaking.
  return max(counts, key=counts.get)

def mostFrequentList(List):
  """Return every element that shares the highest occurrence count in *List*.

  Args:
    List: list (or tuple) of items; may be empty.

  Returns:
    The distinct most-frequent items, ordered by first occurrence. An empty
    input yields an empty list.
  """
  from collections import Counter

  try:
    # One pass instead of calling List.count() for every element.
    counts = Counter(List)
  except TypeError:
    # Unhashable items: fall back to the quadratic scan.
    top = max((List.count(item) for item in List), default=0)
    winners = []
    for item in List:
      if List.count(item) == top and item not in winners:
        winners.append(item)
    return winners

  if not counts:
    return []

  top = max(counts.values())
  # Counter iterates in first-insertion order.
  return [item for item, seen in counts.items() if seen == top]

def scrapeAll(url):
  """Scrape headline links from *url* using the page's most common anchor class.

  The page is fetched with the shared ``header``. The first class name of
  every ``<a>`` that has a class and non-blank text is collected, the most
  frequent one is taken as the "headline" class, and all anchors carrying
  that class are returned.

  Args:
    url: address of the listing page.

  Returns:
    A list of ``{"header": <anchor text>, "link": <absolute URL>}`` dicts.
    The list is empty when no anchor has both a class and visible text.
    Anchors without an ``href`` are skipped.
  """
  from urllib.parse import urljoin

  html_text = requests.get(url, headers = header).text
  soup = BeautifulSoup(html_text, "lxml")

  classListArr = [
      anchor.attrs.get("class")[0]
      for anchor in soup.find_all("a")
      if anchor.attrs.get("class") and len(anchor.text.strip()) > 0
  ]
  if not classListArr:
    # mostFrequent() cannot pick a class from an empty list.
    return []

  commonClass = mostFrequent(classListArr)

  newsList = []
  for news in soup.find_all("a", class_ = commonClass):
    newsLink = news.get("href")
    if newsLink is None:
      continue
    newsList.append({
        "header": news.text.strip(),
        # urljoin leaves absolute links alone and resolves root-relative,
        # document-relative and protocol-relative ("//host/x") links.
        "link": urljoin(url, newsLink),
    })

  return newsList

def take_second(elem):
    """Sort-key helper: return the item at index 1 of *elem*."""
    second = elem[1]
    return second

def sortWordOccurences(sentence):
  """Return the words of *sentence* ranked by how often they occur.

  Stop words, punctuation and digits are removed first (``removeStopWords``
  followed by a second punctuation/digit strip), the remaining text is counted
  with ``tokenize`` and the result is sorted by count, highest first.

  Args:
    sentence: raw text.

  Returns:
    A list of ``(word, count)`` tuples in descending count order.
  """
  cleaned = removeStopWords(sentence)
  strip_table = str.maketrans("", "", string.punctuation + "“”’0123456789")
  counts = tokenize([cleaned.translate(strip_table)])

  ranked = list(counts.items())
  ranked.sort(key = take_second, reverse = True)
  return ranked

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# x = stemSentence("This, however has a flaw. If we are computing probability for a word which is in our vocabulary V but not in a specific class, the probability for that pair will be 0. But since we multiply all feature likelihoods together, zero probabilities will cause the probability of the entire class to be zero as well. For this we need to add a smoothing technique. Smoothing techniques are popular in the language processing algorithms. Without getting too much into them, the technique we will be using is the Laplace one which consists in adding + 1 to our calculations. The formula will end up looking like this:")
# x = x.translate(x.maketrans("", "", string.punctuation + "“”0123456789"))
# x.split(" ")

In [None]:
def tokenizeNewsText(url):
  """Fetch *url* and return word counts for the text of its ``<p>`` elements.

  Paragraph texts are joined with spaces, stripped of punctuation, curly
  double quotes and digits, passed through ``removeStopWords`` and
  ``stemSentence``, and finally counted with ``tokenize``.

  Args:
    url: address of the article page.

  Returns:
    dict mapping each stem to its number of occurrences.
  """
  page = requests.get(url, headers = header).text
  paragraphs = BeautifulSoup(page, "lxml").find_all("p")

  # Every paragraph is followed by one space, the last one included.
  joined = "".join(p.text + " " for p in paragraphs)
  joined = joined.translate(str.maketrans("", "", string.punctuation + "“”0123456789"))

  return tokenize([stemSentence(removeStopWords(joined))])

  # print(stemSentence(sentence))
  # print("\n" + sentence)
  # print("\n\n", myStr)

In [None]:
import re

def cleanHtml(sentence):
    """Replace every ``<...>`` span in *sentence* with a single space.

    The match is non-greedy, so each tag-like span is replaced on its own.
    """
    return re.sub('<.*?>', ' ', sentence)

def cleanPunc(sentence):
    """Strip punctuation and special characters from *sentence*.

    ``? ! ' " #`` and ``|`` are deleted; ``. , ( ) /`` and ``|`` are replaced
    by a space. The result is then stripped of surrounding whitespace and any
    remaining newline is turned into a space.

    NOTE(review): ``\\|`` inside the second character class is an escaped
    pipe, so a literal backslash is not matched — confirm that is intended.
    """
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    spaced = re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
    return spaced.strip().replace("\n", " ")

def keepAlpha(sentence):
    """Keep only ASCII letters in *sentence*.

    The text is split on whitespace; within each word every run of characters
    other than ``a-z``, ``A-Z`` or space is replaced by one space. The words
    are re-joined with single spaces and the result is stripped.
    """
    cleaned_words = (re.sub('[^a-z A-Z]+', ' ', word) for word in sentence.split())
    return " ".join(cleaned_words).strip()

def processText(text):
  """Lower-case *text* and run it through cleanHtml, cleanPunc and keepAlpha.

  The cleaned text is printed and also returned.
  """
  text = keepAlpha(cleanPunc(cleanHtml(text.lower())))
  print(text)
  return text

# Try both cleaning routes on the sample paragraph: processText prints its own
# result, removeStopWords' result is printed here.
processText(txt)
print(removeStopWords(txt.lower()))

ai will completely automate the network within five years juniper ceo rami rahim boasted during the company s global summit this week  i truly believe that just as there is this need today for a self driving automobile the future is around a self driving network where humans literally have to do nothing   he said  its probably weird for people to hear the ceo of a networking company say that  but thats exactly what we should be wishing for   rahim believes ai driven automation is the latest phase in computer networking s evolution which began with the rise of tcp ip and the internet was accelerated by faster and more efficient silicon and then made manageable by advances in software  collectively we as an industry are sitting on a goldmine of information and yet it is untapped   rahim said  we as a planet are accumulating data at an unprecedented rate something like   percent of all the data in the world today has been accumulated in just the last two years alone
ai completely automate

## DARKREADING

In [None]:
# DARKREADING.COM

def scrapeDarkReading():
  """Return title/link pairs for the ``story_link`` anchors of a listing page.

  NOTE(review): despite the function name, the page requested is
  ``https://www.theregister.com/security/`` — confirm which site is intended.

  Returns:
    A list of ``{"title": <anchor text>, "link": <absolute URL>}`` dicts. The
    title is the raw anchor text (not stripped).
  """
  page = requests.get('https://www.theregister.com/security/', headers = header).text
  anchors = BeautifulSoup(page, "lxml").find_all("a", class_ = "story_link")

  return [
      {"title": anchor.text, "link": "https://www.theregister.com" + anchor["href"]}
      for anchor in anchors
  ]

# Quick check: print the scraped title/link dicts.
print(scrapeDarkReading())

[{'title': '\n\n\nSo, the US, China, and Russia walk into an infosec conference\nSuffice to say things got a little awkward\n\nCSO19 Oct 2022 | 2\n\n', 'link': 'https://www.theregister.com/2022/10/19/singapore_international_cyber_week/'}, {'title': '\n\n\nTear in Microsoft Azure Service Fabric can give attackers full admin privileges\nOrca Security disclosed the bug, and older versions remain vulnerable\n\nResearch19 Oct 2022 | \n\n', 'link': 'https://www.theregister.com/2022/10/19/azure_service_fabric_vulnerability/'}, {'title': '\n\n\nThe infinite beauty of the hive mind\n\nWebinar Looking at the future of crowdsourced security\nWebinar\n\n', 'link': 'https://www.theregister.com/2022/10/19/the_infinite_beauty_of_the/'}, {'title': "\n\n\nGermany stands down cyber boss over Russian ties\nInvolvement with lobby group that welcomed Putin's pals presses buttons\n\nSecurity19 Oct 2022 | 8\n\n", 'link': 'https://www.theregister.com/2022/10/19/germany_stands_down_cyber_boss/'}, {'title': "\n

In [None]:
# Exploratory scrape of the Check Point research front page: collect one
# {"header", "link"} dict per anchor that has an href.
from urllib.parse import urljoin

url = "https://research.checkpoint.com/"
mainLink = url.split("/")[2]

htmlText = requests.get(url, headers = header).text
soup = BeautifulSoup(htmlText, "lxml")
print(soup)

tags = soup.find_all("a")

news = []

for tag in tags:
  link = tag.get("href")
  if link is None:
    # tag["href"] would raise KeyError for anchors without a target.
    continue

  # The loop variable must not be called `header`: that would overwrite the
  # global request-headers dict used by every requests.get call.
  # Assumes the headline sits in an <h3> inside the anchor — TODO confirm
  # against the page markup (an "h3" *attribute* never exists on <a>).
  h3 = tag.find("h3")
  headline = h3.text.strip() if h3 is not None else None
  print(headline)

  news.append({
      "header": headline,
      # Resolves protocol-relative ("//host/x") and relative links; absolute
      # links are returned unchanged.
      "link": urljoin(url, link),
  })



# news = [l for l in news if l.find(mainLink) != -1]

# print(news)

## Data Frame Initialization

In [None]:
# Numeric encoding of the two category labels.
categoryMap = {"security": 1, "non-security": 0}

# df = pd.DataFrame(columns = ["title", "first two paragraph", "url", "date", "stemmed text", "category"])
# Accumulator that the per-site scraping loops below add their rows to.
mainDf = pd.DataFrame()

## Bleeping Computer

In [None]:
# BLEEPINGCOMPUTER.COM

def tokenizeBleepingComputer(url):
  """Fetch a BleepingComputer article and return its stemmed text.

  The stripped texts of all ``div.articleBody`` elements are concatenated,
  cleared of punctuation, curly double quotes and digits, then passed through
  ``removeStopWords`` and ``stemSentence``.

  Args:
    url: address of the article.

  Returns:
    The stemmed article text as one space-separated string.
  """
  page = requests.get(url, headers = header).text
  bodies = BeautifulSoup(page, "lxml").find_all("div", class_ = "articleBody")

  article = "".join(block.text.strip() for block in bodies)
  article = article.translate(str.maketrans("", "", string.punctuation + "“”0123456789"))

  return stemSentence(removeStopWords(article))
  # return sentence.split(" ")
  # return tokenize([sentence])

def scrapeBleepingComputer(page):
  """Scrape one BleepingComputer listing page into a DataFrame.

  Args:
    page: path appended to ``https://bleepingcomputer.com/`` (``""`` for the
      front page, ``"page/<n>/"`` for later pages).

  Returns:
    A DataFrame with one row per ``div.bc_latest_news_text`` entry and the
    columns ``title``, ``first paragraph``, ``url``, ``date``, ``category``
    (``"security"`` when the entry's category link reads "Security", otherwise
    ``"non-security"``) and ``stemmed text``. Empty when nothing matches.
  """
  html_text = requests.get('https://bleepingcomputer.com/' + page, headers = header).text
  soup = BeautifulSoup(html_text, "lxml")

  rows = []
  for news in soup.find_all("div", class_ = "bc_latest_news_text"):
    title = news.h4.a
    title_link = title["href"]
    rows.append({
        "title": title.text,
        "first paragraph": news.p.text,
        "url": title_link,
        "date": news.ul.contents[2].text,
        "category": "security" if news.div.a.text == "Security" else "non-security",
        # One extra request per article to fetch and stem its body.
        "stemmed text": tokenizeBleepingComputer(title_link),
    })

  # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
  # every row; build the frame once from the collected dicts instead.
  return pd.DataFrame(rows)

In [None]:
# Scrape 100 listing pages: the front page has no path suffix, later pages
# live under "page/<n>/" for n = 2..100.
for i in range(100):
  page = "" if i == 0 else "page/" + str(i + 1) + "/"
  df = scrapeBleepingComputer(page)
  # DataFrame.append was removed in pandas 2.0. Concatenating per page keeps
  # the rows gathered so far in mainDf if a later request fails.
  mainDf = pd.concat([mainDf, df], ignore_index = True)

## The Register

In [None]:
# THEREGISTER.COM

def tokenizeTheRegister(url):
  """Fetch a Register article and return its stemmed text.

  The stripped texts of all ``div#body`` elements are concatenated, cleared of
  punctuation, curly double quotes and digits, then passed through
  ``removeStopWords`` and ``stemSentence``.

  Args:
    url: address of the article.

  Returns:
    The stemmed article text as one space-separated string.
  """
  page = requests.get(url, headers = header).text
  bodies = BeautifulSoup(page, "lxml").find_all("div", id = "body")

  article = "".join(block.text.strip() for block in bodies)
  article = article.translate(str.maketrans("", "", string.punctuation + "“”0123456789"))

  return stemSentence(removeStopWords(article))
  # return sentence.split(" ")
  # return tokenize([sentence])

def scrapeTheRegister(earlier):
  """Scrape one Register listing page into a DataFrame.

  Args:
    earlier: path appended to ``https://www.theregister.com/`` (``""`` for
      the front page, ``"earlier/<n>"`` for older pages).

  Returns:
    A DataFrame with one row per ``div.time_comments`` entry and the columns
    ``title``, ``first paragraph``, ``url``, ``date`` (``"No Date"`` when the
    entry has fewer than two spans), ``category`` and ``stemmed text``.
    ``category`` keeps the section name for Security, Cyber-crime, Patches,
    Research and CSO, and is ``"non-security"`` for everything else. Entries
    whose enclosing element has no ``href`` are skipped.
  """
  security_sections = {"Security", "Cyber-crime", "Patches", "Research", "CSO"}

  html_text = requests.get('https://www.theregister.com/' + earlier, headers = header).text
  soup = BeautifulSoup(html_text, "lxml")

  rows = []
  for new in soup.find_all("div", class_ = "time_comments"):
    card = new.parent
    href = card.parent.get("href")
    if href is None:
      # No link to follow, so there is no article to tokenize.
      continue
    article_url = "https://www.theregister.com" + href

    spans = new.find_all("span")
    # Guard against entries without any span (spans[0] would raise).
    section = spans[0].text if spans else ""

    rows.append({
        "title": card.h4.text,
        "first paragraph": card.div.text,
        "url": article_url,
        "date": spans[1].text if len(spans) > 1 else "No Date",
        "category": section if section in security_sections else "non-security",
        "stemmed text": tokenizeTheRegister(article_url),
    })

  # DataFrame.append was removed in pandas 2.0; build the frame in one go.
  return pd.DataFrame(rows)

In [None]:
# Scrape 100 listing pages: the front page has no path suffix, older pages
# live under "earlier/<n>" for n = 2..100.
for i in range(100):
  page = "" if i == 0 else "earlier/" + str(i + 1)
  df = scrapeTheRegister(page)
  # DataFrame.append was removed in pandas 2.0. Concatenating per page keeps
  # the rows gathered so far in mainDf if a later request fails.
  mainDf = pd.concat([mainDf, df], ignore_index = True)

## Security Affairs

In [None]:
# SECURITY AFFAIRS

def tokenizeSecurityAffairs(url):
  """Fetch a Security Affairs article and return its stemmed text.

  The ``<p>`` texts inside every ``div.post_inner_wrapper`` are joined with
  newlines. Recurring boilerplate (author name, social-media footer, award
  voting notice) is removed, punctuation, curly double quotes, digits, ``_``
  and ``-`` are stripped, and the result is passed through ``removeStopWords``
  and ``stemSentence``.

  Args:
    url: address of the article.

  Returns:
    The stemmed article text as one space-separated string.
  """
  boilerplate = (
      "Pierluigi",
      "Paganini",
      "Follow me on Twitter: @securityaffairs and Facebook",
      "Please vote for Security Affairs as the best European Cybersecurity Blogger Awards 2022 – VOTE FOR YOUR WINNERSVote for me in the sections “The Underdogs – Best Personal (non-commercial) Security Blog” and “The Tech Whizz – Best Technical Blog” and others of your choice.To nominate, please visit: https://docs.google.com/forms/d/e/1FAIpQLSfxxrxICiMZ9QM9iiPuMQIC-IoM-NpQMOsFZnJXrBQRYJGCOw/viewform",
  )

  page = requests.get(url, headers = header).text
  wrappers = BeautifulSoup(page, "lxml").find_all("div", class_ = "post_inner_wrapper")

  newsText = ""
  for wrapper in wrappers:
    newsText += "".join(p.text + "\n" for p in wrapper.find_all("p"))
    # Boilerplate is removed from the accumulated text after each wrapper.
    for phrase in boilerplate:
      newsText = newsText.replace(phrase, "")

  newsText = newsText.translate(str.maketrans("", "", string.punctuation + "“”0123456789_-''"))

  return stemSentence(removeStopWords(newsText))
  # return sentence.split(" ")
  # return tokenize([sentence])

def scrapeSecurityAffairs(page):
  """Scrape one Security Affairs listing page into a DataFrame.

  Args:
    page: path appended to ``https://securityaffairs.co/wordpress/`` (``""``
      for the first page, ``"page/<n>/"`` for later pages).

  Returns:
    A DataFrame with one row per ``div.post_wrapper`` and the columns
    ``title``, ``first paragraph``, ``url``, ``date``, ``category``
    (``"security"`` when "Security" is among the post's category links,
    otherwise ``"non-security"``) and ``stemmed text``.
  """
  html_text = requests.get('https://securityaffairs.co/wordpress/' + page, headers = header).text
  soup = BeautifulSoup(html_text, "lxml")

  rows = []
  for news in soup.find_all("div", class_ = "post_wrapper"):
    info = news.find_all("div", class_="post_detail large_space")
    # The first link of the detail row carries the date. Of the links that
    # follow it, the first is skipped and the rest are read as categories.
    date_link = info[0].a
    categories = [category.text for category in date_link.find_next_siblings("a")[1:]]

    title = news.h3.a
    title_link = title["href"]
    firstP = news.find("div", class_ = "post_wrapper_inner")

    rows.append({
        "title": title.text,
        "first paragraph": firstP.p.text.strip(),
        "url": title_link,
        "date": date_link.text,
        "category": "security" if "Security" in categories else "non-security",
        "stemmed text": tokenizeSecurityAffairs(title_link),
    })

  # DataFrame.append was removed in pandas 2.0; build the frame in one go.
  return pd.DataFrame(rows)

In [None]:
# Scrape 100 listing pages: the first page has no path suffix, later pages
# live under "page/<n>/" for n = 2..100.
for i in range(100):
  page = "" if i == 0 else "page/" + str(i + 1) + "/"
  df = scrapeSecurityAffairs(page)
  # DataFrame.append was removed in pandas 2.0. Concatenating per page keeps
  # the rows gathered so far in mainDf if a later request fails.
  mainDf = pd.concat([mainDf, df], ignore_index = True)

In [None]:
# Display the combined frame as the cell output.
mainDf

In [None]:
# Write the combined frame to df.csv without the index column.
mainDf.to_csv("df.csv", index=False)

## Packet Storm

In [None]:
# PACKET STORM

def tokenizeTheRegister(url):
  """Fetch *url* and return the raw text of its first ``<body>`` element.

  NOTE(review): this definition reuses the name of the Register tokenizer
  defined earlier in the notebook. Once this cell has run, scrapeTheRegister
  calls this version and receives unstemmed text — consider renaming it for
  Packet Storm.

  Args:
    url: address of the page.

  Returns:
    The text content of the page body. No stop-word removal or stemming is
    applied.
  """
  page = requests.get(url, headers = header).text
  bodies = BeautifulSoup(page, "lxml").find_all("body")
  return bodies[0].text

def scrapePacketStorm(earlier = ""):
  """Scrape one Packet Storm news page into a DataFrame.

  Args:
    earlier: path appended to ``https://packetstormsecurity.com/news/``.

  Returns:
    A DataFrame with one row per ``dl.news`` entry and the columns ``title``,
    ``url``, ``date`` and ``category``. ``category`` is the entry's tag texts
    joined with ``", "``, leaving out the tag "headline".
  """
  html_text = requests.get('https://packetstormsecurity.com/news/' + earlier, headers = header).text
  soup = BeautifulSoup(html_text, "lxml")

  rows = []
  for new in soup.find_all("dl", class_ = "news"):
    tag_links = new.find_all("dd", class_ = "tags")[0].find_all("a")
    rows.append({
        "title": new.dt.text,
        "url": "https://packetstormsecurity.com" + new.dt.a.get("href"),
        "date": new.find_all("dd", class_ = "datetime")[0].a.text,
        "category": ", ".join(a.text for a in tag_links if a.text != "headline"),
    })

  # DataFrame.append was removed in pandas 2.0; build the frame in one go.
  return pd.DataFrame(rows)

In [None]:
# Scrape the first Packet Storm news page and show it via the pandas Styler.
df = scrapePacketStorm("")
df.style

Scrape Packet Storm Categories & Mapping

In [None]:
def scrapeCategories(earlier = ""):
  """Return the tag names listed on the Packet Storm news-tags page.

  Args:
    earlier: path appended to ``https://packetstormsecurity.com/news/tags/``.

  Returns:
    The link text of every ``<li>`` inside the page's ``div.static``
    elements, in document order.
  """
  page = requests.get('https://packetstormsecurity.com/news/tags/' + earlier, headers = header).text
  containers = BeautifulSoup(page, "lxml").find_all("div", class_ = "static")
  return [li.a.text for box in containers for li in box.find_all('li')]
# Fetch the tag list once and print it.
categories = scrapeCategories("")
print(categories)

['0 Day', 'Adobe', 'Afghanistan', 'Africa', 'Algeria', 'Amazon', 'Anonymous', 'Apache', 'Apple', 'Argentina', 'Australia', 'Backdoor', 'Bank', 'BlackBerry', 'Botnet', 'Brazil', 'Britain', 'BSD', 'Canada', 'Car', 'Caribbean', 'Censorship', 'China', 'CIA', 'Cisco', 'Commodore', 'Conference', 'Cookiejacking', 'Cryptography', 'CSRF', 'Cuba', 'Cybercrime', 'Cyberwar', 'Data Loss', 'Database', 'DMCA', 'DNS', 'DoS', 'eBay', 'Egypt', 'Email', 'Ethiopia', 'Facebook', 'FBI', 'Finland', 'Firefox', 'Flaw', 'France', 'Fraud', 'Gamble', 'Germany', 'Google', 'Google Chrome', 'Government', 'Greece', 'Hacker', 'Headline', 'IBM', 'Identity Theft', 'India', 'Indonesia', 'Intel', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Japan', 'Java', 'Juniper', 'Kernel', 'Korea', 'Libya', 'Linux', 'Malaysia', 'Malware', 'McAfee', 'Mexico', 'Microsoft', 'Military', 'Motorola', 'Mozilla', 'MPAA', 'MySQL', 'Nasa', 'Netherlands', 'New Zealand', 'Nintendo', 'Nokia', 'Nortel', 'Norway', 'NSA', 'OpenBSD', 'Opera', 'Oracl

['0 Day', 'Adobe', 'Afghanistan', 'Africa', 'Algeria', 'Amazon', 'Anonymous', 'Apache', 'Apple', 'Argentina', 'Australia', 'Backdoor', '**Bank**', 'BlackBerry', 'Botnet', 'Brazil', 'Britain', '**BSD**', 'Canada', '**Car**', 'Caribbean', '**Censorship**', 'China', 'CIA', 'Cisco', 'Commodore', 'Conference', '**Cookiejacking**', 'Cryptography', '**CSRF**', 'Cuba', '**Cybercrime**', 'Cyberwar', '**Data Loss**', '**Database**', '**DMCA**', 'DNS', 'DoS', 'eBay', 'Egypt', '**Email**', 'Ethiopia', 'Facebook', '**FBI**', 'Finland', 'Firefox', 'Flaw', 'France', '**Fraud**', '**Gamble**', 'Germany', 'Google', 'Google Chrome', 'Government', 'Greece', '**Hacker**', 'Headline', 'IBM', '**Identity Theft**', 'India', 'Indonesia', 'Intel', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Japan', 'Java', 'Juniper', '**Kernel**', 'Korea', 'Libya', 'Linux', 'Malaysia', 'Malware', '**McAfee**', 'Mexico', 'Microsoft', 'Military', 'Motorola', 'Mozilla', '**MPAA**', 'MySQL', 'Nasa', 'Netherlands', 'New Zealand', 'Nintendo', 'Nokia', 'Nortel', 'Norway', '**NSA**', '**OpenBSD**', 'Opera', 'Oracle', 'Pakistan', 'Passport', '**Password**', 'Patch', 'PayPal', 'Philippines', 'Phish', 'Phone', '**Pirate**', 'Portugal', '**Privacy**', '**Religion**', '**RFID**', '**RIAA**', 'Romania', 'RSA', 'Russia', 'Safari', 'Samsung', 'Saudi Arabia', '**Scada**', '**Scam**', '**Science**', 'Scotland', 'Sega', 'Singapore', '**Site**', 'Skype', '**Social**', 'Sony', '**Space**', 'Spain', '**Spam**', 'Spyware', 'SSH', 'SSL', '**Survey**', 'Sweden', 'Switzerland', '**Symantec**', 'Syria', 'Taiwan', 'Terror', 'Thailand', 'Trojan', 'Turkey', 'Twitter', 'Uber', 'USA', 'Venezuela', '**VeriSign**', 'Vietnam', 'Virus', '**VoIP**', '**WebKit**', 'Wireless', 'WordPress', '**Worm**', 'XSS', 'Yahoo!', 'Yemen']

'hacker groups'-> Anonymous
'government' -> Government
'corporation' -> Adobe, Amazon, Apple, Apache, BlackBerry, Cisco, Commodore, eBay, Facebook, Google, IBM, Intel, Juniper, Motorola, Mozilla, Nintendo, Nokia, Nortel, Oracle, PayPal, Samsung, Yahoo!, WordPress, Sega, Skype, Sony, Twitter, Uber
'unrelated' -> Car?, Headline
'darknet'
'cyber defense'
'hacking'
'security concepts'->Cryptography, RSA
'security products'
'network security'-> DNS, DoS, Botnet, SSH, SSL, Wireless
'cyberwar' -> Cyberwar, Terror
'geopolitical' -> Afghanistan, africa, algeria, cia, Argentina, Australia, Brazil, Britain, Canada, Caribbean, China, Cuba, Egypt, Ethiopia, Finland, Greece, Cyberwar, France, Germany, India, Indonesia, Iran, Iraq, Ireland, Israel, Italy, Japan, Korea, Libya, Malaysia, Mexico, NASA, Netherlands, New Zealand, Norway, Pakistan, Passport?, Philippines, Portugal, Romania, Russia, Saudi Arabia, Yemen, Scotland, Singapore, Spain, Sweden, Switzerland, Syria, Taiwan, Terror, Thailand, Turkey, USA, Venezuela, Vietnam
'data breach'
'vulnerability'-> 0 day, Flaw, XSS, Patch
'platform' ->Microsoft, Java, Linux, MySql, Phone, Safari, Google Chrome,Opera, Firefox
'cyber attack' -> Malware, Backdoor, Phish, Virus, Spyware, Trojan

'fraud', 'hacker groups', 'government', 'corporation',
       'unrelated', 'darknet', 'cyber defense', 'hacking', 'security concepts',
       'security products', 'network security', 'cyberwar', 'geopolitical',
       'data breach', 'vulnerability', 'platform', 'cyber attack'

In [None]:
# Name of the merged concept column currently being built; reassigned for
# every entry by the loop that applies replace_cats.
new_concept = "cyber attack"
# Output frame: the raw text plus one column per merged concept.
new_df = pd.DataFrame()
new_df["raw_text"] = df["raw_text"]

# Category merge table. Each entry lists the source category columns under
# "remove" and, under "merge", the single concept column they collapse into.
replace_dicts = [{"remove":["fraud"], "merge": ["fraud"]}, {"remove":["hacker groups"], "merge": ["hacker groups"]},
                 {"remove":["government"], "merge": ["government"]}, {"remove":["corporation"], "merge": ["corporation"]}, 
                 {"remove":["unrelated"], "merge": ["unrelated"]},
                 {"remove":["darknet"], "merge": ["darknet"]}, 
                 {"remove":["incident response", "cyber intelligence", "antivirus", "forensics", "pen testing", "firewall"], "merge": ["cyber defense"]},
                 {"remove":["hacking", "white hat", "black hat","hacker groups"], "merge": ["hacking"]},
                 {"remove":["stenography", "cryptography"], "merge": ["security concepts"]},
                 {"remove":["cloud security", "firewall", "antivirus"], "merge": ["security products"]},
                 {"remove":["botnet", "IoT", "DDOS"], "merge": ["network security"]},
                 {"remove":["cyberwar"], "merge": ["cyberwar"]},
                 {"remove":["geopolitical", "cyberwar", "usa", "russia", "ukraine", "cyberterrorism"], "merge": ["geopolitical"]},
                 {"remove":["data breach", "security breach"], "merge": ["data breach"]},
                 {"remove":["vulnerability", "cve", "XSS", "patch"], "merge": ["vulnerability"]},
                 {"remove":["mobile", "IoT", "OS"], "merge": ["platform"]},
                 {"remove":["cyber attack", "malware", "virus", "DDOS", "botnet", "hacking", "phishing",
                            "adware", "rootkit", "backdoor", "keylog", "trojan", "ransomware", "spyware"], "merge": ["cyber attack"]},
                 ]

def replace_cats(row, entry):
  """Set the merged-concept flag on *row* from the source categories in *entry*.

  The target column name is taken from ``entry["merge"][0]`` rather than from
  a module-level variable, so the function gives the same result no matter
  what global state the caller has set up.

  Args:
    row: mapping-like record (e.g. a DataFrame row) holding one flag per
      source category named in ``entry["remove"]``.
    entry: dict with ``"remove"`` (source category names) and ``"merge"``
      (a one-element list holding the target concept name).

  Returns:
    *row*, with the target column set to the first source flag equal to 1,
    or to 0 when none of the source flags is 1.

  Raises:
    KeyError: if *row* lacks one of the source categories.
  """
  target = entry["merge"][0]
  for cat in entry["remove"]:
    if row[cat] == 1:
      row[target] = row[cat]
      return row
  # Assigned only after every source flag has been read: the target may also
  # appear among the sources.
  row[target] = 0
  return row

# For every mapping entry, new_concept holds the name of the merged column;
# replace_cats is applied row-wise to df and the resulting column of that
# name is copied into new_df.
for entry in replace_dicts:
  new_concept = entry["merge"][0]
  new_df[new_concept] = df.apply(replace_cats, args=[entry], axis=1)[new_concept]

## Naive Bayes

In [None]:
import json
import pandas as pd

In [None]:
# Build the classifier dataset from The Register: "news" holds the stemmed
# article text and "category" the label assigned by scrapeTheRegister.
registerFrames = []
for i in range(1):
  if(i == 0):
    registerFrames.append(scrapeTheRegister(""))
  else:
    registerFrames.append(scrapeTheRegister("earlier/" + str(i + 1) + "/"))

scraped = pd.concat(registerFrames, ignore_index = True)
# scrapeTheRegister returns a DataFrame whose text column is "stemmed text"
# (no "bow" field exists). reindex keeps both columns present even when
# nothing was scraped.
df = scraped.reindex(columns = ["stemmed text", "category"]).rename(columns = {"stemmed text": "news"})

# df["category"].replace(categoryMap, inplace = True)

# The original `with` statement had no body, which is a syntax error.
# NOTE(review): writing the frame as JSON records is assumed to be the
# intended content of theRegister.json — confirm before uploading.
with open("theRegister.json", "w") as f: #if exists, open; else, create
  f.write(df.to_json(orient = "records"))

print(df)

# uploadFile("theRegister.json")

IndentationError: ignored

In [None]:
# with open("bleepingComputer.json", "w") as f: #if exists, open; else, create
# df = pd.DataFrame(columns = ["news", "category"])

# for i in range(100):
#   if(i == 0):
#     arr = scrapeBleepingComputer("")
#   else:
#     arr = scrapeBleepingComputer("page/" + str(i + 1) + "/")
#   for i in range(len(arr)):
#     df = df.append({"news": arr[i]["bow"], "category": arr[i]["category"]}, ignore_index = True)

# df["category"].replace(categoryMap, inplace = True)

# print(df)

# uploadFile("bleepingComputer.json")

In [None]:
# with open("securityAffairs.json", "w") as f: #if exists, open; else, create

# df = pd.DataFrame(columns = ["news", "category"])

# for i in range(100):
#   if(i == 0):
#     arr = scrapeSecurityAffairs("")
#   else:
#     arr = scrapeSecurityAffairs("page/" + str(i + 1) + "/")
#   for i in range(len(arr)):
#     df = df.append({"news": arr[i]["bow"], "category": arr[i]["category"]}, ignore_index = True)

# df["category"].replace(categoryMap, inplace = True)

# print(df)

# uploadFile("securityAffairs.json")

In [None]:
# Hold out 25% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test= train_test_split(df["news"], df["category"], test_size=0.25, random_state=42)

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

# Create a class for converting sparse matrix output of TfidfVectorizer to dense matrix for feeding into GaussianNB
class DenseTransformer(TransformerMixin):
    """Pipeline step that turns a sparse matrix into a dense ``numpy.ndarray``.

    It sits between ``TfidfVectorizer`` (sparse output) and ``GaussianNB``
    (dense input) in the pipeline built below.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return *X* as a dense 2-D ndarray.

        ``toarray()`` is used instead of ``todense()`` because the latter
        returns a ``numpy.matrix``, which current scikit-learn estimators
        reject. Input that is already dense is passed through ``np.asarray``.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)


# Build the pipeline with sklearn's Pipeline class -> https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
# It has three steps: 1) TfidfVectorizer 2) DenseTransformer 3) Naive Bayes classifier.

pipeline=Pipeline([('tfidf',TfidfVectorizer()),('dense_transformer',DenseTransformer()),("gnb",GaussianNB())]) 


# Hyperparameter space scanned by GridSearchCV. Keys use the
# "<step name>__<parameter>" form, so both entries tune the 'tfidf' step.

grid_params = {
  "tfidf__ngram_range": [(1, 1), (1, 2), (1, 3)],
  "tfidf__min_df": [10, 15, 40],
  # "tfidf__max_df": []
}

In [None]:
# Run GridSearchCV over the hyperparameter space and keep the combination
# with the best macro-averaged F1 score.

cv_nb = GridSearchCV(pipeline, grid_params, scoring='f1_macro', cv =5, verbose = 1)
cv_nb.fit(X_train, y_train)


# Report the standard deviation of split scores for each hyperparameter group.

# One array per CV split, each holding one test score per hyperparameter
# combination. The number of splits is read from the fitted search, not
# hard-coded.
splits = [cv_nb.cv_results_['split' + str(k) + '_test_score'] for k in range(cv_nb.n_splits_)]
avg_scores = [np.mean(split) for split in splits]
# argmin/argmax return the first extreme, like list.index(min(...)).
min_index = int(np.argmin(avg_scores))
max_index = int(np.argmax(avg_scores))
min_score = avg_scores[min_index]
max_score = avg_scores[max_index]
print("Standard Deviations : ", cv_nb.cv_results_['std_test_score'])
print("Mean Scores : ", cv_nb.cv_results_['mean_test_score'])
print("Min Score : ", splits[min_index], "Split index of min: ", min_index)
print("Max Score : ", splits[max_index], "Split index of max: ", max_index)

# Show the best parameter set for the given dataset and hyperparameter space.

print("Best Parameters: ", cv_nb.best_params_)

# Rebuild the pipeline with the best parameter set and fit it on the
# training data.
best_pipeline=Pipeline([('tfidf',TfidfVectorizer()),('dense_transformer',DenseTransformer()),("gnb",GaussianNB())])
best_pipeline.set_params(**cv_nb.best_params_)
best_pipeline.fit(X_train, y_train)

# Predict on the test set, report F1 and accuracy, then show the confusion
# table.
from sklearn.metrics import classification_report
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
pred = best_pipeline.predict(X_test)
print(classification_report(y_test, pred)) #f1 and accuracy scores report
title = "News Categories Confusion Table"
disp = ConfusionMatrixDisplay.from_estimator(best_pipeline, X_test, y_test, cmap=plt.cm.Blues, normalize="true")
disp.ax_.set_title(title)
plt.show()