In [37]:
import nltk
import contractions
import inflect
# from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
import re, string
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import load_dataset

nltk.download('punkt')
nltk.download('wordnet')



[nltk_data] Downloading package punkt to /Users/cankrmn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/cankrmn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Loading the Data

In [33]:
df = pd.read_csv("./reduced_dataset[0,1000].csv")

In [35]:
train, test= train_test_split(df, test_size=0.15, random_state=42)
train, validation= train_test_split(train, test_size=0.20, random_state=42)

train.to_csv('train_data.csv', index=False)
validation.to_csv('validation_data.csv', index=False)
test.to_csv('test_data.csv', index=False)

In [None]:
dataset = load_dataset('csv', data_files={'train': ['train_data.csv'], 'test': 'test_data.csv', 'validation' : 'validation_data.csv'})

In [39]:
example = dataset['train'][1]
example

{'Unnamed: 0': 284,
 'raw_text': 'Operation Diànxùn: Cyberespionage Campaign Targeting Telecommunication Companies\nhttps://www.mcafee.com/blogs/other-blogs/mcafee-labs/operation-dianxun-cyberespionage-campaign-targeting-telecommunication-companies/',
 'fraud': 0,
 'hacker groups': 0,
 'government': 0,
 'corporation': 0,
 'unrelated': 0,
 'darknet': 0,
 'cyber defense': 0,
 'hacking': 0,
 'security concepts': 0,
 'security products': 0,
 'network security': 0,
 'cyberwar': 0,
 'geopolitical': 1,
 'data breach': 0,
 'vulnerability': 0,
 'platform': 0,
 'cyber attack': 1}

## Preprocessing

In [19]:
def url_remover(data): # remove any url in text
  return re.sub(r'https?\S+','',data)

def web_associated(data):
  text = url_remover(text)
  return text

# -------------------------------

def remove_round_brackets(data): # remove anything between two round brackets
   return re.sub('\(.*?\)','',data)

punctList = string.punctuation + '“”' 
def remove_punc(data): # remove any punctuation
  trans = str.maketrans('','', punctList)
  return data.translate(trans)

def white_space(data): # remove any double or more space
  return ' '.join(data.split())

def complete_noise(data):
  new_data = remove_round_brackets(data)
  new_data = remove_punc(new_data)
  new_data = white_space(new_data)
  return new_data

# -------------------------------
def text_lower(data): # make every letter lowercase
  return data.lower()

def contraction_replace(data): # fix contractions (e.g. won't => will not)
  return contractions.fix(data)

def number_to_text(data): # write numbers as text and return (...12... => ...twelve...)
  temp_str = data.split()
  string = ""
  for i in temp_str:
    if i.isdigit(): # if the word is digit, converted to 
      temp = inflect.engine().number_to_words(i)
      string += temp + " "
    else:
      string += i + " "
  return string.strip()

def normalization(data):
  text = text_lower(data)
  text = number_to_text(text)
  text = contraction_replace(text)
  tokens = nltk.word_tokenize(text)
  return tokens

# -------------------------------

def stopword(data): # remove stopwords
  clean = []
  for i in data:
    if i not in stopwords.words('english'):
      clean.append(i)
  return clean

def stemming(data): # stem the text
  stemmer = LancasterStemmer()
  stemmed = []
  for i in data:
    stem = stemmer.stem(i)
    stemmed.append(stem)
  return stemmed

def lemmatization(data): # lemmatize the text
  lemma = WordNetLemmatizer()
  lemmas = []
  for i in data:
    lem = lemma.lemmatize(i, pos='v')
    lemmas.append(lem)
  return lemmas  

def final_process(data):
  stopwords_remove = stopword(data)
  stemmed = stemming(stopwords_remove)
  lemm = lemmatization(stopwords_remove)
  return stemmed, lemm

# -------------------------------

def preprocess(data): # run all preprocessing functions
  txt = url_remover(data)
  txt = complete_noise(txt)
  txt = normalization(txt)
  stemmed, lemm = final_process(txt)
  return stemmed, lemm

In [20]:
txt = '"Securing 213 Data With a Frenzied Remote Workforce-Podcast https://threatpost.com/securing-data-frenzied-remote-workforce-podcast/178742/ Stock the liquor cabinet and take a shot whenever you hear GitLab Staff Security Researcher Mark Loveless say “Zero Trust.”"'

stemmed, lemm = preprocess(txt)

print(stemmed, "\n\n", lemm)

['sec', 'two', 'hundr', 'thirteen', 'dat', 'frenzy', 'remot', 'workforcepodcast', 'stock', 'liqu', 'cabinet', 'tak', 'shot', 'whenev', 'hear', 'gitlab', 'staff', 'sec', 'research', 'mark', 'loveless', 'say', 'zero', 'trust'] 

 ['secure', 'two', 'hundred', 'thirteen', 'data', 'frenzied', 'remote', 'workforcepodcast', 'stock', 'liquor', 'cabinet', 'take', 'shoot', 'whenever', 'hear', 'gitlab', 'staff', 'security', 'researcher', 'mark', 'loveless', 'say', 'zero', 'trust']


In [23]:
labels = ['fraud', 'hacker groups', 'government', 'corporation',
       'unrelated', 'darknet', 'cyber defense', 'hacking', 'security concepts',
       'security products', 'network security', 'cyberwar', 'geopolitical',
       'data breach', 'vulnerability', 'platform', 'cyber attack']

id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}