In [4]:
import pandas as pd
import os

In [5]:
files = [name if name.endswith('.csv') else '' for name in os.listdir('../data')]

In [6]:
files

['ransomware_detect.csv',
 'application_control.csv',
 'SIEM.csv',
 'system_tweak.csv',
 'EDR.csv',
 'MDR.csv',
 'anti-counterfeit.csv',
 'NGFW.csv',
 'unwanted_programs.csv',
 'atm_security.csv',
 'SOAR.csv',
 'CASB.csv',
 'spyware_detect.csv',
 'industrial_iot_security.csv',
 'malware_detect.csv',
 'honeypot.csv',
 'NDR.csv',
 'sandbox.csv',
 'parental_control.csv',
 'SASE.csv']

In [7]:
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import spacy
from tqdm import tqdm

In [8]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
nlp.max_length = 1500000
def clean_text(df):
    # download list of countries
    with open('countries.txt', 'r') as f:
        init_countries = f.readlines()
        countries = [c.lower()[:-1] for c in init_countries]
    all_reviews = list()
    lines = df["summary"].values.tolist()
    for text in tqdm(lines):
        text = text.lower()
        text = re.sub(r"\xa0", " ", text)
        
        # remove links
        pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text = pattern.sub('', text)
        text = re.sub(r"www\.(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "", text)
        text = re.sub(r"[\w\d_\-]+[\w\d_\-]+\.[\w\d]{2,}", "", text)
        
        # dynamic reg_exp to remove country names
        for country in countries:
            pattern = re.compile(country)
            text = pattern.sub('COUNTRYNAME', text)

        # remove nums concatenated with texts
        text = re.sub(r"(?<=[a-zA-Z])(\d+\.*)+", "", text)

        # replace num years to YEAR
        text = re.sub(r"[1|2]\d{3}", "YEARNUM", text)

        # remove nums with dot
        text = re.sub(r"(\d+\.*)+", "", text)

        # remove email
        text = re.sub(r"[\w\d_\-]+@[\w\d_\-]+\.[\w\d]{2,}", "", text)

        # remove punctuations
        text = re.sub(r"[,.\"'!@#$%^&*(){}?/;`~:<>+=\|•_]", " ", text)
        text = re.sub(r"\s+", " ", text)
        text = re.sub(r" - ", " ", text)
        text = re.sub(r"-{2,}", " ", text)
        
        text = re.sub(r"-", "dashh", text)
        
        # remove non-ASCII characters
        text = re.sub(r"[\w\d]*[^\x00-\x7F]+[\w\d]*", '', text)
        
        text = re.sub(r"\s+", ' ', text)
        
        text = text.strip()
        
        tokens = word_tokenize(text)
        
        # leave only words
        words = [word for word in tokens if word.isalpha()]
        
        words = [re.sub(r"dashh", '-', word) for word in tokens]
        
        stop_words = set(stopwords.words("english"))
        
        words = [w for w in words if not w in stop_words]
        words = ' '.join(words)
        
        doc = nlp(words)
        res = " ".join([token.lemma_ for token in doc])
        res = re.sub(r" - ", '-', res)
        all_reviews.append(res)
    return all_reviews

In [13]:
import statistics
def get_len_text(text):
    return len(text.split())

stats = {
    'name': [],
    'num_exp': [],
    'lens': [],
    'min_len': [],
    'max_len': [],
    'med_len': []
}

for file_name in files:
    #print(file_name)
    if file_name:
        class_name = re.sub(r"\.csv", '', file_name)
        df = pd.read_csv('data/' + file_name)
        df = df.dropna(subset=['summary'])
        df['len_text'] = df['summary'].apply(get_len_text)
        stats['name'].append(class_name)
        stats['num_exp'].append(df.shape[0])
        stats['lens'].append(df['len_text'].values.tolist())
        stats['min_len'].append(min(df['len_text'].values.tolist()))
        stats['max_len'].append(max(df['len_text'].values.tolist()))
        stats['med_len'].append(statistics.median(df['len_text'].values.tolist()))

In [14]:
# statistics for datasets
stats_df = pd.DataFrame.from_dict(stats)
stats_df

Unnamed: 0,name,num_exp,lens,min_len,max_len,med_len
0,ransomware_detect,58998,"[686, 466, 543, 474, 550, 384, 561, 543, 359, ...",11,9843,714.0
1,application_control,5231,"[817, 91, 136, 174, 251, 868, 297, 381, 423, 8...",18,18530,886.0
2,SIEM,6900,"[104, 676, 979, 728, 728, 699, 728, 886, 728, ...",2,24093,818.5
3,system_tweak,3775,"[525, 2567, 1582, 559, 842, 614, 411, 1244, 17...",20,212343,1193.0
4,EDR,11670,"[122, 1871, 1631, 881, 1081, 93, 1273, 761, 75...",5,8077,785.0
5,MDR,10537,"[548, 2245, 941, 734, 434, 873, 1044, 672, 610...",16,37648,650.0
6,anti-counterfeit,32343,"[2070, 588, 128, 883, 946, 1095, 745, 622, 458...",1,46664,790.0
7,NGFW,2278,"[95, 672, 796, 2065, 385, 975, 733, 1348, 806,...",5,5101,749.0
8,unwanted_programs,82503,"[980, 685, 291, 274, 317, 360, 305, 883, 316, ...",1,121516,543.0
9,atm_security,4488,"[912, 271, 181, 1512, 805, 2236, 679, 383, 896...",10,212343,812.0


In [15]:
min_max_len = min(stats_df['max_len'].values.tolist())
min_max_len

5101

In [16]:
# get min value
min_val = min(stats_df['num_exp'].values.tolist())
min_val

2278

In [17]:
def get_len_text(text):
    return len(text.split())
df['len_text'] = df['summary'].apply(get_len_text)

In [22]:
stats = {
    'name': [],
    'num_exp': []
}
for file_name in files:
    print(file_name)
    if file_name:
        class_name = re.sub(r"\.csv", '', file_name)
        df = pd.read_csv('data/' + file_name)
        df = df.dropna(subset=['summary'])
        df['len_text'] = df['summary'].apply(get_len_text)
        df = df[df['len_text'].apply(lambda x: x <= min_max_len and x > 100)]
        if df.shape[0] > min_val:
            df = df.sample(min_val)
        pro_summary = clean_text(df)
        new_df = pd.DataFrame()
        new_df['init_text'] = df["summary"].values.tolist()
        new_df['text'] = pro_summary
        new_df['label'] = [class_name]*len(pro_summary)
        new_df.to_csv('data_processed/' + file_name, index=False)
        
        stats['name'].append(class_name)
        stats['num_exp'].append(len(pro_summary))

ransomware_detect.csv


100% 2278/2278 [01:17<00:00, 29.31it/s]


application_control.csv


100% 2278/2278 [01:20<00:00, 28.29it/s]


SIEM.csv


100% 2278/2278 [01:30<00:00, 25.27it/s]


system_tweak.csv


100% 2278/2278 [02:05<00:00, 18.14it/s]


EDR.csv


100% 2278/2278 [01:15<00:00, 30.17it/s]


MDR.csv


100% 2278/2278 [01:15<00:00, 30.24it/s]


anti-counterfeit.csv


100% 2278/2278 [01:29<00:00, 25.32it/s]


NGFW.csv


100% 2202/2202 [01:17<00:00, 28.31it/s]


unwanted_programs.csv


100% 2278/2278 [01:02<00:00, 36.64it/s]


atm_security.csv


100% 2278/2278 [01:26<00:00, 26.21it/s]


SOAR.csv


100% 2278/2278 [01:13<00:00, 31.10it/s]


CASB.csv


100% 2278/2278 [01:17<00:00, 29.43it/s]


spyware_detect.csv


100% 2278/2278 [01:19<00:00, 28.77it/s]


industrial_iot_security.csv


100% 2278/2278 [01:33<00:00, 24.38it/s]


malware_detect.csv


100% 2278/2278 [01:12<00:00, 31.22it/s]


honeypot.csv


100% 2278/2278 [01:34<00:00, 24.17it/s]
  0% 0/2278 [00:00<?, ?it/s]

NDR.csv


100% 2278/2278 [01:08<00:00, 33.17it/s]


sandbox.csv


100% 2278/2278 [01:41<00:00, 22.37it/s]


parental_control.csv


100% 2278/2278 [01:19<00:00, 28.52it/s]


SASE.csv


100% 2278/2278 [01:19<00:00, 28.56it/s]


In [None]:
# generate train val test data

In [23]:
whole_df = pd.DataFrame()
for file_name in files:
    if file_name:
        class_name = re.sub(r"\.csv", '', file_name)
        df = pd.read_csv('../data_processed/' + file_name)
        whole_df = pd.concat([whole_df, df], ignore_index=True)

In [24]:
whole_df.head()

Unnamed: 0,init_text,text,label
0,"This advertisement has not loaded yet, but you...",advertisement load yet article continue small ...,ransomware_detect
1,Trend Micro research reveals visibility challe...,trend micro research reveal visibility challen...,ransomware_detect
2,The introduction of SaaS Security Posture Mana...,introduction saas security posture management ...,ransomware_detect
3,New report identifies financial savings and in...,new report identify financial saving increase ...,ransomware_detect
4,"For 21 years, the software company Kaseya labo...",year software company kaseya labor relative ob...,ransomware_detect


In [25]:
from sklearn.model_selection import train_test_split

In [26]:
X_train, X_test_u, y_train, y_test_u = train_test_split(whole_df['text'].values.tolist(), whole_df['label'].values.tolist(), test_size=0.3, random_state=42)

In [27]:
X_test, X_val, y_test, y_val = train_test_split(X_test_u, y_test_u, test_size=0.33, random_state=42)

In [28]:
X_train_init, X_test_u_init, y_train_init, y_test_u_init = train_test_split(whole_df['init_text'].values.tolist(), whole_df['label'].values.tolist(), test_size=0.3, random_state=42)

In [29]:
X_test_init, X_val_init, y_test_init, y_val_init = train_test_split(X_test_u_init, y_test_u_init, test_size=0.33, random_state=42)

In [30]:
train_df_to_save = pd.DataFrame()
train_df_to_save['init_text'] = X_train_init
train_df_to_save['text'] = X_train
train_df_to_save['label'] = y_train
train_df_to_save.to_csv('../data_processed/model_data/train_data.csv', index=False)

In [32]:
val_df_to_save = pd.DataFrame()
val_df_to_save['init_text'] = X_val_init
val_df_to_save['text'] = X_val
val_df_to_save['label'] = y_val
val_df_to_save.to_csv('../data_processed/model_data/val_data.csv', index=False)

In [33]:
test_df_to_save = pd.DataFrame()
test_df_to_save['init_text'] = X_test_init
test_df_to_save['text'] = X_test
test_df_to_save['label'] = y_test
test_df_to_save.to_csv('../data_processed/model_data/test_data.csv', index=False)