In [1]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn
import string

# Fetch the NLTK resources this notebook relies on: the stop-word corpus
# (read via stopwords.words) and the 'punkt' tokenizer package.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

def remove_stopwords_and_punctuation(words):
    """Lowercase tokens, strip punctuation, and drop stop words and empties.

    Args:
        words: Iterable of string tokens.

    Returns:
        A list of normalized tokens (lowercased, with every character from
        ``string.punctuation`` removed), excluding tokens that become empty
        and tokens found in NLTK's English stop-word list. Input order is
        preserved.
    """
    stop_words = set(stopwords.words('english'))
    punct_table = str.maketrans('', '', string.punctuation)
    # Normalize each token exactly once; the filter and the result then share
    # that value instead of recomputing lower()/translate() per comparison.
    normalized = (word.lower().translate(punct_table) for word in words)
    return [token for token in normalized if token and token not in stop_words]

def preprocess_sentences(input_file, output_file):
    """Clean a one-sentence-per-line text file and write the result.

    Every line of ``input_file`` is stripped, tokenized with ``word_tokenize``
    and passed through ``remove_stopwords_and_punctuation``. The surviving
    tokens of each line are joined with single spaces and written as one line
    of ``output_file``. Both files are handled as UTF-8.
    """
    # Read the whole input first so the output is only opened once every
    # sentence has been loaded.
    with open(input_file, 'r', encoding='utf-8') as src:
        raw_sentences = [raw_line.strip() for raw_line in src]

    cleaned_sentences = [
        remove_stopwords_and_punctuation(word_tokenize(text))
        for text in raw_sentences
    ]

    with open(output_file, 'w', encoding='utf-8') as dst:
        dst.writelines(' '.join(tokens) + '\n' for tokens in cleaned_sentences)

def train_word2vec_model(sentences_file, model_save_path, vector_size=100, window=5, min_count=3, workers=4):
    """Train a Word2Vec model on a sentence file and save it to disk.

    Args:
        sentences_file: Path to the corpus file, wrapped in ``LineSentence``
            (the preprocessing step writes one space-separated sentence per
            line).
        model_save_path: Destination handed to ``Word2Vec.save``.
        vector_size: Forwarded to ``Word2Vec`` (default 100).
        window: Forwarded to ``Word2Vec`` (default 5).
        min_count: Forwarded to ``Word2Vec`` (default 3).
        workers: Forwarded to ``Word2Vec`` (default 4).

    The keyword defaults equal the values that were previously hard-coded, so
    existing two-argument calls behave as before.

    Returns:
        ``model_save_path``, unchanged.
    """
    # Iterate over the corpus file through gensim's LineSentence wrapper.
    corpus = LineSentence(sentences_file)

    trained_model = Word2Vec(
        sentences=corpus,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )

    trained_model.save(model_save_path)

    return model_save_path

# Example usage: locations of the raw corpus, the cleaned corpus, and the saved model.
input_file = "./cybersecurity_sentences.txt"
output_file = "./preprocessed_cybersecurity_sentences.txt"
model_save_path = "./cybersecurity_word2vec.model"

# Step 1: clean input_file line by line and write the result to output_file.
preprocess_sentences(input_file, output_file)

# Step 2: train Word2Vec on the cleaned file; the function returns the save path it was given.
trained_model_path = train_word2vec_model(output_file, model_save_path)

print("Word2Vec model trained and saved at:", trained_model_path)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/p76111262/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/p76111262/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Word2Vec model trained and saved at: ./cybersecurity_word2vec.model


In [4]:
from gensim.models import Word2Vec

# Path of the saved Word2Vec model to load.
model_path = "./cybersecurity_word2vec.model"
# `model` is the object the similarity-query cell reads via model.wv.
model = Word2Vec.load(model_path)

In [7]:
def remove_punctuation_and_lower(text):
  """Return ``text`` with all ``string.punctuation`` characters removed, lowercased."""
  # Mapping each punctuation character to None tells translate() to delete it.
  deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
  without_punctuation = text.translate(deletion_table)
  return without_punctuation.lower()

# Attack labels whose nearest neighbours in the embedding space are printed below.
word_list = ["DDoS_HOIC", "DDoS_LOIC-HTTP", "DDoS_LOIC-UDP", "DoS_GoldenEye", "DoS_Hulk", "DoS_SlowHTTPTest", "DoS_Slowloris", "BruteForce-Web", "BruteForce-SSH", "BruteForce-FTP", "BruteForce-XSS", "SQL-Injection", "Infiltration", "Botnet"]
for raw_label in word_list:
    # Strip punctuation and lowercase the label, the same per-token
    # normalization the preprocessing step applies before training.
    word = remove_punctuation_and_lower(raw_label)
    # most_similar raises KeyError for tokens absent from the vocabulary
    # (e.g. ones seen fewer than min_count times); report and continue
    # instead of aborting the whole loop.
    if word not in model.wv:
        print(word, "not in vocabulary; skipped")
        continue
    print(word, "similar:", model.wv.most_similar(word))

ddoshoic similar: [('http', 0.3622981309890747), ('using', 0.3541862368583679), ('force', 0.3404111862182617), ('attack', 0.3297610282897949), ('tool', 0.29176801443099976), ('service', 0.27642467617988586), ('malware', 0.27457085251808167), ('examples', 0.27328285574913025), ('vast', 0.2610372304916382), ('internet', 0.2584122121334076)]
ddosloichttp similar: [('resources', 0.30744606256484985), ('sending', 0.30023953318595886), ('attacks', 0.2975677251815796), ('discovered', 0.2953900992870331), ('security', 0.28720736503601074), ('concurrent', 0.2740911841392517), ('doshulk', 0.27153289318084717), ('internet', 0.2570054233074188), ('specifically', 0.2528572082519531), ('often', 0.24815917015075684)]
ddosloicudp similar: [('websites', 0.2498084306716919), ('sqli', 0.232588529586792), ('volume', 0.22023993730545044), ('different', 0.21911396086215973), ('succession', 0.19887429475784302), ('measures', 0.19685114920139313), ('overwhelm', 0.19487984478473663), ('encrypted', 0.1859719753