# Optimising the number of topics

We needed to find an optimal number of topics that we can use across all our models. 

In [12]:
import gensim
import pandas as pd
import numpy as np
from gensim.models import ldaseqmodel
from ast import literal_eval
from sklearn.metrics import jaccard_score
import statistics

In [3]:
df = pd.read_csv("../data/processed/formatted_df.csv").drop(columns = ['Unnamed: 0'])
df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Name,Status,Description,References,Phase,Votes,Comments
0,CVE-1999-0001,Candidate,"['ipinputc', 'bsdderived', 'tcpip', 'implement...",BUGTRAQ:19981223 Re: CERT Advisory CA-98.13 - ...,Modified (20051217),"MODIFY(1) Frech | NOOP(2) Northcutt, W...",Christey> A Bugtraq posting indicates that the...
1,CVE-1999-0002,Entry,"['buffer', 'overflow', 'nfs', 'mountd', 'give'...",BID:121 | URL:http://www.securityfocus.com...,,,
2,CVE-1999-0003,Entry,"['execute', 'command', 'root', 'buffer', 'over...",BID:122 | URL:http://www.securityfocus.com...,,,
3,CVE-1999-0004,Candidate,"['mime', 'buffer', 'overflow', 'email', 'clien...",CERT:CA-98.10.mime_buffer_overflows | MS:M...,Modified (19990621),"ACCEPT(8) Baker, Cole, Collins, Dik, Landfi...","Frech> Extremely minor, but I believe e-mail i..."
4,CVE-1999-0005,Entry,"['arbitrary', 'command', 'execution', 'imap', ...",BID:130 | URL:http://www.securityfocus.com...,,,
...,...,...,...,...,...,...,...
166896,CVE-2021-46482,Candidate,"['jsish', 'v', 'discover', 'contain', 'heap', ...",MISC:https://github.com/pcmacdon/jsish/issues/66,Assigned (20220124),None (candidate not yet proposed),
166897,CVE-2021-46483,Candidate,"['jsish', 'v', 'discover', 'contain', 'heap', ...",MISC:https://github.com/pcmacdon/jsish/issues/62,Assigned (20220124),None (candidate not yet proposed),
166898,CVE-2021-46559,Candidate,"['firmware', 'moxa', 'tn', 'device', 'weak', '...",MISC:https://www.moxa.com/en/support/product-s...,Assigned (20220126),None (candidate not yet proposed),
166899,CVE-2021-46560,Candidate,"['firmware', 'moxa', 'tn', 'device', 'allow', ...",MISC:https://www.moxa.com/en/support/product-s...,Assigned (20220126),None (candidate not yet proposed),


We found that the way that we saved the data frame, meant that the Description column was read as a string rather than a list as it was intended. Therefore, we had to apply the function literal_eval which allows us to convert the string of a stored list into a python list. We then separate the description column into a list to allow easier access. 

In [4]:
df['Description'] = df['Description'].apply(literal_eval)

In [5]:
desc = df['Description']

To separate the data into years easily we find the boundary indices for each year. We will use that the year of the vulnerability is the 4-8 characters of each of the CVE names. Therefore, we will extract these and count the instances for each one and add it to the last boundary index.

In [6]:
names = df['Name']
year = []
for instance in names:
    year.append(int(instance[4:8]))
year_count = [0]
for i in range(23):
    if i == 0:
        year_count.append(year.count(i+1999))
    else:
        year_count.append(year.count(i+1999) + year_count[i]) 
print(year_count)

[0, 1541, 2778, 4313, 6663, 8161, 10794, 15380, 22238, 28578, 35549, 40436, 45428, 50015, 55416, 61536, 69815, 77731, 86931, 101250, 116731, 132000, 149784, 166901]


Here we create a dictionary of words that occur in the whole data set, allowing us to index each of these words. We also format the documents of each year into a matrix which indicates how many times each word occurs in each document. 

In [7]:
vocab = gensim.corpora.Dictionary(desc)
doc_word_matrix_array = []
for i in range(23):
    doc_word_matrix_array.append([vocab.doc2bow(doc) for doc in desc[year_count[i]:year_count[i+1]]])

In [8]:
LDA = gensim.models.ldamodel.LdaModel

### Coherence and Perplexity

We then wanted to find how many topics would be optimal for the LDA model. We thought we would try the range between 5 and 30 topics as we felt that it would provide enough topics to look at, while ensuring that analysis would be relatively easy. However after running the code for a while, it became obvious that coherence will strongly prefer few topics and so would not be a good metric for this. We used the mean to combine the coherence of the models for each year to allow us an easy way to compare the different sets of models using a simple metric.

In [None]:
coherence=-1000 #to initialise the coherence
for topics in range(5,30):
    ldamodels=[]
    ldacoherence=[]
    for i in range(23): #for each year create a model and test the coherence of that model on the data
        ldamodels.append(LDA(corpus=doc_word_matrix_array[i], id2word=vocab, num_topics=topics)) 
        ldacoherence.append(CoherenceModel(model=ldamodels[i], corpus=doc_word_matrix_array[i], dictionary=vocab, coherence='u_mass').get_coherence())
    temp_coherence = statistics.mean(ldacoherence)
    #if the new model works better than previous ones then save its parameters
    if temp_coherence > coherence:
        coherence = temp_coherence
        models = ldamodels
    print(coherence)

We then tried to use perplexity instead. However, we came to a similar problem that perplexity strongly prefers models with a large number of topics.

In [None]:
perplexity=1000 #to initialise the perplexity
for topics in range(5,30):
    ldamodels=[]
    ldaperplexity=[]
    for i in range(23): #for each year create a model and test the perplexity of that model on the data
        ldamodels.append(LDA(corpus=doc_word_matrix_array[i], id2word=vocab, num_topics=topics))
        ldaperplexity.append(ldamodels[i].log_perplexity(doc_word_matrix_array[i]))
    temp_perplexity = statistics.mean(ldaperplexity)
    #if the new model works better than previous ones then save its parameters
    if temp_perplexity < perplexity:
        perplexity = temp_perplexity
        models = ldamodels
    print(perplexity)

### Jaccard Score

The reason that we having issues with perplexity and coherence is because it doesn't take into consideration the number of words in each topic, which would give a balance to their scoring. Therefore, we thought that the Jaccard similarity would act as a good way of being able to balance this. 

In order calculate the jaccard score for the collection of topics we decided that we would sum the scores of each topic compared with every other topic. 

However, we found this did not work as by using a sum, it will increase as topics increase and so will not be a fair evaluation of the jaccard scores. 

In [None]:
#used to extract the top 50 words in each topic and return a list of topics containing the list of 50 words
def extract_Words(model,num_topics): 
    topics=[]
    for words in model.show_topics(num_topics = num_topics, num_words = 50, formatted=False):
        (a,b) = words #removes the index
        word=[]
        for j in b:
            (c,d) = j #extracts the word from the (word, probability) tuple
            word.append(c)
        topics.append(word)
    return topics

#returns the 'jaccard score' of the set of topics
def get_Jaccard(topics,num_topics):
    score = 0
    for i in range(len(topics)):
        for j in range(len(topics)):
            score = score + jaccard_score(topics[i],topics[j],average='macro')
    return score

jaccard=100
for topics in range(5,30):
    ldamodels=[]
    ldajaccard=[]
    for i in range(23):
        ldamodels.append(LDA(corpus=doc_word_matrix_array[i], id2word=vocab, num_topics=topics))
        ldajaccard.append(get_Jaccard(extract_Words(ldamodels[i],topics),topics))
    temp_jaccard = statistics.mean(ldajaccard)
    #if the new model works better than previous ones then save its parameters
    if temp_jaccard < jaccard:
        jaccard = temp_jaccard
        models = ldamodels
    print(jaccard)

After realising the sum did not work, we decided that multiplying would provide a better way of combining the jaccard scores of the topics. We also thought we would inverse the score as number close to 0 have more impact on the product so by inverting the score, we are highlighting when there is a large intersection between topics. We also accounted for the case where topics are equal by assigning any such values as non-0, while being much lower than any other possible value. 

However, after running this the model with 5 topics was optimal and we believe that this is much more likely due to the biased metric rather than 5 actually being the optimal number. In this case we believe that the metric is inhibited by the inability to be able to look at the whole set of words in each topic, and must instead choose a fixed value for each. This means that the number of words in each topic appears to be 50 for each, which is not the case and undermines the weighting that we hoped the jaccard score would provide. 

In [13]:
#used to extract the top 50 words in each topic and return a list of topics containing the list of 50 words
def extract_Words(model,num_topics):
    topics=[]
    for words in model.show_topics(num_topics = num_topics, num_words = 50,formatted=False):
        (a,b) = words #removes the index
        word=[]
        for j in b:
            (c,d) = j #extracts the word from the (word, probability) tuple
            word.append(c)
        topics.append(word)
    return topics

#returns the 'jaccard score' of the set of topics
def get_inverse_Jaccard(topics,num_topics):
    score = 1
    for i in range(len(topics)):
        for j in range(len(topics)):
            if i != j: #only compares different topics
                s = 1 - jaccard_score(topics[i],topics[j],average='macro') # to highlight when there is lots of intersection
                if s == 0:
                    s = 0.0001
                score = score * s
    return score

jaccard=0
for topics in range(5,30):
    ldamodels=[]
    ldajaccard=[]
    for i in range(23):
        ldamodels.append(LDA(corpus=doc_word_matrix_array[i], id2word=vocab, num_topics=topics))
        ldajaccard.append(get_inverse_Jaccard(extract_Words(ldamodels[i],topics),topics))
    temp_jaccard = statistics.mean(ldajaccard)
    #if the new model works better than previous ones then save its parameters
    if temp_jaccard > jaccard:
        jaccard = temp_jaccard
        models = ldamodels
    print(jaccard)

0.7570220823758371
0.7570220823758371
0.7570220823758371
0.7570220823758371
0.7570220823758371
0.7570220823758371
0.7570220823758371
0.7570220823758371
0.7570220823758371
0.7570220823758371


KeyboardInterrupt: 

Here we could have created our own metric that would have looked at either combining a jaccard similarity with the probabilities associated with each word or combining perplexity / coherence with a gauge for the size of the topics. However, this was time constrained. 