**Mounting the google drive to colab session**

In [None]:
# Mount Google Drive so the dataset stored there is readable from this Colab session
from google.colab import drive

GDRIVE_MOUNT_POINT = "/content/gdrive"
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


**Installing the visualization module**

In [None]:
!pip install pyLDAvis

**Importing the necessary modules**

In [None]:
#Importing necessary libraries
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import numpy as np
import nltk
nltk.download('wordnet')
import pandas as pd
import json
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
pip install pandas --upgrade

**Importing data**

In [None]:
# Read the JSON dataset from the mounted Drive into a DataFrame and display it
dataset_path = '/content/gdrive/My Drive/cran_d.json'
data = pd.read_json(dataset_path)
data

Unnamed: 0,id,author,bibliography,body,title
0,1,"brenckman,m.","j. ae. scs. 25, 1958, 324.",experimental investigation of the aerodynamics...,experimental investigation of the aerodynamics...
1,2,ting-yili,"department of aeronautical engineering, rensse...",simple shear flow past a flat plate in an inco...,simple shear flow past a flat plate in an inco...
2,3,m. b. glauert,"department of mathematics, university of manch...",the boundary layer in simple shear flow past a...,the boundary layer in simple shear flow past a...
3,4,"yen,k.t.","j. ae. scs. 22, 1955, 728.",approximate solutions of the incompressible la...,approximate solutions of the incompressible la...
4,5,"wasserman,b.","j. ae. scs. 24, 1957, 924.",one-dimensional transient heat conduction into...,one-dimensional transient heat conduction into...
...,...,...,...,...,...
1395,1396,"cook,i.t. and rockey,k.c.","aero. quart. 13, 1962, 41.",shear buckling of clamped and simply-supported...,shear buckling of clamped and simply-supported...
1396,1397,"stein,m. and fralich,r.w.","naca tn.1851, 1949.",critical shear stress of an infinitely long si...,critical shear stress of an infinitely long si...
1397,1398,"way,s.","j. app. mech. 3, 1936, a131.",stability of rectangular plates under shear an...,stability of rectangular plates under shear an...
1398,1399,"wang,t.k.","j.app.mech. 3,1947, a269.",buckling of transverse stiffened plates under ...,buckling of transverse stiffened plates under ...


**Cleaning the data by removing unnecessary columns**

In [None]:
# DataFrame.drop returns a new frame; without the assignment `data` would keep
# the columns this step is meant to remove.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
data = data.drop(columns=['author', 'bibliography', 'title'], errors='ignore')
data

Unnamed: 0,id,body
0,1,experimental investigation of the aerodynamics...
1,2,simple shear flow past a flat plate in an inco...
2,3,the boundary layer in simple shear flow past a...
3,4,approximate solutions of the incompressible la...
4,5,one-dimensional transient heat conduction into...
...,...,...
1395,1396,shear buckling of clamped and simply-supported...
1396,1397,critical shear stress of an infinitely long si...
1397,1398,stability of rectangular plates under shear an...
1398,1399,buckling of transverse stiffened plates under ...


**Data Preprocessing (Stemming, Lemmatizing and Tokenizing)**

In [None]:
#Data Preprocessing
stemmer = SnowballStemmer(language='english')
# Built once and reused; previously a new WordNetLemmatizer was constructed per token.
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then reduce it to its Snowball stem.

    Args:
        text: one token (a single word).

    Returns:
        The stemmed form of the verb-lemmatized token.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize a document and normalize every token that is kept.

    Tokens come from gensim's ``simple_preprocess``. A token is kept only if
    it is not in gensim's ``STOPWORDS`` set and is longer than 3 characters;
    each kept token is passed through ``lemmatize_stemming``.

    Args:
        text: raw document text.

    Returns:
        List of normalized tokens, in document order.
    """
    return [
        lemmatize_stemming(token)
        for token in simple_preprocess(text)
        if token not in STOPWORDS and len(token) > 3
    ]

In [None]:
# Run the preprocessing pipeline over every abstract body
body_texts = data['body']
processed_data = body_texts.map(preprocess)
processed_data

0       [experiment, investig, aerodynam, wing, slipst...
1       [simpl, shear, flow, past, flat, plate, incomp...
2       [boundari, layer, simpl, shear, flow, past, fl...
3       [approxim, solut, incompress, laminar, boundar...
4       [dimension, transient, heat, conduct, doubl, l...
                              ...                        
1395    [shear, buckl, clamp, simpli, support, infinit...
1396    [critic, shear, stress, infinit, long, simpli,...
1397    [stabil, rectangular, plat, shear, bend, forc,...
1398    [buckl, transvers, stiffen, plat, shear, paper...
1399    [buckl, shear, stress, simpli, support, infini...
Name: body, Length: 1400, dtype: object

**Creating Dictionary and Corpus**

In [None]:
# Build the token-id dictionary and the bag-of-words corpus from the processed abstracts
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(tokens) for tokens in processed_data]

# Bag-of-words preview: token id, token text and count for the document at index 1399
corpus_1399 = corpus[1399]
for word_id, word_count in corpus_1399:
    message = "Word {} (\"{}\") appears {} time.".format(
        word_id, dictionary[word_id], word_count
    )
    print(message)

Word 14 ("effect") appears 1 time.
Word 26 ("investig") appears 1 time.
Word 35 ("ratio") appears 1 time.
Word 47 ("support") appears 2 time.
Word 48 ("theoret") appears 1 time.
Word 67 ("flat") appears 1 time.
Word 84 ("plate") appears 1 time.
Word 92 ("shear") appears 3 time.
Word 100 ("treat") appears 1 time.
Word 119 ("uniform") appears 1 time.
Word 184 ("number") appears 1 time.
Word 194 ("size") appears 1 time.
Word 195 ("space") appears 2 time.
Word 230 ("rang") appears 1 time.
Word 241 ("complet") appears 1 time.
Word 250 ("extens") appears 1 time.
Word 274 ("transvers") appears 2 time.
Word 338 ("calcul") appears 1 time.
Word 355 ("stress") appears 2 time.
Word 366 ("bend") appears 1 time.
Word 377 ("elast") appears 1 time.
Word 387 ("form") appears 1 time.
Word 417 ("panel") appears 2 time.
Word 437 ("torsion") appears 1 time.
Word 445 ("buckl") appears 3 time.
Word 471 ("report") appears 1 time.
Word 541 ("infinit") appears 2 time.
Word 670 ("long") appears 3 time.
Word 680 

In [None]:
import pickle

# Persist the bag-of-words corpus; the context manager guarantees the file is
# flushed and closed (the bare open() handle was never closed before).
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)

# Persist the dictionary in gensim's own format
dictionary.save('dict.gensim')



In [None]:
import warnings

# Suppress every warning for the rest of the session (keeps training output readable)
warnings.filterwarnings(action="ignore")

**Building the LDA Model with Number of Topics (n) = 10 with number of passes p=5**

In [None]:
NUM_TOPICS = 10
# LDA training is stochastic; a fixed random_state makes the topics and the
# metrics below reproducible, so models differ only by the setting under study.
ldap5 = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=5,
    random_state=42,
)
ldap5.save('model1p5.gensim')

# Show the 4 highest-weighted words of each topic
topics = ldap5.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.029*"flow" + 0.026*"solut" + 0.019*"boundari" + 0.016*"problem"')
(1, '0.011*"wing" + 0.010*"give" + 0.009*"atmospher" + 0.009*"test"')
(2, '0.018*"theori" + 0.018*"buckl" + 0.017*"equat" + 0.017*"shell"')
(3, '0.027*"flow" + 0.024*"pressur" + 0.017*"shock" + 0.013*"number"')
(4, '0.041*"heat" + 0.029*"transfer" + 0.024*"layer" + 0.022*"boundari"')
(5, '0.025*"panel" + 0.011*"compressor" + 0.010*"speed" + 0.009*"ratio"')
(6, '0.015*"heat" + 0.014*"time" + 0.014*"temperatur" + 0.011*"flow"')
(7, '0.023*"number" + 0.019*"flow" + 0.019*"layer" + 0.019*"boundari"')
(8, '0.021*"pressur" + 0.019*"bodi" + 0.018*"cylind" + 0.014*"result"')
(9, '0.027*"wing" + 0.027*"lift" + 0.020*"flow" + 0.019*"drag"')


**Evaluating the metrics**

In [None]:
# gensim's log_perplexity reports a per-word likelihood bound on the given corpus
print('\nPerplexity: ', ldap5.log_perplexity(corpus))

# c_v topic coherence of the p=5 model, computed against the tokenized abstracts
coherence_ldamodel = CoherenceModel(
    model=ldap5,
    texts=processed_data,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_ldamodel.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.683206133780252

Coherence Score:  0.33245680264970917


**Building the LDA Model with Number of Topics (n) = 10 with number of passes p=10**

In [None]:
NUM_TOPICS = 10
# LDA training is stochastic; a fixed random_state makes the topics and the
# metrics below reproducible, so models differ only by the setting under study.
ldap10 = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=10,
    random_state=42,
)
ldap10.save('model1p10.gensim')

# Show the 4 highest-weighted words of each topic
topics = ldap10.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.049*"wing" + 0.018*"flow" + 0.017*"lift" + 0.014*"ratio"')
(1, '0.017*"field" + 0.017*"heat" + 0.014*"effect" + 0.013*"flow"')
(2, '0.041*"layer" + 0.037*"boundari" + 0.026*"flow" + 0.018*"heat"')
(3, '0.034*"cylind" + 0.022*"flow" + 0.014*"equilibrium" + 0.012*"present"')
(4, '0.018*"flow" + 0.015*"heat" + 0.013*"number" + 0.012*"compressor"')
(5, '0.024*"buckl" + 0.021*"stress" + 0.017*"load" + 0.015*"shell"')
(6, '0.032*"solut" + 0.027*"equat" + 0.022*"flow" + 0.018*"problem"')
(7, '0.021*"pressur" + 0.016*"frequenc" + 0.015*"measur" + 0.014*"nois"')
(8, '0.032*"shock" + 0.022*"flow" + 0.018*"bodi" + 0.016*"mach"')
(9, '0.027*"angl" + 0.026*"pressur" + 0.025*"number" + 0.023*"test"')


In [None]:
# gensim's log_perplexity reports a per-word likelihood bound on the given corpus
print('\nPerplexity: ', ldap10.log_perplexity(corpus))

# c_v topic coherence of the p=10 model, computed against the tokenized abstracts
coherence_ldamodel1 = CoherenceModel(
    model=ldap10,
    texts=processed_data,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda1 = coherence_ldamodel1.get_coherence()
print('\nCoherence Score: ', coherence_lda1)


Perplexity:  -6.623708570284236

Coherence Score:  0.3933318437000591


**Building the LDA Model with Number of Topics (n) = 10 with number of passes p=15**

In [None]:
NUM_TOPICS = 10
# LDA training is stochastic; a fixed random_state makes the topics and the
# metrics below reproducible, so models differ only by the setting under study.
ldap15 = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldap15.save('model1p15.gensim')

# Show the 4 highest-weighted words of each topic
topics = ldap15.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.032*"layer" + 0.031*"boundari" + 0.030*"heat" + 0.023*"temperatur"')
(1, '0.030*"flow" + 0.021*"field" + 0.018*"veloc" + 0.016*"fluid"')
(2, '0.023*"flow" + 0.014*"shock" + 0.012*"equilibrium" + 0.009*"wave"')
(3, '0.029*"buckl" + 0.024*"shell" + 0.017*"cylind" + 0.015*"stress"')
(4, '0.051*"flow" + 0.026*"layer" + 0.025*"pressur" + 0.023*"boundari"')
(5, '0.043*"bodi" + 0.029*"shock" + 0.024*"flow" + 0.017*"hyperson"')
(6, '0.022*"wing" + 0.020*"method" + 0.017*"theori" + 0.014*"solut"')
(7, '0.027*"number" + 0.024*"pressur" + 0.023*"mach" + 0.021*"wing"')
(8, '0.017*"solut" + 0.015*"boundari" + 0.014*"veloc" + 0.012*"equat"')
(9, '0.027*"number" + 0.026*"flow" + 0.016*"reynold" + 0.015*"cylind"')


In [None]:
# gensim's log_perplexity reports a per-word likelihood bound on the given corpus
print('\nPerplexity: ', ldap15.log_perplexity(corpus))

# c_v topic coherence of the p=15 model, computed against the tokenized abstracts
coherence_ldamodel2 = CoherenceModel(
    model=ldap15,
    texts=processed_data,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda2 = coherence_ldamodel2.get_coherence()
print('\nCoherence Score: ', coherence_lda2)


Perplexity:  -6.6140032257092525

Coherence Score:  0.4111690930450142


**Visualization of the model performance**

In [None]:
# Render the interactive pyLDAvis view of the p=15 model inline in the notebook
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    ldap15,
    corpus,
    dictionary,
)
vis

**Building the LDA Model with Number of Topics (n) = 10 with number of passes p=20**

In [None]:
NUM_TOPICS = 10
# LDA training is stochastic; a fixed random_state makes the topics and the
# metrics below reproducible, so models differ only by the setting under study.
ldap20 = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=20,
    random_state=42,
)
ldap20.save('model1p20.gensim')

# Show the 4 highest-weighted words of each topic
topics = ldap20.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.025*"pressur" + 0.024*"number" + 0.023*"flow" + 0.020*"layer"')
(1, '0.050*"flow" + 0.018*"veloc" + 0.016*"field" + 0.015*"case"')
(2, '0.014*"problem" + 0.011*"structur" + 0.011*"discuss" + 0.010*"speed"')
(3, '0.038*"grind" + 0.036*"flap" + 0.031*"thrust" + 0.021*"vehicl"')
(4, '0.028*"buckl" + 0.025*"stress" + 0.023*"load" + 0.018*"shell"')
(5, '0.040*"equat" + 0.034*"solut" + 0.027*"method" + 0.026*"problem"')
(6, '0.026*"number" + 0.020*"heat" + 0.016*"mach" + 0.015*"angl"')
(7, '0.084*"heat" + 0.062*"temperatur" + 0.028*"thermal" + 0.025*"atmospher"')
(8, '0.032*"flow" + 0.019*"boundari" + 0.019*"layer" + 0.018*"solut"')
(9, '0.043*"wing" + 0.020*"lift" + 0.015*"bodi" + 0.014*"ratio"')


In [None]:
# gensim's log_perplexity reports a per-word likelihood bound on the given corpus
print('\nPerplexity: ', ldap20.log_perplexity(corpus))

# c_v topic coherence of the p=20 model, computed against the tokenized abstracts
coherence_ldamodel3 = CoherenceModel(
    model=ldap20,
    texts=processed_data,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda3 = coherence_ldamodel3.get_coherence()
print('\nCoherence Score: ', coherence_lda3)


Perplexity:  -6.5888907077254775

Coherence Score:  0.40826038708234097


**Setting passes p = 15 for all the rest of the models**

**Building the LDA Model with Number of Topics (n) = 12**

In [None]:
NUM_TOPICS = 12
# LDA training is stochastic; a fixed random_state makes the topics and the
# metrics below reproducible, so models differ only by the setting under study.
lda2 = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
lda2.save('model2.gensim')

# Show the 4 highest-weighted words of each topic
topics = lda2.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.051*"shock" + 0.030*"wave" + 0.029*"flow" + 0.019*"number"')
(1, '0.043*"wing" + 0.019*"number" + 0.017*"mach" + 0.016*"lift"')
(2, '0.025*"panel" + 0.016*"flutter" + 0.015*"speed" + 0.015*"propel"')
(3, '0.047*"pressur" + 0.036*"angl" + 0.030*"number" + 0.021*"mach"')
(4, '0.043*"layer" + 0.038*"boundari" + 0.029*"flow" + 0.018*"heat"')
(5, '0.035*"solut" + 0.031*"equat" + 0.024*"method" + 0.021*"flow"')
(6, '0.044*"flow" + 0.031*"nozzl" + 0.024*"field" + 0.020*"magnet"')
(7, '0.043*"transit" + 0.034*"rough" + 0.029*"number" + 0.022*"reynold"')
(8, '0.033*"heat" + 0.019*"flow" + 0.017*"temperatur" + 0.012*"equilibrium"')
(9, '0.029*"buckl" + 0.025*"shell" + 0.023*"load" + 0.022*"stress"')
(10, '0.060*"bodi" + 0.018*"theori" + 0.016*"flow" + 0.015*"shock"')
(11, '0.021*"structur" + 0.017*"temperatur" + 0.012*"heat" + 0.012*"stress"')


In [None]:
#Computing Perplexity
# gensim's log_perplexity reports a per-word likelihood bound on the given corpus
print('\nPerplexity: ', lda2.log_perplexity(corpus))

#Computing Coherence Score
coherence_lda2 = CoherenceModel(model=lda2, texts=processed_data, dictionary=dictionary, coherence='c_v')
# Score the CoherenceModel built for lda2 on the line above. The previous code
# called coherence_ldamodel2.get_coherence(), which belongs to the p=15 /
# 10-topic model, so the 12-topic model's coherence was never reported.
coherence_lda21 = coherence_lda2.get_coherence()
print('\nCoherence Score: ', coherence_lda21)


Perplexity:  -6.584905264760732

Coherence Score:  0.4111690930450142


In [None]:
# Render the interactive pyLDAvis view of the 12-topic model inline in the notebook
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    lda2,
    corpus,
    dictionary,
)
vis

**Building the LDA Model with Number of Topics (n) = 14**

In [None]:
NUM_TOPICS = 14
# LDA training is stochastic; a fixed random_state makes the topics and the
# metrics below reproducible, so models differ only by the setting under study.
lda3 = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
lda3.save('model3.gensim')

# Show the 4 highest-weighted words of each topic
topics = lda3.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.028*"flow" + 0.015*"pressur" + 0.014*"wake" + 0.012*"compress"')
(1, '0.036*"nozzl" + 0.035*"pressur" + 0.027*"flow" + 0.014*"base"')
(2, '0.043*"wing" + 0.018*"number" + 0.017*"effect" + 0.016*"lift"')
(3, '0.034*"field" + 0.026*"flow" + 0.022*"magnet" + 0.018*"conduct"')
(4, '0.029*"solut" + 0.022*"method" + 0.022*"equat" + 0.016*"flow"')
(5, '0.019*"flow" + 0.014*"atmospher" + 0.013*"flight" + 0.012*"densiti"')
(6, '0.021*"stabil" + 0.021*"control" + 0.020*"wing" + 0.017*"valu"')
(7, '0.042*"layer" + 0.039*"boundari" + 0.024*"number" + 0.023*"flow"')
(8, '0.036*"shock" + 0.032*"bodi" + 0.026*"pressur" + 0.025*"flow"')
(9, '0.040*"buckl" + 0.036*"shell" + 0.027*"load" + 0.024*"cylind"')
(10, '0.041*"panel" + 0.021*"wave" + 0.019*"thermal" + 0.018*"stress"')
(11, '0.021*"structur" + 0.016*"turbul" + 0.015*"nois" + 0.012*"give"')
(12, '0.026*"flow" + 0.015*"pressur" + 0.013*"theori" + 0.012*"surfac"')
(13, '0.025*"edg" + 0.020*"lead" + 0.019*"lift" + 0.018*"heat"')


In [None]:
# gensim's log_perplexity reports a per-word likelihood bound on the given corpus
print('\nPerplexity: ', lda3.log_perplexity(corpus))

# c_v topic coherence of the 14-topic model, computed against the tokenized abstracts
coherence_lda3 = CoherenceModel(
    model=lda3,
    texts=processed_data,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda31 = coherence_lda3.get_coherence()
print('\nCoherence Score: ', coherence_lda31)


Perplexity:  -6.609835571948442

Coherence Score:  0.3776217977843431


In [None]:
# Render the interactive pyLDAvis view of the 14-topic model inline in the notebook
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    lda3,
    corpus,
    dictionary,
)
vis

**Building the LDA Model with Number of Topics (n) = 16**

In [None]:
NUM_TOPICS = 16
# LDA training is stochastic; a fixed random_state makes the topics and the
# metrics below reproducible, so models differ only by the setting under study.
lda4 = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
lda4.save('model4.gensim')

# Show the 4 highest-weighted words of each topic
topics = lda4.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.047*"flow" + 0.022*"method" + 0.020*"solut" + 0.017*"field"')
(1, '0.025*"theori" + 0.025*"flow" + 0.022*"function" + 0.018*"number"')
(2, '0.041*"shock" + 0.021*"number" + 0.018*"flow" + 0.018*"wave"')
(3, '0.020*"load" + 0.020*"plate" + 0.019*"plat" + 0.019*"edg"')
(4, '0.054*"flow" + 0.047*"edg" + 0.038*"lead" + 0.016*"hyperson"')
(5, '0.058*"bodi" + 0.020*"wing" + 0.014*"drag" + 0.013*"shape"')
(6, '0.019*"effect" + 0.015*"measur" + 0.013*"wing" + 0.011*"flight"')
(7, '0.043*"cylind" + 0.032*"buckl" + 0.026*"shell" + 0.022*"pressur"')
(8, '0.049*"wing" + 0.018*"ratio" + 0.017*"number" + 0.016*"pressur"')
(9, '0.026*"rough" + 0.021*"effect" + 0.019*"field" + 0.017*"magnet"')
(10, '0.038*"pressur" + 0.030*"number" + 0.026*"bodi" + 0.026*"mach"')
(11, '0.039*"boundari" + 0.037*"layer" + 0.026*"flow" + 0.019*"solut"')
(12, '0.034*"flutter" + 0.029*"panel" + 0.023*"stress" + 0.017*"thermal"')
(13, '0.072*"heat" + 0.036*"transfer" + 0.014*"rat" + 0.012*"ablat"')
(14, '0.042*"lift"

In [None]:
# gensim's log_perplexity reports a per-word likelihood bound on the given corpus
print('\nPerplexity: ', lda4.log_perplexity(corpus))

# c_v topic coherence of the 16-topic model, computed against the tokenized abstracts
coherence_lda4 = CoherenceModel(
    model=lda4,
    texts=processed_data,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda41 = coherence_lda4.get_coherence()
print('\nCoherence Score: ', coherence_lda41)


Perplexity:  -6.622182868880977

Coherence Score:  0.406997349145575


In [None]:
# Render the interactive pyLDAvis view of the 16-topic model inline in the notebook
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    lda4,
    corpus,
    dictionary,
)
vis

**Building the LDA Model with Number of Topics (n) = 18**

In [None]:
NUM_TOPICS = 18
# LDA training is stochastic; a fixed random_state makes the topics and the
# metrics below reproducible, so models differ only by the setting under study.
lda5 = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
lda5.save('model5.gensim')

# Show the 4 highest-weighted words of each topic
topics = lda5.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.052*"layer" + 0.051*"boundari" + 0.029*"wall" + 0.024*"flow"')
(1, '0.056*"flutter" + 0.037*"panel" + 0.018*"ratio" + 0.018*"test"')
(2, '0.032*"solut" + 0.028*"equat" + 0.021*"flow" + 0.014*"approxim"')
(3, '0.021*"edg" + 0.015*"temperatur" + 0.012*"flow" + 0.012*"thermal"')
(4, '0.051*"transit" + 0.029*"rough" + 0.025*"number" + 0.024*"boundari"')
(5, '0.034*"load" + 0.028*"stress" + 0.026*"buckl" + 0.020*"cylind"')
(6, '0.012*"expans" + 0.011*"nose" + 0.009*"piston" + 0.009*"conic"')
(7, '0.055*"flow" + 0.018*"fluid" + 0.018*"field" + 0.014*"theori"')
(8, '0.072*"heat" + 0.044*"transfer" + 0.037*"temperatur" + 0.019*"solut"')
(9, '0.025*"flow" + 0.018*"shock" + 0.018*"layer" + 0.018*"heat"')
(10, '0.030*"bodi" + 0.024*"flow" + 0.023*"blade" + 0.022*"compressor"')
(11, '0.038*"wing" + 0.029*"lift" + 0.021*"bodi" + 0.017*"theori"')
(12, '0.020*"wing" + 0.019*"pressur" + 0.017*"effect" + 0.016*"propel"')
(13, '0.052*"skin" + 0.042*"friction" + 0.038*"boundari" + 0.034*"layer"')


In [None]:
# gensim's log_perplexity reports a per-word likelihood bound on the given corpus
print('\nPerplexity: ', lda5.log_perplexity(corpus))

# c_v topic coherence of the 18-topic model, computed against the tokenized abstracts
coherence_lda5 = CoherenceModel(
    model=lda5,
    texts=processed_data,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda51 = coherence_lda5.get_coherence()
print('\nCoherence Score: ', coherence_lda51)


Perplexity:  -6.643973885949119

Coherence Score:  0.40292039413112224


In [None]:
# Render the interactive pyLDAvis view of the 18-topic model inline in the notebook
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    lda5,
    corpus,
    dictionary,
)
vis

**Evaluating the Model performance on the unseen data using the optimal model with n=12**

In [None]:
# Score an unseen headline against the 12-topic model, strongest topic first
unseen_data1 = "Aerodynamic interaction between propellers of a distributed-propulsion system in forward flight"
corpus1 = dictionary.doc2bow(preprocess(unseen_data1))
for index, score in sorted(
    lda2[corpus1],
    key=lambda topic_score: topic_score[1],
    reverse=True,
):
    print("Score: {}\t Topic: {}".format(score, lda2.print_topic(index, 2)))

Score: 0.5116443634033203	 Topic: 0.025*"panel" + 0.016*"flutter"
Score: 0.21517202258110046	 Topic: 0.043*"layer" + 0.038*"boundari"
Score: 0.17942878603935242	 Topic: 0.044*"flow" + 0.031*"nozzl"
Score: 0.010417656973004341	 Topic: 0.043*"wing" + 0.019*"number"
Score: 0.010417534038424492	 Topic: 0.047*"pressur" + 0.036*"angl"
Score: 0.010417312383651733	 Topic: 0.043*"transit" + 0.034*"rough"
Score: 0.010417270474135876	 Topic: 0.060*"bodi" + 0.018*"theori"
Score: 0.010417129844427109	 Topic: 0.021*"structur" + 0.017*"temperatur"
Score: 0.010417125187814236	 Topic: 0.051*"shock" + 0.030*"wave"
Score: 0.010417002253234386	 Topic: 0.033*"heat" + 0.019*"flow"
Score: 0.010416986420750618	 Topic: 0.035*"solut" + 0.031*"equat"
Score: 0.010416765697300434	 Topic: 0.029*"buckl" + 0.025*"shell"


In [None]:
# Score an unseen headline against the 12-topic model, strongest topic first
unseen_data2 = "No One Can Explain Why Planes Stay in the Air"
corpus2 = dictionary.doc2bow(preprocess(unseen_data2))
for index, score in sorted(
    lda2[corpus2],
    key=lambda topic_score: topic_score[1],
    reverse=True,
):
    print("Score: {}\t Topic: {}".format(score, lda2.print_topic(index, 2)))

Score: 0.6944403052330017	 Topic: 0.043*"wing" + 0.019*"number"
Score: 0.027779676020145416	 Topic: 0.029*"buckl" + 0.025*"shell"
Score: 0.02777908556163311	 Topic: 0.043*"layer" + 0.038*"boundari"
Score: 0.027778200805187225	 Topic: 0.025*"panel" + 0.016*"flutter"
Score: 0.027778001502156258	 Topic: 0.033*"heat" + 0.019*"flow"
Score: 0.027777977287769318	 Topic: 0.043*"transit" + 0.034*"rough"
Score: 0.02777782827615738	 Topic: 0.035*"solut" + 0.031*"equat"
Score: 0.027777789160609245	 Topic: 0.060*"bodi" + 0.018*"theori"
Score: 0.0277777761220932	 Topic: 0.051*"shock" + 0.030*"wave"
Score: 0.0277777761220932	 Topic: 0.047*"pressur" + 0.036*"angl"
Score: 0.0277777761220932	 Topic: 0.044*"flow" + 0.031*"nozzl"
Score: 0.0277777761220932	 Topic: 0.021*"structur" + 0.017*"temperatur"


In [None]:
# Score an unseen headline against the 12-topic model, strongest topic first
unseen_data3 = "Turbulence Model Could Help Design Aircraft Capable of Handling Extreme Scenarios"
corpus3 = dictionary.doc2bow(preprocess(unseen_data3))
for index, score in sorted(
    lda2[corpus3],
    key=lambda topic_score: topic_score[1],
    reverse=True,
):
    print("Score: {}\t Topic: {}".format(score, lda2.print_topic(index, 2)))

Score: 0.7481446862220764	 Topic: 0.021*"structur" + 0.017*"temperatur"
Score: 0.1592579334974289	 Topic: 0.029*"buckl" + 0.025*"shell"


In [None]:
# Score an unseen headline against the 12-topic model, strongest topic first
unseen_data4 = "Zero emissions hydrogen plane test was part powered by fossil fuels"
corpus4 = dictionary.doc2bow(preprocess(unseen_data4))
for index, score in sorted(
    lda2[corpus4],
    key=lambda topic_score: topic_score[1],
    reverse=True,
):
    print("Score: {}\t Topic: {}".format(score, lda2.print_topic(index, 2)))

Score: 0.532447874546051	 Topic: 0.025*"panel" + 0.016*"flutter"
Score: 0.23353157937526703	 Topic: 0.051*"shock" + 0.030*"wave"
Score: 0.14026734232902527	 Topic: 0.033*"heat" + 0.019*"flow"
Score: 0.010417166166007519	 Topic: 0.060*"bodi" + 0.018*"theori"
Score: 0.010417144745588303	 Topic: 0.047*"pressur" + 0.036*"angl"
Score: 0.010417127050459385	 Topic: 0.043*"layer" + 0.038*"boundari"
Score: 0.010417040437459946	 Topic: 0.035*"solut" + 0.031*"equat"
Score: 0.01041701901704073	 Topic: 0.044*"flow" + 0.031*"nozzl"
Score: 0.010417011566460133	 Topic: 0.043*"wing" + 0.019*"number"
Score: 0.010416969656944275	 Topic: 0.021*"structur" + 0.017*"temperatur"
Score: 0.010416954755783081	 Topic: 0.043*"transit" + 0.034*"rough"
Score: 0.010416810400784016	 Topic: 0.029*"buckl" + 0.025*"shell"
