In [88]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import pyLDAvis.sklearn
import re
import sys
import wikipedia

print("Ok")

Ok


In [89]:
def get_topic(topic):
    pages = wikipedia.search(topic, results = 50, suggestion = True)[0]
    documents = []
    document_names = []
    for i in pages:
        print(i)
        try:
            page = wikipedia.page(i)
            documents.append(page.content)
            document_names.append(i)
        except Exception as e:
            print(e, file=sys.stderr)
    return documents, document_names

In [90]:

def clean_text(text):
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [t for t in tokenized_text if t not in stopwords.words('english') and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
    return cleaned_text


def stem_text(text):
    ps = PorterStemmer()
    ret = []
    for word in text:
        ret.append(ps.stem(word))
    return ret


def join_text(text):
    return " ".join(text)

def get_documents_for_topic(topic):
    docs, doc_names = get_topic(topic)
    docs = tuple(map(clean_text, docs))
    docs = tuple(map(stem_text, docs))
    docs = tuple(map(join_text, docs))
    return docs, doc_names

  cleaned_text = [t for t in tokenized_text if t not in stopwords.words('english') and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]


In [91]:

docs, doc_names = get_documents_for_topic("Machine Learning in Natural Language Processing")

Natural language processing
History of natural language processing
Natural Language Toolkit
Natural-language understanding
Transformer (machine learning model)
Machine learning


Page id "machine ;earning" does not match any pages. Try another id!


Deep learning
ID3 algorithm
List of artificial intelligence projects
Zero-shot learning
Augmented Analytics


Page id "augmented analysis" does not match any pages. Try another id!


Natural language generation
BERT (language model)
Outline of natural language processing
Google Brain
Active learning (machine learning)
Wendy Lehnert
Stop word
ComplyAdvantage


Page id "comply advantage" does not match any pages. Try another id!


Federated learning
GPT-3


Page id "gtp 3" does not match any pages. Try another id!


Apache OpenNLP
Semantic decomposition (natural language processing)
Cognitive computing
Quantum machine learning
Self-supervised learning
Deep linguistic processing
Adversarial machine learning
List of datasets for machine-learning research
Hugging Face


Page id "huging face" does not match any pages. Try another id!


Language acquisition
Reinforcement learning
Empirical Methods in Natural Language Processing
Mohamed bin Zayed University of Artificial Intelligence
Machine translation
Recorded Future
Document processing
Outline of machine learning
Timeline of machine learning
Pachinko allocation
Language model
Language identification
GloVe
Grammar induction
Semantic analysis (machine learning)
Never-Ending Language Learning
Machine learning in bioinformatics
Gensim
List of programming languages for artificial intelligence
Word embedding


## Bag of words

In [92]:
def print_vectorizer_method(vectorizer, X, method):
    print(f"Method {method}")
    print("Shape: documents/words", X.shape)
    features_bag_of_words = vectorizer.get_feature_names_out()
    print(f"Features length is {len(features_bag_of_words)}:", features_bag_of_words)
    print('\n')

def create_bag_of_words_vectorizer(docs):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(docs)
    print_vectorizer_method(vectorizer, X, "Bag of Words")
    return vectorizer, X


def create_tf_idf_vectorizer(docs):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(docs)
    print_vectorizer_method(vectorizer, X, "TF IDF")
    return vectorizer, X


def create_vectorizer(docs, name):
    if name == "bag_of_words":
        return create_bag_of_words_vectorizer(docs)
    if name == "tf_idf":
        return create_tf_idf_vectorizer(docs)
    raise f"Not implemented, {name}"


create_vectorizer(docs, "bag_of_words")
create_vectorizer(docs, "tf_idf")
print("Done")

Method Bag of Words
Shape: documents/words (45, 6258)
Features length is 6258: ['0011526042365591' '0033' '009' ... 'zsl' 'zurada' 'zurich']


Method TF IDF
Shape: documents/words (45, 6258)
Features length is 6258: ['0011526042365591' '0033' '009' ... 'zsl' 'zurada' 'zurich']


Done


In [93]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print(f"Topic {idx}")
        print([(vectorizer.get_feature_names_out()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
    print('\n')

In [94]:
def create_model(n_components, X, model):
    if model == "SVD":
        model = TruncatedSVD(n_components=n_components)
    elif model == "NMF":
        model = NMF(n_components=n_components)
    else:
        raise f"{model} does not exist"

    return model, model.fit_transform(X)


In [95]:
NUM_TOPICS = 5

def create_and_print(docs, topics,method, factorizer):
    vectorizer, X = create_vectorizer(docs, method)
    model, fitted = create_model(topics, X, factorizer)
    print(fitted.shape)
    print(fitted[0])
    print_topics(model, vectorizer)
    return vectorizer, X, fitted


In [96]:
create_and_print(docs, NUM_TOPICS, "bag_of_words", "SVD")

Method Bag of Words
Shape: documents/words (45, 6258)
Features length is 6258: ['0011526042365591' '0033' '009' ... 'zsl' 'zurada' 'zurich']


(45, 5)
[131.58904347  50.21249044  11.57746624  -7.30621593   9.15464329]
Topic 0
[('learn', 0.4222048965871562), ('languag', 0.3932539096636138), ('use', 0.23133639891520447), ('machin', 0.19919767453965118), ('model', 0.19886750878350606), ('comput', 0.14687382141186475), ('data', 0.14414836971271344), ('process', 0.14378820542602858), ('translat', 0.14257049501101787), ('word', 0.12682523557777944)]
Topic 1
[('languag', 0.5925319415741599), ('translat', 0.19524263967564387), ('word', 0.16165770193574094), ('text', 0.12317378633057023), ('acquisit', 0.10212789434293139), ('linguist', 0.09977986007013427), ('children', 0.083183911958335), ('natural', 0.08203474077947023), ('human', 0.07964457767705851), ('grammar', 0.06830680594439101)]
Topic 2
[('translat', 0.6528689235919222), ('machin', 0.2926536538524599), ('text', 0.15122557074683055), ('

(CountVectorizer(),
 <45x6258 sparse matrix of type '<class 'numpy.int64'>'
 	with 21019 stored elements in Compressed Sparse Row format>,
 array([[ 1.31589043e+02,  5.02124904e+01,  1.15774662e+01,
         -7.30621593e+00,  9.15464329e+00],
        [ 2.83151646e+01,  5.84454584e+00,  8.79324618e+00,
          1.29349843e+00, -2.30918629e+00],
        [ 6.11141969e+00,  2.89424282e+00,  1.38962726e-01,
         -1.65392130e-02,  2.82769868e-01],
        [ 3.68931872e+01,  2.23143268e+01,  8.66568191e-01,
         -1.37133159e+00,  8.43436271e+00],
        [ 5.18839528e+01, -8.37679423e+00,  6.52931374e+00,
         -1.49806814e+01,  9.66174594e+00],
        [ 2.32236723e+02, -1.30566757e+02,  8.33935959e+00,
         -1.23471073e+02,  1.00967250e+02],
        [ 2.11410003e+01, -1.08371789e+01,  1.33208436e+00,
          2.65603180e+00, -1.36388495e+01],
        [ 3.42238519e+01, -1.79309685e+00,  2.56997391e+00,
         -4.10924607e+00,  4.82363703e+00],
        [ 2.58099639e+01, -5.

In [97]:

output_notebook()

  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


In [98]:

def plot(method, data):
    df = pd.DataFrame(columns=['x', 'y', 'index'])
    df['x'], df['y'], df['index'] = method[:,0], method[:,1], data

    source = ColumnDataSource(ColumnDataSource.from_df(df))

    labels = LabelSet(x="x", y="y", text="index", y_offset=8,
                      text_font_size="8pt", text_color="#555555",
                      source=source, text_align='center')

    plot = figure(plot_width=600, plot_height=600)
    plot.circle("x", "y", size=12, source=source, line_color="black",
                fill_alpha=0.8)
    plot.add_layout(labels)
    show(plot, notebook_handle=True)

def plot2(data, vectorizer):
    svd = TruncatedSVD(n_components=2)
    words_2d = svd.fit_transform(data.T)

    df = pd.DataFrame(columns=['x', 'y', 'word'])
    df['x'], df['y'], df['word'] = words_2d[:,0], words_2d[:,1], vectorizer.get_feature_names()

    source = ColumnDataSource(ColumnDataSource.from_df(df))
    labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                      text_font_size="8pt", text_color="#555555",
                      source=source, text_align='center')

    plot = figure(plot_width=600, plot_height=600)
    plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
    plot.add_layout(labels)
    show(plot, notebook_handle=True)


def create_and_print_and_plot(docs, doc_names, topics,method, factorizer):
    vectorizer, X, fitted = create_and_print(docs, topics, method, factorizer)
    plot(fitted, doc_names)
    plot2(X, vectorizer)

In [99]:
create_and_print_and_plot(docs, doc_names, NUM_TOPICS, "bag_of_words", "SVD")

Method Bag of Words
Shape: documents/words (45, 6258)
Features length is 6258: ['0011526042365591' '0033' '009' ... 'zsl' 'zurada' 'zurich']


(45, 5)
[131.58904357  50.21248875  11.57700339  -7.30131065   9.15523196]
Topic 0
[('learn', 0.4222048967034296), ('languag', 0.3932539093999371), ('use', 0.2313363987340108), ('machin', 0.19919767450077502), ('model', 0.19886750832953248), ('comput', 0.14687382136937505), ('data', 0.14414836976743642), ('process', 0.14378820564628397), ('translat', 0.1425704950241156), ('word', 0.12682523562315812)]
Topic 1
[('languag', 0.5925319272010949), ('translat', 0.19524263739955183), ('word', 0.16165775087175907), ('text', 0.1231737862891163), ('acquisit', 0.10212789825135099), ('linguist', 0.09977986495276488), ('children', 0.08318391514578767), ('natural', 0.08203474952871173), ('human', 0.07964457656090479), ('grammar', 0.06830679683212128)]
Topic 2
[('translat', 0.6528688112127259), ('machin', 0.2926537387122905), ('text', 0.15122641290902644), ('u

  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


In [100]:
create_and_print_and_plot(docs, doc_names, NUM_TOPICS, "tf_idf", "SVD")

Method TF IDF
Shape: documents/words (45, 6258)
Features length is 6258: ['0011526042365591' '0033' '009' ... 'zsl' 'zurada' 'zurich']


(45, 5)
[ 0.73431886 -0.27870096  0.06037902 -0.03885453 -0.10052946]
Topic 0
[('languag', 0.29533578941218463), ('learn', 0.2479156910669304), ('use', 0.1947819511058322), ('word', 0.15591079241546135), ('model', 0.1558312942490691), ('process', 0.13782834924608314), ('system', 0.13427071735294413), ('machin', 0.13422719327485988), ('data', 0.13243941684288993), ('translat', 0.12032628884257603)]
Topic 1
[('learn', 0.33865310662762965), ('data', 0.17407754285046964), ('algorithm', 0.15299129330721956), ('label', 0.13869022258461303), ('dataset', 0.1226947833371157), ('attack', 0.11469861234496774), ('feder', 0.1111661668741523), ('quantum', 0.1080553679835218), ('model', 0.10635025813323154), ('node', 0.10456195272339439)]
Topic 2
[('word', 0.35462772867851594), ('bert', 0.19917532568303947), ('embed', 0.19043844353892636), ('model', 0.17349223625793

  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


In [101]:
create_and_print_and_plot(docs, doc_names, NUM_TOPICS, "bag_of_words", "NMF")

Method Bag of Words
Shape: documents/words (45, 6258)
Features length is 6258: ['0011526042365591' '0033' '009' ... 'zsl' 'zurada' 'zurich']


(45, 5)
[0.33026179 2.50249587 2.29463152 0.06211282 0.92997122]
Topic 0
[('learn', 15.402628169872957), ('model', 8.076199277666367), ('data', 6.525116780339224), ('algorithm', 6.146797779494227), ('machin', 5.262857804371684), ('feder', 3.9528858156892785), ('use', 3.935595490456424), ('local', 3.7228213375796733), ('node', 3.3143672664698447), ('cluster', 2.9417397052856944)]
Topic 1
[('languag', 25.289826432394566), ('learn', 6.898022251252406), ('word', 6.684759391265246), ('acquisit', 5.555643961858254), ('process', 4.7121448636360315), ('children', 4.498684576669163), ('use', 4.38058973731216), ('speech', 3.938276215714694), ('human', 3.897272244715339), ('linguist', 3.612192091143066)]
Topic 2
[('translat', 14.898577450354693), ('machin', 7.408474072757111), ('languag', 5.811092361376236), ('use', 4.611602543029759), ('text', 4.420669831

  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


In [102]:
create_and_print_and_plot(docs, doc_names, NUM_TOPICS, "tf_idf", "NMF")


Method TF IDF
Shape: documents/words (45, 6258)
Features length is 6258: ['0011526042365591' '0033' '009' ... 'zsl' 'zurada' 'zurich']


(45, 5)
[0.43877869 0.05825742 0.10026808 0.01358966 0.0898332 ]
Topic 0
[('languag', 0.7186116074701042), ('translat', 0.3049536596207494), ('text', 0.28485633054893933), ('system', 0.25125666242115485), ('natural', 0.2511645658454375), ('grammar', 0.250908700005121), ('linguist', 0.2426940743537118), ('use', 0.2214565530316611), ('process', 0.21978335717013345), ('word', 0.17691633658967118)]
Topic 1
[('learn', 0.4935229593112749), ('data', 0.26210826713228247), ('algorithm', 0.23608072179498177), ('model', 0.18601746047253576), ('use', 0.17109856204670554), ('label', 0.1691799809641371), ('dataset', 0.16158446166249488), ('exampl', 0.14352350612182121), ('machin', 0.13932765633246671), ('node', 0.1327518728750316)]
Topic 2
[('word', 0.5129167342710286), ('bert', 0.3469464079219475), ('embed', 0.27001518059799867), ('model', 0.24099882479907514), ('

  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


In [103]:
def LDA(docs, n_components, method):
    vectorizer, X = create_vectorizer(docs, method)
    model = LatentDirichletAllocation(n_components=n_components, max_iter=10, learning_method='online')
    lda = model.fit_transform(X)
    return model, vectorizer, X


def create_panel(docs, n_components, method):
    model, vectorizer, X = LDA(docs, n_components, method)
    pyLDAvis.enable_notebook()
    panel = pyLDAvis.sklearn.prepare(model, X, vectorizer, mds='mmds')
    return panel


In [104]:

create_panel(docs, NUM_TOPICS, "bag_of_words")


Method Bag of Words
Shape: documents/words (45, 6258)
Features length is 6258: ['0011526042365591' '0033' '009' ... 'zsl' 'zurada' 'zurich']




  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [105]:
create_panel(docs, NUM_TOPICS, "tf_idf")

Method TF IDF
Shape: documents/words (45, 6258)
Features length is 6258: ['0011526042365591' '0033' '009' ... 'zsl' 'zurada' 'zurich']




  by='saliency', ascending=False).head(R).drop('saliency', 1)


We observe that 1:
* a
* b
* c


In [106]:
def all_in_one(topic):
    num_topics = 5
    docs, doc_names = get_documents_for_topic(topic)
    create_and_print_and_plot(docs, doc_names, num_topics, "bag_of_words", "SVD")
    create_and_print_and_plot(docs, doc_names, num_topics, "tf_idf", "SVD")
    create_and_print_and_plot(docs, doc_names, num_topics, "bag_of_words", "NMF")
    create_and_print_and_plot(docs, doc_names, num_topics, "tf_idf", "NMF")
    p1 = create_panel(docs, num_topics, "bag_of_words")
    p2 = create_panel(docs, num_topics, "tf_idf")
    return p1, p2  # panels must be evaluated to be shown in jupyter

In [107]:
p1, p2 = all_in_one("Machine Learning in Medicine")


Machine learning


Page id "machine ;earning" does not match any pages. Try another id!


Logic learning machine
Deep learning
List of datasets for machine-learning research
Federated learning
Personalized medicine
Machine learning in bioinformatics
Jordan Harrod
Ensemble learning
Fei-Fei Li
Causal inference
Suchi Saria
Artificial intelligence in healthcare
Learning


Page id "learnning" does not match any pages. Try another id!


Explainable artificial intelligence
Medicine
Rote learning
Applications of artificial intelligence
Data mining


Page id "data maining" does not match any pages. Try another id!


Artificial intelligence
Data augmentation
Mihaela van der Schaar
Edward Chang (neurosurgeon)
Rachel Thomas (academic)
In silico clinical trials
Bias–variance tradeoff
Google Neural Machine Translation
Gradient boosting
Artificial neural network
Convolutional neural network
VITAL (machine learning software)
Neural machine translation
Recurrent neural network
Elastic net regularization
S. Joshua Swamidass
List of Dr. Quinn, Medicine Woman episodes
Higher education


Page id "high education" does not match any pages. Try another id!


Alternative medicine
Genevera Allen
Learning classifier system
U-Net
Doctor of Medicine
Education
Association rule learning
Autodidacticism
Marzyeh Ghassemi
Residency (medicine)
Daniela Witten


Page id "daniella written" does not match any pages. Try another id!


Text nailing
List of programming languages for artificial intelligence
Method Bag of Words
Shape: documents/words (45, 7996)
Features length is 7996: ['00' '00010' '0003' ... 'zone' 'zones' 'zuckerberg']


(45, 5)
[  9.19332004 -10.04458341  -0.5449436   -0.38382612   1.49720193]
Topic 0
[('medic', 0.2926050862853539), ('medicin', 0.2855739901840487), ('year', 0.23385342286166524), ('use', 0.22343498495412997), ('learn', 0.21447008900011058), ('educ', 0.20202925588663626), ('train', 0.18185363987074588), ('degre', 0.1724505037616801), ('school', 0.13537054939163892), ('univers', 0.13210665374479863)]
Topic 1
[('medic', 0.30105273557957585), ('year', 0.25338031193703836), ('medicin', 0.23337102588867573), ('degre', 0.21083880271382047), ('doctor', 0.13924132743252898), ('resid', 0.13563481111287115), ('univers', 0.1169133643927762), ('school', 0.08751795918256965), ('student', 0.08707901983254564), ('specialti', 0.08426893686723283)]
Topic 2
[('educ', 0.7651726744077761), ('school', 0.2

  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


Method TF IDF
Shape: documents/words (45, 7996)
Features length is 7996: ['00' '00010' '0003' ... 'zone' 'zones' 'zuckerberg']


(45, 5)
[ 0.37544448 -0.10313653  0.06744983  0.15341639  0.10006324]
Topic 0
[('learn', 0.26041371424723153), ('use', 0.2567034311174599), ('network', 0.1954177591215769), ('data', 0.18108672687114113), ('model', 0.15820139846420236), ('neural', 0.13798764303665875), ('machin', 0.13621343384806253), ('algorithm', 0.12299432590518831), ('train', 0.12136042231039385), ('layer', 0.1209215493374308)]
Topic 1
[('educ', 0.22932349140752678), ('medicin', 0.20839462039506804), ('medic', 0.16200411901075906), ('univers', 0.14762160595147683), ('school', 0.14757364886138416), ('year', 0.1386713356415932), ('resid', 0.12753691260979427), ('degre', 0.12525768464052492), ('award', 0.11438356451686364), ('patient', 0.11010322197155388)]
Topic 2
[('patient', 0.15702500875199654), ('use', 0.14904517978261395), ('data', 0.14093764606784906), ('rule', 0.11691461237612133), ('

  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


Method Bag of Words
Shape: documents/words (45, 7996)
Features length is 7996: ['00' '00010' '0003' ... 'zone' 'zones' 'zuckerberg']


(45, 5)
[0.         0.41476838 0.03389231 0.01955733 0.        ]
Topic 0
[('medic', 15.418158100232638), ('degre', 13.159651438975793), ('year', 13.146340308544096), ('medicin', 10.008459889959509), ('doctor', 8.615477535334445), ('univers', 7.888003486887689), ('student', 5.859298881287612), ('school', 5.732696391576867), ('award', 4.43357646480892), ('train', 4.206712019227059)]
Topic 1
[('learn', 11.85273488163515), ('use', 10.146385157358695), ('network', 8.890149386704268), ('data', 6.048071707045842), ('model', 5.621233087132003), ('neural', 5.535339584153799), ('layer', 5.450830662629992), ('deep', 4.694759934113004), ('train', 4.4692621387383), ('algorithm', 3.970370365839597)]
Topic 2
[('educ', 33.50514870195303), ('school', 10.758962634623828), ('learn', 8.693409797959182), ('student', 6.042267672264628), ('develop', 5.300164843227252), ('form

  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


Method TF IDF
Shape: documents/words (45, 7996)
Features length is 7996: ['00' '00010' '0003' ... 'zone' 'zones' 'zuckerberg']






(45, 5)
[0.         0.         0.047357   0.36545668 0.01694479]
Topic 0
[('patient', 0.4135576366951046), ('medicin', 0.3434879758543704), ('use', 0.30831017768139923), ('treatment', 0.29093831144038046), ('diseas', 0.259252453797889), ('drug', 0.24406101572840716), ('trial', 0.23245251397904063), ('effect', 0.21095705933445807), ('therapi', 0.2080563570872546), ('clinic', 0.20659865610681255)]
Topic 1
[('educ', 0.48491263470392604), ('school', 0.28634004320366696), ('univers', 0.252647419017816), ('year', 0.24464699065141432), ('resid', 0.22708685363907044), ('student', 0.20560061053940046), ('degre', 0.2015800310278946), ('medic', 0.19406664561999507), ('graduat', 0.1829027861683966), ('medicin', 0.1592033163626284)]
Topic 2
[('network', 0.5661634947559896), ('layer', 0.4901321528686389), ('neural', 0.3186190294736602), ('neuron', 0.30112815744154103), ('convolut', 0.23051142982591055), ('imag', 0.23015958858242194), ('learn', 0.21994010930049077), ('weight', 0.2011254486669675), ('

  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


Method Bag of Words
Shape: documents/words (45, 7996)
Features length is 7996: ['00' '00010' '0003' ... 'zone' 'zones' 'zuckerberg']




  by='saliency', ascending=False).head(R).drop('saliency', 1)


Method TF IDF
Shape: documents/words (45, 7996)
Features length is 7996: ['00' '00010' '0003' ... 'zone' 'zones' 'zuckerberg']




  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [108]:
p1

In [109]:
p2

We observe that 2:
* a
* b
* c

In [110]:
p1, p2 = all_in_one("Shadows of the past")


A Shadow of the Past
The Shadow of the Past
Shadows of the Past
Shadows of the Past (album)
La sombra del pasado
Shadow of the Past
Sentenced
The Shadows
Shadows of the Past (1991 film)
Shadows of the Past (1922 film)


Page id "shadows of the past 1936 film" does not match any pages. Try another id!


Shadows of Forgotten Ancestors
Shadows of the Past (1936 film)
Frank Chase (screenwriter)
Erika Anderson
Stronger Than Ever (album)
Rings of Power
Christopher Paolini


Page id "christopher paola i" does not match any pages. Try another id!


Shadow Chancellor of the Exchequer
Dark Shadows
Politics of India
The Shadow of Her Past
Fátima Guedes
C tuning (guitar)
Lords of the Trident
List of The Hobbit characters
Abilene (film)




  lis = BeautifulSoup(html).find_all('li')
"Abilene" may refer to: 
Abilene, Kansas
Abilene, Texas
Abilene, Texas metropolitan area
Abilene, Virginia
Abilene State Park
Abilene Trail
Abilene (biblical)
Abilene, Alberta
Abilene Town
Gunfighters of Abilene
Gunfight in Abilene
Abilene (film)
Operation Abilene (1966)
Operation Abilene (2003)
USS Abilene (PF-58)
Abilene Trophy
Abilene Christian University
Abilene Network
Abilene paradox
"Abilene" (song)
Abilene and Smoky Valley Railroad
Abilene and Southern Railway


Vala (Middle-earth)
The Lord of the Rings: The Rings of Power (soundtrack)
Disengagement


"Disengagement" may refer to: 
Apathy
Disengagement theory
Moral disengagement
Religious disengagement
Social disengagement
Disengagement (military)
Disengagement (engineering)
Disengagement (politics)
Israeli disengagement plan (disambiguation)
Israeli disengagement from Gaza
Superpower disengagement
Disengagement (film)
Shadows of the Past
Engagement (disambiguation)
Engage (disambiguation)
Engaged (disambiguation)
Disengage (disambiguation)
All pages with titles beginning with Disengagement
All pages with titles containing Disengagement


The Virginian (TV series)
Sauron
Shadow Star
Lincoln County Regulators
One Ring
The Lord of the Rings: The Rings of Power
The Lord of the Rings Online
Batman: The Animated Series
Universal's Halloween Horror Nights
Ent


"EN" may refer to: 
Bouygues
Esquimalt and Nanaimo Railway
Euronews
N
EN (cuneiform)
En (Cyrillic)
En (digraph)
En (typography)
en dash
En language
English language
Eastern National
English Nature
Envirolink Northwest
En (deity)
Engineer
En (Lie algebra)
EN standards
Electroless nickel plating
Electronegativity
Engrailed (gene)
Erythema nodosum
Ethylenediamine
newtons
Endangered species
EuroNight
Yen
Empty net goal
licensed practical nurse
English Wikipedia
Air Dolomiti
En Esch
song by Arca


Silent Hill: Homecoming
Shadows of Mordor
The Council of Elrond
Mike Terrana
Middle-earth: Shadow of War
Saruman


"salman" may refer to: 
Salman (name)
Salman, Khuzestan
Deh-e Salman, Lorestan
Salman, Razavi Khorasan
Salami, Iran
Salman, Semnan
Salman, Tehran
Salman, Zanjan
Salman (myth)
All pages with titles beginning with Salman
All pages with titles containing Salman
David S. Weiss
Salmon
Salmon (disambiguation)
Solomon (disambiguation)


Aragorn
The History of The Lord of the Rings
On the Hills of Manchuria
Shadow the Hedgehog (video game)
Star Wars: Shadows of the Empire
Method Bag of Words
Shape: documents/words (44, 9124)
Features length is 9124: ['10' '15' '150m' ... 'zucco' 'zurita' 'zwangobani']


(44, 5)
[56.51314747 -8.84541247  0.30505427 -3.7983361   1.14168626]
Topic 0
[('seri', 0.42181897203383933), ('film', 0.29430956396315533), ('ring', 0.24716591043520003), ('tolkien', 0.21438964162882665), ('season', 0.17826936730761184), ('new', 0.17514477097362047), ('episod', 0.1721991404507196), ('amazon', 0.16713467967793252), ('first', 0.15303935120465592), ('lord', 0.1209389677937119)]
Topic 1
[('batman', 0.3259743027722725), ('seri', 0.28914115287158915), ('episod', 0.18789043133373837), ('anim', 0.18764621015456184), ('comic', 0.11577360132755134), ('releas', 0.11563000921623934), ('show', 0.08823936480473939), ('featur', 0.07423864200411791), ('origin', 0.07090291574829274), ('volum', 0.06922393084873017)]
Top

  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


Method TF IDF
Shape: documents/words (44, 9124)
Features length is 9124: ['10' '15' '150m' ... 'zucco' 'zurita' 'zwangobani']


(44, 5)
[ 0.68277516 -0.26922723 -0.37242246 -0.13548706 -0.32846785]
Topic 0
[('ring', 0.4675095396699467), ('tolkien', 0.30603141661246996), ('seri', 0.17376190518211718), ('lord', 0.1676976828691399), ('episod', 0.16060734475767804), ('film', 0.1335962179459887), ('hobbit', 0.12589254194160626), ('frodo', 0.12129371863798717), ('power', 0.11793297293716186), ('amazon', 0.11501791108809555)]
Topic 1
[('ring', 0.37771869558977406), ('tolkien', 0.216162108062162), ('frodo', 0.17407499235319918), ('chapter', 0.13420231666017632), ('gandalf', 0.1214280947352782), ('gollum', 0.10179969093312083), ('bilbo', 0.07746825772396446), ('sauron', 0.07418690390645585), ('hobbit', 0.06605123269817308), ('shippey', 0.06514507905061664)]
Topic 2
[('band', 0.3725393997052118), ('album', 0.26804672128892193), ('releas', 0.1543417583010346), ('game', 0.14348129516247995), ('sha

  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


Method Bag of Words
Shape: documents/words (44, 9124)
Features length is 9124: ['10' '15' '150m' ... 'zucco' 'zurita' 'zwangobani']


(44, 5)
[1.73469378 0.34375645 0.01748009 0.25937832 0.        ]
Topic 0
[('seri', 10.513507812603184), ('film', 8.607581354863031), ('amazon', 5.384329358187884), ('season', 5.298466772507729), ('tolkien', 5.266014539715211), ('new', 4.890769231432493), ('first', 3.885382327954921), ('episod', 3.7770752187243546), ('ring', 3.501870690597151), ('cast', 3.4461270396586827)]
Topic 1
[('ring', 42.16503108225979), ('tolkien', 15.694701636405046), ('power', 13.57273091910117), ('sauron', 10.488557545101692), ('one', 8.738716545462331), ('lord', 7.0620980683421095), ('frodo', 6.660282345799035), ('gollum', 5.0038362326469), ('gandalf', 4.674008031597145), ('hobbit', 4.566470209818285)]
Topic 2
[('aragon', 12.625876532082998), ('aragones', 4.126103441857189), ('zaragoza', 3.796015166508614), ('area', 3.04932731288305), ('parti', 2.77443824111884), ('river', 2.5

  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


Method TF IDF
Shape: documents/words (44, 9124)
Features length is 9124: ['10' '15' '150m' ... 'zucco' 'zurita' 'zwangobani']


(44, 5)
[0.        0.6111464 0.        0.        0.       ]
Topic 0
[('ring', 1.0058545692805332), ('tolkien', 0.6613922985808689), ('frodo', 0.381322654725797), ('chapter', 0.32489429429299704), ('gandalf', 0.2744837950843433), ('lord', 0.22658525702016039), ('power', 0.22498942618345508), ('gollum', 0.2212446519102052), ('hobbit', 0.21771488169569), ('book', 0.2128216229556829)]
Topic 1
[('seri', 0.4094138203154851), ('amazon', 0.3966574528376612), ('episod', 0.35644690679567553), ('galadriel', 0.34090746332464533), ('ring', 0.32395635222297237), ('season', 0.3084668377711993), ('tolkien', 0.2825003081449448), ('film', 0.28037806065740534), ('lord', 0.2354452854077452), ('mckay', 0.22795650836987078)]
Topic 2
[('band', 0.5561030146701677), ('album', 0.40950200225467254), ('releas', 0.24971046033828015), ('sentenc', 0.1897531758959769), ('guitar', 0.185514463

  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


Method Bag of Words
Shape: documents/words (44, 9124)
Features length is 9124: ['10' '15' '150m' ... 'zucco' 'zurita' 'zwangobani']




  by='saliency', ascending=False).head(R).drop('saliency', 1)


Method TF IDF
Shape: documents/words (44, 9124)
Features length is 9124: ['10' '15' '150m' ... 'zucco' 'zurita' 'zwangobani']




  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [111]:
p1

In [112]:
p2

We observe that 3:
* a
* b
* c