In [2]:
text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [4]:
# Create a set of frequent words
stoplist = set('for a of the and to in'.split(' '))
# Lowercase each document, split it by white space and filter out stopwords
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in text_corpus]

# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
print(processed_corpus)

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]


In [7]:
from gensim import corpora
import pprint
dictionary = corpora.Dictionary(processed_corpus)
pprint.pprint(dictionary.token2id)

{'computer': 0,
 'eps': 8,
 'graph': 10,
 'human': 1,
 'interface': 2,
 'minors': 11,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'trees': 9,
 'user': 7}


In [8]:
# note that "interaction" does not appear since it's not found in the original dictionary.
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)

[(0, 1), (1, 1)]


In [9]:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
pprint.pprint(bow_corpus)

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]


In [10]:
from gensim import models

# train the model
tfidf = models.TfidfModel(bow_corpus)

# transform the "system minors" string
words = "system minors".lower().split()
print(tfidf[dictionary.doc2bow(words)])

[(5, 0.5898341626740045), (11, 0.8075244024440723)]


In [11]:
from gensim import similarities

index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=12)

In [12]:
query_document = 'system engineering'.split()
query_bow = dictionary.doc2bow(query_document)
sims = index[tfidf[query_bow]]
print(list(enumerate(sims)))

[(0, 0.0), (1, 0.32448703), (2, 0.41707572), (3, 0.7184812), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]


In [13]:
import io
import os.path
import re
import tarfile

import smart_open

def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'):
    with smart_open.open(url, "rb") as file:
        with tarfile.open(fileobj=file) as tar:
            for member in tar.getmembers():
                if member.isfile() and re.search(r'nipstxt/nips\d+/\d+\.txt', member.name):
                    member_bytes = tar.extractfile(member).read()
                    yield member_bytes.decode('utf-8', errors='replace')

docs = list(extract_documents())

In [14]:
print(len(docs))
print(docs[0][:500])

1740
387 
Neural Net and Traditional Classifiers  
William Y. Huang and Richard P. Lippmann 
MIT Lincoln Laboratory 
Lexington, MA 02173, USA 
Abstract
Previous work on nets with continuous-valued inputs led to generative 
procedures to construct convex decision regions with two-layer percepttons (one hidden 
layer) and arbitrary decision regions with three-layer percepttons (two hidden layers). 
Here we demonstrate that two-layer perceptton classifiers trained with back propagation 
can form both c


In [15]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [16]:
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

In [17]:
# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [18]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [19]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [20]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 8644
Number of documents: 1740


In [21]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

In [23]:
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)


Average topic coherence: -1.1092.
[([(0.009056574, 'gaussian'),
   (0.007318024, 'mixture'),
   (0.006725519, 'density'),
   (0.006318043, 'likelihood'),
   (0.005801836, 'prior'),
   (0.005217883, 'bayesian'),
   (0.0051224073, 'matrix'),
   (0.004886573, 'estimate'),
   (0.004717484, 'log'),
   (0.004664063, 'component'),
   (0.004513822, 'em'),
   (0.004492974, 'class'),
   (0.004217351, 'posterior'),
   (0.004130018, 'approximation'),
   (0.003884574, 'sample'),
   (0.0037637085, 'variance'),
   (0.0036264947, 'noise'),
   (0.003384305, 'estimation'),
   (0.0032502722, 'maximum'),
   (0.0031151897, 'covariance')],
  -0.8207048285312659),
 ([(0.0121412305, 'hidden'),
   (0.007546479, 'layer'),
   (0.006528153, 'speech'),
   (0.0064861877, 'recognition'),
   (0.006247099, 'hidden_unit'),
   (0.005939379, 'net'),
   (0.0051850905, 'rule'),
   (0.0051401095, 'trained'),
   (0.0049133054, 'word'),
   (0.004616797, 'architecture'),
   (0.003912561, 'sequence'),
   (0.0037932412, 'classif

In [33]:
import pandas as pd
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.ldamulticore import LdaMulticore

# Load the CSV file with tokenized content
df = pd.read_csv("out.csv")

# Create a list of tokenized documents
tokenized_docs = [doc.split() for doc in df["processed_content"].values]

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(tokenized_docs)

# Filter out tokens that appear in less than 5 documents or more than 50% of the documents
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Create a bag-of-words representation of the documents
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

# Set number of topics
num_topics = 10

# Build the LDA model
lda_model = LdaMulticore(corpus=corpus,
                         id2word=dictionary,
                         num_topics=num_topics,
                         workers=3,  # Adjust based on your system
                         passes=100)  # Number of passes through the corpus

# Print the top 5 terms for each topic
for idx, topic in lda_model.print_topics(-1):
    terms = topic.split("+")
    terms = [term.split("*")[1].strip().replace('"', '') for term in terms][:5]
    print("Topic {}: {}".format(idx, ", ".join(terms)))

# Save the model if needed
# lda_model.save("lda_model")


Topic 0: 'incepe',, 'loc',, 'transmite',, 'respinge',, 'spune',
Topic 1: 'urma',, 'respinge',, 'transmite',, 'spune',, 'anunta',
Topic 2: 'loc',, 'comunicat',, 'sistem',, 'intru',, 'respinge',
Topic 3: 'respinge',, 'loc',, 'transmite',, 'incepe',, 'spune',
Topic 4: 'respinge',, 'anunta',, 'incepe',, 'crestere',, 'urma',
Topic 5: 'spune',, 'transmite',, 'urma',, 'sistem',, 'loc',
Topic 6: 'spune',, 'transmite',, 'crestere',, 'intru',, 'comunicat',
Topic 7: 'incepe',, 'spune',, 'loc',, 'respinge',, 'transmite',
Topic 8: 'urma',, 'transmite',, 'loc',, 'anunta',, 'incepe',
Topic 9: 'incepe',, 'loc',, 'spune',, 'urma',, 'transmite',
