# Ex 06: Topic Modeling

using Latent Dirichlet Allocation (LDA)
and Combined Topic Models (CTM).

## 1 Set up

In [None]:
!pip install contextualized-topic-models==2.2.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contextualized-topic-models==2.2.0
  Downloading contextualized_topic_models-2.2.0-py2.py3-none-any.whl (33 kB)
Collecting sentence-transformers>=1.1.1
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 2.2 MB/s 
[?25hCollecting gensim>=3.8.3
  Downloading gensim-4.2.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 1.3 MB/s 
[?25hCollecting ipywidgets==7.5.1
  Downloading ipywidgets-7.5.1-py2.py3-none-any.whl (121 kB)
[K     |████████████████████████████████| 121 kB 15.2 MB/s 
Collecting ipython==7.16.1
  Downloading ipython-7.16.1-py3-none-any.whl (785 kB)
[K     |████████████████████████████████| 785 kB 13.4 MB/s 
Collecting jedi>=0.10
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 18.2 MB/

## Import General Utility Libraries 

In [None]:
import csv
import gzip
import html
import io
import os
import random
import re
import unicodedata
import urllib
import urllib.request
from collections import defaultdict

from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive; the data and model paths used in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


The following paths define where the data files are stored. Adjust them if you want to use a different location.

In [None]:
# Single place to change the storage location of the three per-period title files.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/ML4NLP/EX6'

path_before_1990 = f'{DATA_DIR}/titles_before_1990.txt'
path_from_1990_to_2009 = f'{DATA_DIR}/titles_from_1990_to_2009.txt'
path_from_2010 = f'{DATA_DIR}/titles_from_2010.txt'

Execute the following cell only once to download the data and write it as a file to your google drive. Afterwards, skip this cell or comment it out.

In [None]:
# Location of the gzip-compressed dblp XML dump.
# To download the data manually or to get more information, go to:
# https://dblp.org/faq/How+can+I+download+the+whole+dblp+dataset.html
url = 'https://dblp.uni-trier.de/xml/dblp.xml.gz'
# Disabled setting, kept for reference (the per-period cap is the `max_num`
# argument of `extract_titles`):
# num_titles = 500000  # the (max) number of titles to load


def load_gzip_file(url):
    """Download a gzip file and return it as a decompressing, readable stream.

    Args:
        url: Location of the gzip file; anything ``urllib.request.urlopen`` accepts.

    Returns:
        A ``gzip.GzipFile`` reading from an in-memory copy of the complete
        compressed payload. Iterating over it yields decompressed lines as bytes.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
    """
    # The context manager releases the connection as soon as the payload is read;
    # the returned stream only depends on the in-memory buffer.
    with urllib.request.urlopen(url) as response:
        compressed_file = io.BytesIO(response.read())
    return gzip.GzipFile(fileobj=compressed_file)

def extract_titles(input_file, max_num=40000):
    """Extract title and publication year of dblp papers, split into 3 time periods.

    The input is scanned line by line. A line containing ``<title>...</title>``
    whose title has at least three space-separated tokens opens a record; the
    next line containing ``<year>...</year>`` completes it. Title lines seen
    while a record is open are ignored.

    Args:
        input_file: Iterable of UTF-8 encoded ``bytes`` lines.
        max_num: Maximum number of (title, year) pairs kept per time period.

    Returns:
        A tuple ``(pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010)``
        of lists of ``(title, year)`` tuples, each list holding at most
        ``max_num`` entries.

    Raises:
        ValueError: If the content of a ``<year>`` element is not an integer.
    """
    title_pattern = re.compile(r'<title>(.*)</title>')
    year_pattern = re.compile(r'<year>(.*)</year>')

    pairs_before_1990 = []
    pairs_from_1990_to_2009 = []
    pairs_from_2010 = []
    all_periods = (pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010)

    title = None  # title of the record currently waiting for its year
    for line in tqdm(input_file):
        line_str = line.decode('utf-8')
        if title is not None:
            # we have a title and check for the corresponding year
            year_result = year_pattern.search(line_str)
            if year_result:
                year = int(year_result.group(1))
                if year < 1990:
                    period = pairs_before_1990
                elif year < 2010:
                    period = pairs_from_1990_to_2009
                else:
                    period = pairs_from_2010
                # Enforce the cap per period: a period that is already full
                # drops further pairs while the others are still filling up.
                if len(period) < max_num:
                    period.append((title, year))
                title = None
        else:
            # we have no title and search for one
            result = title_pattern.search(line_str)
            # only include titles with at least three space-separated words
            if result and len(result.group(1).split(' ')) >= 3:
                title = result.group(1)

        # Stop reading as soon as every period has reached its cap.
        if all(len(period) >= max_num for period in all_periods):
            break

    return pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010

def save_data(pairs, file_path):
    """Write an iterable of rows (here: (title, year) pairs) to a CSV file.

    Args:
        pairs: Iterable of sequences; each one becomes one CSV row.
        file_path: Destination path. An existing file is overwritten.
    """
    # newline='' hands line-ending control to the csv module, as its
    # documentation requires; otherwise platforms that translate '\n' on write
    # end up with doubled carriage returns.
    with open(file_path, 'w', newline='') as fout:
        csv.writer(fout).writerows(pairs)

# Download and extract only when an output file is missing, so that re-running
# the notebook does not fetch and parse the whole dblp dump again.
title_file_paths = (path_before_1990, path_from_1990_to_2009, path_from_2010)
if all(os.path.exists(path) for path in title_file_paths):
    print('Title files already exist - skipping download.')
else:
    in_file = load_gzip_file(url)
    pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010 = extract_titles(in_file)
    save_data(pairs_before_1990, path_before_1990)
    save_data(pairs_from_1990_to_2009, path_from_1990_to_2009)
    save_data(pairs_from_2010, path_from_2010)

KeyboardInterrupt: ignored

## 2 Topic Modelling

### 2.1 LDA

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

num_lda_topics = 10

Let's perform some simple preprocessing:

In [None]:
def preprocess_text(text):
    """Reduce a title to lower-case ASCII letters and spaces.

    Character entities such as ``&uuml;`` or ``&amp;`` are decoded first and
    accented letters are folded to their base letter, so ``f&uuml;r`` becomes
    ``fur`` rather than ``fuumlr``. Every remaining character that is not an
    ASCII letter or a space is deleted (not replaced by a space).

    Args:
        text: The raw title.

    Returns:
        The normalised, lower-cased string.
    """
    # Decode only well-formed, semicolon-terminated entities; unknown entity
    # names are left untouched by html.unescape.
    text = re.sub(r'&#?\w+;', lambda match: html.unescape(match.group(0)), text)
    # NFKD splits accented letters into base letter + combining mark; the mark
    # is then removed by the ASCII filter below.
    text = unicodedata.normalize('NFKD', text)
    text = re.sub(r'[^a-zA-Z ]', '', text)
    return text.lower()


In [None]:
NUM_FEATURES = 10000   # upper bound on the vocabulary size
MAX_DF = 0.5           # passed to CountVectorizer as max_df
MIN_DF = 0.01          # passed to CountVectorizer as min_df
NUM_LDA_TOPIC = 10     # default number of LDA topics
NUM_TOP_WORDS = 12     # default number of words shown per topic


class LDA_tm:
    """LDA topic model on top of a bag-of-words representation.

    Usage: call ``generate_tf`` with the documents, then ``get_topic`` to fit
    the model and print the top words of every topic.
    """

    def __init__(self, num_features=NUM_FEATURES, max_df=MAX_DF, min_df=MIN_DF):
        """Store the vectorizer settings; nothing is fitted yet."""
        self.num_features = num_features
        self.max_df = max_df
        self.min_df = min_df
        self.lda = None               # fitted LatentDirichletAllocation
        self.vectorizer = None        # fitted CountVectorizer
        self.tf = None                # document-term count matrix
        self.tf_feature_names = None  # vocabulary, aligned with the columns of `tf`

    def generate_tf(self, prepro_titles):
        """Fit the count vectorizer on ``prepro_titles`` and store the term counts."""
        # Keep the fitted vectorizer so it stays available after this call.
        self.vectorizer = CountVectorizer(max_df=self.max_df, min_df=self.min_df,
                                          max_features=self.num_features, stop_words='english')
        self.tf = self.vectorizer.fit_transform(prepro_titles)
        self.tf_feature_names = self.vectorizer.get_feature_names_out()

    def top_words(self, n_top_words=NUM_TOP_WORDS):
        """Return, per topic, the ``n_top_words`` highest-weighted words (best first).

        Raises:
            RuntimeError: If no model has been fitted yet.
        """
        if self.lda is None:
            raise RuntimeError('get_topic() must be called before top_words()')
        return [
            [self.tf_feature_names[i] for i in topic.argsort()[::-1][:n_top_words]]
            for topic in self.lda.components_
        ]

    def get_topic(self, n_components=NUM_LDA_TOPIC, max_iter=5, n_top_words=NUM_TOP_WORDS):
        """Fit LDA on the stored term counts and print the top words of each topic.

        Args:
            n_components: Number of topics.
            max_iter: Maximum number of passes of the online learner.
            n_top_words: Number of words printed per topic.

        Returns:
            The ``components_`` attribute of the fitted model.

        Raises:
            RuntimeError: If ``generate_tf`` has not been called before.
        """
        if self.tf is None:
            raise RuntimeError('generate_tf() must be called before get_topic()')
        self.lda = LatentDirichletAllocation(n_components=n_components, max_iter=max_iter, verbose=1,
                                             learning_method='online', random_state=42).fit(self.tf)
        for topic_idx, words in enumerate(self.top_words(n_top_words), start=1):
            print(f'Topic {topic_idx}:', ' '.join(words))
        return self.lda.components_

#### 2.1.1 Before 1990:

In [None]:
# Read the stored titles (first CSV column) and normalise them.
# newline='' is the mode the csv module expects for files it parses; blank rows,
# which csv.reader yields as empty lists, are skipped instead of raising IndexError.
with open(path_before_1990, newline='') as fin:
    titles_before_1990 = [row[0] for row in csv.reader(fin) if row]
prepro_titles_before_1990 = [preprocess_text(title) for title in titles_before_1990]

In [None]:
# Sanity check: corpus size and the first few raw titles.
print(f'Number of titles before 1990: {len(titles_before_1990)}')
print(titles_before_1990[:10])

Number of titles before 1990: 40000
['Object Model Capabilities For Distributed Object Management.', 'Distributed Object Management Technology.', 'Muffin: A Distributed Database Machine', 'Algebraical Optimization of FTA-Expressions', 'Wissensrepr&auml;sentation und Maschinelles Lernen', 'An Algebraic Characterization of STUF', 'Zur Systemarchitektur von LILOG', 'Mengenorientierte Auswertung von Anfragen in der Logikprogrammiersprache PROLOG', 'Definite Resolution over Constraint Languages', 'Dokumentation der Syntax der LILOG-Grammatik']


In [None]:
# Build the term counts for the pre-1990 titles and fit LDA on them;
# get_topic prints the top words of every topic.
lda = LDA_tm()
lda.generate_tf(prepro_titles_before_1990)
# Bind the returned topic-word matrix to `_` so the cell does not echo it.
_ = lda.get_topic(max_iter=10)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
Topic 1: problem note optimal functions method technical linear decision solution problems algorithm using
Topic 2: control new implementation digital optimal linear approach design theory using systems problems
Topic 3: software processing applications finite research parallel digital computer data design theory information
Topic 4: analysis application languages performance algorithms theory data decision computer networks linear design
Topic 5: programming simulation linear problems digital language computer languages approach parallel using design
Topic 6: design algorithm data information networks approach performance using parallel digital computer linear
Topic 7: computer using theory linear p

#### 2.1.2 From 1990 to 2009:


In [None]:
# Read the stored titles (first CSV column) and normalise them.
# newline='' is the mode the csv module expects for files it parses; blank rows,
# which csv.reader yields as empty lists, are skipped instead of raising IndexError.
with open(path_from_1990_to_2009, newline='') as fin:
    titles_from_1990_to_2009 = [row[0] for row in csv.reader(fin) if row]

prepro_titles_from_1990_to_2009 = [preprocess_text(title) for title in titles_from_1990_to_2009]

In [None]:
# Sanity check: corpus size and the first few raw titles.
print(f'Number of titles from 1990 to 2009: {len(titles_from_1990_to_2009)}')
print(titles_from_1990_to_2009[:10])

Number of titles from 1990 to 2009: 330317
['An Evaluation of Object-Oriented DBMS Developments: 1994 Edition.', 'DARWIN: On the Incremental Migration of Legacy Information Systems', 'Integrating Heterogeneous, Autonomous, Distributed Applications Using the DOM Prototype.', 'Integrating Object-Oriented Applications and Middleware with Relational Databases.', 'Towards a Transaction Management System for DOM.', "A 'RISC' Object Model for Object System Interoperation: Concepts and Applications.", 'MetaObject Protocol Concepts for a RISC Object Model.', 'Object Data Language Facilities for Multimedia Data Types.', 'Object Data Model Facilities for Multimedia Data Types.', 'Experiments with Dispatching in a Distributed Object System.']


In [None]:
# Start from a fresh model so this cell does not rely on the `lda` object
# created in the cell for the previous period.
lda = LDA_tm()
lda.generate_tf(prepro_titles_from_1990_to_2009)
# Bind the returned topic-word matrix to `_` so the cell does not echo it.
_ = lda.get_topic(max_iter=10)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
Topic 1: algorithm new linear problem algorithms optimal robust equations efficient detection optimization multiple
Topic 2: networks approach nonlinear network models problems neural wireless mobile evaluation scheduling robust
Topic 3: systems based distributed nonlinear linear robust control approach optimal adaptive evaluation detection
Topic 4: control analysis methods software development computing robust nonlinear optimal linear adaptive problems
Topic 5: applications scheme web power efficient wireless mobile new robust control networks evaluation
Topic 6: model performance image time graphs parallel digital evaluation algorithms robust scheduling optimal
Topic 7: using method dynamic simulat

#### 2.1.3 From 2010 onwards:



In [None]:
# Read the stored titles (first CSV column) and normalise them.
# newline='' is the mode the csv module expects for files it parses; blank rows,
# which csv.reader yields as empty lists, are skipped instead of raising IndexError.
with open(path_from_2010, newline='') as fin:
    titles_from_2010 = [row[0] for row in csv.reader(fin) if row]

prepro_titles_from_2010 = [preprocess_text(title) for title in titles_from_2010]


In [None]:
# Sanity check: corpus size and the first few raw titles.
print(f'Number of titles from 2010 onwards: {len(titles_from_2010)}')
print(titles_from_2010[:10])

Number of titles from 2010 onwards: 825680
['Spectre Attacks: Exploiting Speculative Execution.', 'Computer Science Curricula 2013', 'Differences in productivity and impact across the different computer science subareas.', 'Klaus Tschira Stiftung gemeinn&uuml;tzige GmbH, KTS', 'Catchment classification by runoff behaviour with self-organizing maps (SOM)', 'Analysis of projected hydrological behavior of catchments based on signature indices', 'Ear Shape for Biometric Identification.', 'Multi-Threaded Implementation for Cryptography and Cryptanalysis.', 'Privacy-Preserving Authentication in Wireless Access Networks.', 'Private Key Cryptosystem.']


In [None]:
# Start from a fresh model so this cell does not rely on the `lda` object
# created in the cell for an earlier period.
lda = LDA_tm()
lda.generate_tf(prepro_titles_from_2010)
# Bind the returned topic-word matrix to `_` so the cell does not echo it.
_ = lda.get_topic(max_iter=10)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
Topic 1: networks detection neural linear mobile novel fuzzy recognition computing images deep cloud
Topic 2: information framework time problem management scheduling energy dynamic optimal cloud hybrid algorithms
Topic 3: using method deep models optimal energy social machine algorithms learning hybrid feature
Topic 4: systems estimation study performance efficient robust evaluation tracking case improved nonlinear linear
Topic 5: image classification scheme equations prediction online research feature based methods deep nonlinear
Topic 6: based analysis optimization dynamic application power applications modeling hybrid methods feature cloud
Topic 7: learning approach algorithm design nonlinear dis

### 2.2 Combined Topic Models

New method developed by [Bianchi et al. 2021](https://aclanthology.org/2021.acl-short.96/). 

[A 6min presentation of the paper by one of the authors.](https://underline.io/lecture/25716-pre-training-is-a-hot-topic-contextualized-document-embeddings-improve-topic-coherence)

Code: [https://github.com/MilaNLProc/contextualized-topic-models](https://github.com/MilaNLProc/contextualized-topic-models)

Tutorial: [https://colab.research.google.com/drive/1fXJjr_rwqvpp1IdNQ4dxqN4Dp88cxO97?usp=sharing](https://colab.research.google.com/drive/1fXJjr_rwqvpp1IdNQ4dxqN4Dp88cxO97?usp=sharing)

Again, perform topic modelling for the three time periods - this time using the combined topic models (CTMs). 

You can use and adapt the code from the tutorial linked above.

Use the available GPU for faster running times.

In [None]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing

num_ctm_topics = 10

In [None]:
import nltk

nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

def load_text(text):
    """Run ``WhiteSpacePreprocessing`` with English stop words over ``text``.

    Args:
        text: The documents handed to ``WhiteSpacePreprocessing``.

    Returns:
        The three values produced by ``preprocess()``, in order: the
        preprocessed documents, the unpreprocessed corpus and the vocabulary.
    """
    preprocessor = WhiteSpacePreprocessing(text, stopwords_language='english')
    bow_documents, raw_documents, vocabulary = preprocessor.preprocess()
    return bow_documents, raw_documents, vocabulary

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


#### 2.2.1 Before the 1990s:

In [None]:
# Hand the original titles (character entities decoded) to the pipeline instead
# of the regex-stripped ones: the "unpreprocessed" corpus returned by load_text
# is what gets embedded as contextual text, so it should still be natural text.
# The bag-of-words text is produced by WhiteSpacePreprocessing inside load_text.
raw_titles_before_1990 = [html.unescape(title) for title in titles_before_1990]
preprocessed_documents_before_1990, unpreprocessed_corpus_before_1990, vocab_before_1990 = load_text(raw_titles_before_1990)
tp = TopicModelDataPreparation("all-mpnet-base-v2")
training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus_before_1990, text_for_bow=preprocessed_documents_before_1990)
ctm1 = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=num_ctm_topics, num_epochs=10)
ctm1.fit(training_dataset)



Batches:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: [10/10]	 Seen Samples: [393850/393850]	Train Loss: 33.66965636393436	Time: 0:00:05.628523: : 10it [00:55,  5.59s/it]


In [None]:
# Compute the topic word lists (10 words per topic) once instead of once per
# loop iteration, and print every topic the model has rather than a fixed count.
for topic_idx, topic_words in enumerate(ctm1.get_topic_lists(10), start=1):
    print(f'Topic {topic_idx}:', ' '.join(topic_words))

Topic 1: system design data analysis using processing computer distributed image digital
Topic 2: information software science management research review development chemical new introduction
Topic 3: de und von fuumlr zur der des la die et
Topic 4: sets graphs set number classes properties finite boolean degrees types
Topic 5: control systems model optimal linear nonlinear theory estimation identification application
Topic 6: note problem technical letter problems editor sequential solution optimal machines
Topic 7: language programming recognition pattern languages program natural automatic machine approach
Topic 8: algorithm algorithms method parallel search efficient using binary computing matrix
Topic 9: network networks architecture performance simulation protocol computers local digital communications
Topic 10: logic propositional symbolic proof semantics calculus logics calculi modal deduction


In [None]:
# Save the trained pre-1990 model under the given directory on the mounted Drive.
ctm1.save(models_dir="/content/drive/MyDrive/Colab Notebooks/ML4NLP/EX6/ctm1")



#### 2.2.2 From 1990 to 2009

In [None]:
# Hand the original titles (character entities decoded) to the pipeline instead
# of the regex-stripped ones: the "unpreprocessed" corpus returned by load_text
# is what gets embedded as contextual text, so it should still be natural text.
# The bag-of-words text is produced by WhiteSpacePreprocessing inside load_text.
raw_titles_from_1990_to_2009 = [html.unescape(title) for title in titles_from_1990_to_2009]
preprocessed_documents_from_1990_to_2009, unpreprocessed_corpus_from_1990_to_2009, vocab_from_1990_to_2009 = load_text(raw_titles_from_1990_to_2009)
tp = TopicModelDataPreparation("all-mpnet-base-v2")
training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus_from_1990_to_2009, text_for_bow=preprocessed_documents_from_1990_to_2009)
ctm2 = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=num_ctm_topics, num_epochs=10)
ctm2.fit(training_dataset)



Batches:   0%|          | 0/1628 [00:00<?, ?it/s]

Epoch: [10/10]	 Seen Samples: [3255170/3255170]	Train Loss: 37.991709522558835	Time: 0:00:44.319392: : 10it [07:21, 44.12s/it]


In [None]:
# Compute the topic word lists (10 words per topic) once instead of once per
# loop iteration, and print every topic the model has rather than a fixed count.
for topic_idx, topic_words in enumerate(ctm2.get_topic_lists(10), start=1):
    print(f'Topic {topic_idx}:', ' '.join(topic_words))

Topic 1: systems control linear robust stability nonlinear feedback optimal class uncertain
Topic 2: graphs number graph trees complexity sets automata degree groups random
Topic 3: analysis study data molecular models functional human modeling brain dynamics
Topic 4: underwater feasibility terminal window incorporating handling positioning reactive nonstationary benchmark
Topic 5: problems problem method solution equations methods numerical optimization order solving
Topic 6: networks wireless mobile network sensor routing protocol performance multicast service
Topic 7: using based image classification recognition neural fuzzy images detection segmentation
Topic 8: information special review research web issue introduction technology computer science
Topic 9: system design development software decision process support implementation framework distributed
Topic 10: estimation power frequency channel circuit low channels blind array cmos


In [None]:
# Save the trained 1990-2009 model under the given directory on the mounted Drive.
ctm2.save(models_dir="/content/drive/MyDrive/Colab Notebooks/ML4NLP/EX6/ctm2")



#### 2.2.3 From 2010 onwards

In [None]:
# Hand the original titles (character entities decoded) to the pipeline instead
# of the regex-stripped ones: the "unpreprocessed" corpus returned by load_text
# is what gets embedded as contextual text, so it should still be natural text.
# The bag-of-words text is produced by WhiteSpacePreprocessing inside load_text.
raw_titles_from_2010 = [html.unescape(title) for title in titles_from_2010]
preprocessed_documents_from_2010, unpreprocessed_corpus_from_2010, vocab_from_2010 = load_text(raw_titles_from_2010)
tp = TopicModelDataPreparation("all-mpnet-base-v2")
training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus_from_2010, text_for_bow=preprocessed_documents_from_2010)
ctm3 = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=num_ctm_topics, num_epochs=10)
ctm3.fit(training_dataset)



Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/4084 [00:00<?, ?it/s]

Epoch: [10/10]	 Seen Samples: [8167350/8167350]	Train Loss: 45.14848432235374	Time: 0:01:48.291818: : 10it [18:05, 108.60s/it]


In [None]:
# Compute the topic word lists (10 words per topic) once instead of once per
# loop iteration, and print every topic the model has rather than a fixed count.
for topic_idx, topic_words in enumerate(ctm3.get_topic_lists(10), start=1):
    print(f'Topic {topic_idx}:', ' '.join(topic_words))

Topic 1: finite equations differential equation approximation fractional solutions problems boundary numerical
Topic 2: systems control feedback stability consensus adaptive output sliding nonlinear multiagent
Topic 3: learning neural deep network machine prediction convolutional classification recognition using
Topic 4: wireless networks sensor allocation protocol vehicular access resource secure radio
Topic 5: cascade multi stage simplified redundancy adjustment buildings train marine window
Topic 6: model fuzzy decision chain approach group making process supply risk
Topic 7: image segmentation images feature matching color sparse fusion transform based
Topic 8: optimization algorithm power system scheduling swarm electric planning multiobjective energy
Topic 9: data land surface temperature water mapping soil satellite china forest
Topic 10: review special issue technology challenges role systematic editorial technologies research


In [None]:
# Save the trained 2010-onwards model under the given directory on the mounted Drive.
ctm3.save(models_dir="/content/drive/MyDrive/Colab Notebooks/ML4NLP/EX6/ctm3")

