### Exercise 6 - Topic Modeling
### Discover Topics and Trends in Computer Science

##### Loading the data

In [2]:
!pip install contextualized-topic-models==2.3.0 &> /dev/null
!pip install pyLDAvis &> /dev/null


In [3]:
import re
import urllib
import gzip
import io
import csv
import random
from collections import defaultdict
from tqdm import tqdm
import pyLDAvis.sklearn

  from collections import Iterable
  from collections import Mapping


In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

# # to download the data manually or get more information, go to: https://dblp.org/faq/How+can+I+download+the+whole+dblp+dataset.html
# url = 'https://dblp.uni-trier.de/xml/dblp.xml.gz'
# # num_titles = 500000  # the (max)number of titles to load 
# #

# def load_gzip_file(url):
#     """Download Gzip-file."""
#     response = urllib.request.urlopen(url)
#     compressed_file = io.BytesIO(response.read())
#     decompressed_file = gzip.GzipFile(fileobj=compressed_file)
#     return decompressed_file

# def extract_titles(input_file, max_num=40000):
#     """Extract title and publication year of dblp papers, given as input file.
    
#     Divide the papers into 3 time periods. 
    
#     Collect max max_num papers per time period.
#     """
#     pairs_before_1990 = []
#     count_before_1990 = 0
#     pairs_from_1990_to_2009 = []
#     count_from_1990_to_2009 = 0
#     pairs_from_2010 = []
#     count_from_2010 = 0
#     got_title = False
#     for line in tqdm(input_file):
#         line_str = line.decode('utf-8')
#         if got_title: 
#             # we have a title and check for the corresponding year
#             year_result = re.search(r'<year>(.*)</year>', line_str)
#             if year_result:
#                 # we also have the year and thus save the title-year pair
#                 year = int(year_result.group(1))
#                 if year < 1990:
#                     pairs_before_1990.append((title, year))
#                     count_before_1990 += 1
#                 elif year < 2010:
#                     pairs_from_1990_to_2009.append((title, year))
#                     count_from_1990_to_2009 += 1
#                 else:
#                     pairs_from_2010.append((title, year))
#                     count_from_2010 += 1
#                 got_title = False
#         else:
#             # we have no title and search for title
#             result = re.search(r'<title>(.*)</title>', line_str)
#             if result:
#                 title = result.group(1)
#                 if len(title.split(' ')) < 3:  
#                     # only include titles with at least four words
#                     continue
#                 got_title = True
        
#         if count_before_1990 >= max_num and count_from_1990_to_2009 >= max_num and count_from_2010 >= max_num:
#             return pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010
    
#     return pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010

# def save_data(pairs, file_path):
#     with open(file_path, 'w') as fout:
#         writer = csv.writer(fout)
#         for pair in pairs:
#             writer.writerow(pair)

# in_file = load_gzip_file(url)
# pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010 = extract_titles(in_file)
# save_data(pairs_before_1990, path_before_1990)
# save_data(pairs_from_1990_to_2009, path_from_1990_to_2009)
# save_data(pairs_from_2010, path_from_2010)

In [5]:
path_before_1990 = '/content/drive/My Drive/titles_before_1990.txt'
path_from_1990_to_2009 = '/content/drive/My Drive/titles_from_1990_to_2009.txt'
path_from_2010 = '/content/drive/My Drive/titles_from_2010.txt'

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Preprocessing data

In [7]:
import string
exclude = string.punctuation

# data preprocessing class
class Preprocessing:

  # lowercasing
  def convert_lowercase(self,text):
      text = text.lower()
      return text

  # removing html tags
  def remove_html_tags(self,text):
      re_html = re.compile('<.*?>')
      return re_html.sub(r'', text)

  # removing URLS
  def remove_url(self,text):
      re_url = re.compile('https?://\S+|www\.\S+')
      return re_url.sub('', text)

  # removing punctuations
  def remove_punc(self,text):
      return text.translate(str.maketrans('', '', exclude))

  # removing special characters
  def remove_special(self,text):
      x= re.sub(r'[^a-zA-Z ]', '', text)
      return x

  # removing digits
  def remove_digits(self,text):
      filtered_string = ''.join((x for x in text if not x.isdigit()))
      return filtered_string


  def preprocess(self,sent):
    sent = self.remove_html_tags(sent)
    sent = self.remove_url(sent)
    sent = self.remove_digits(sent)
    sent = self.remove_punc(sent)
    sent = self.remove_special(sent)
    sent = self.convert_lowercase(sent)
    return sent

  re_url = re.compile('https?://\S+|www\.\S+')


### Part 1 - Topic Modelling using LDA

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

num_lda_topics = 10

In [None]:
# function for obtaining the LDA components
def get_lda(path, num_lda_topics=num_lda_topics):
  with open(path) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]

  prepro_titles = [Preprocessing().preprocess(title) for title in titles]

  #Now we turn the documents (or titles in this case) into a matrix feature representation.
  num_features = 10000
  tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_features, stop_words='english')
  tf = tf_vectorizer.fit_transform(prepro_titles)
  tf_feature_names = tf_vectorizer.get_feature_names_out()

  lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=10, learning_method='online', random_state=42, n_jobs = -1).fit(tf)

  for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

  return lda, tf, tf_vectorizer

### Before the 1990s:

In [None]:
# getting topics and visualizing them

lda_1990, dtm_tf_1990, tf_vectorizer_1990 = get_lda(path_before_1990)

pyLDAvis.enable_notebook()
plot = pyLDAvis.sklearn.prepare(lda_1990, dtm_tf_1990, tf_vectorizer_1990)
plot

Topic 0: applications finite solution binary computers trees equations arithmetic process computation test partial
Topic 1: analysis information networks logic stochastic detection synthesis programs processes automata function science
Topic 2: design data systems approach application model distributed processing graphs implementation development machine
Topic 3: network algorithms program structures machines review ii nonlinear efficient letter service testing
Topic 4: computer systems control using theory linear new parallel software based performance study
Topic 5: problem recognition systems sets languages sequential pattern decision methods set architecture class
Topic 6: note functions memory technical theorem chemical complexity logic properties representation editor modal
Topic 7: problems method structure use estimation introduction discrete classification comparison planning special calculus
Topic 8: algorithm models optimal time dynamic circuits research evaluation number au

  default_term_info = default_term_info.sort_values(


### From 1990 to 2009:

In [None]:
lda_2009, dtm_tf_2009, tf_vectorizer_2009 = get_lda(path_from_1990_to_2009)

pyLDAvis.enable_notebook()
plot = pyLDAvis.sklearn.prepare(lda_2009, dtm_tf_2009, tf_vectorizer_2009)
plot

Topic 0: using control linear nonlinear methods management equations detection optimal problems scheduling case
Topic 1: modeling fuzzy computer web dynamics molecular services frequency retrieval optical multimedia database
Topic 2: design method dynamic evaluation generalized feedback implementation sets set properties large high
Topic 3: adaptive software development classification human order prediction novel scheme memory engineering interaction
Topic 4: data study graphs learning recognition power communication online error codes automatic delay
Topic 5: model approach new image parallel programming equation optimization differential environment local genetic
Topic 6: algorithm models application algorithms efficient robust time simulation digital computing finite framework
Topic 7: analysis networks performance problem applications neural wireless mobile distributed stability sensor service
Topic 8: multiple theory functions structure processing support realtime decision functio

  default_term_info = default_term_info.sort_values(


### From 2010 onwards:

In [None]:
lda_2010, dtm_tf_2010, tf_vectorizer_2010 = get_lda(path_from_2010)

pyLDAvis.enable_notebook()
plot = pyLDAvis.sklearn.prepare(lda_2010, dtm_tf_2010, tf_vectorizer_2010)
plot

Topic 0: model image classification framework energy management algorithms scheduling research random virtual effects
Topic 1: learning networks study new performance wireless efficient deep problem machine case approach
Topic 2: problems online communication improved dynamics local software processing fault use measurement noise
Topic 3: systems novel fuzzy approach networks social stability realtime clustering visual decision graph
Topic 4: using design linear modeling distributed prediction computing fast distribution process review solutions
Topic 5: analysis method network dynamic power neural time sensor human feature graphs state
Topic 6: based algorithm multiple tracking functions complex architecture routing class order knowledge impact
Topic 7: data detection optimization application applications optimal hybrid methods equations stochastic sensing cloud
Topic 8: adaptive estimation nonlinear mobile scheme recognition robust images selection digital generalized internet
Topic 

  default_term_info = default_term_info.sort_values(


### Part 2 - Topic Modelling using Combined Topic Models (CTMs)

In [8]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords
import nltk
from nltk.corpus import stopwords as stop_words
nltk.download('stopwords')
num_ctm_topics = 10

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
def get_ctm(path,num_ctm_topics = num_ctm_topics):
  with open(path) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]
  
  # preprocessing data
  prepro_titles = [Preprocessing().preprocess(title) for title in titles]
  documents = [line.strip() for line in prepro_titles]
  stopwords = list(stop_words.words("english"))

  sp = WhiteSpacePreprocessingStopwords(documents, stopwords_list=stopwords)
  preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = sp.preprocess()

  tp = TopicModelDataPreparation("all-mpnet-base-v2")
  training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

  ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=num_ctm_topics, num_epochs=7)
  ctm.fit(training_dataset) # run the model
  
  for i in range(num_ctm_topics):
    print("Topic {}: {}".format(i, ctm.get_topic_lists(12)[i]))

  return ctm, tp, training_dataset


### Before the 1990s:

In [None]:
ctm, tp, training_dataset = get_ctm(path_before_1990)

lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset, n_samples=num_ctm_topics)

ctm_pd = pyLDAvis.prepare(**lda_vis_data)
pyLDAvis.display(ctm_pd)

Batches:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: [10/10]	 Seen Samples: [393890/393890]	Train Loss: 33.3722550186521	Time: 0:00:10.453664: : 10it [01:48, 10.83s/it]
Sampling: [20/20]: : 20it [02:58,  8.95s/it]


Topic 0: ['data', 'using', 'recognition', 'analysis', 'processing', 'image', 'pattern', 'structures', 'approach', 'chemical', 'representation', 'application']
Topic 1: ['theory', 'models', 'logic', 'calculus', 'theorem', 'proof', 'modal', 'order', 'set', 'propositional', 'logics', 'arithmetic']
Topic 2: ['system', 'design', 'systems', 'computer', 'distributed', 'database', 'expert', 'management', 'support', 'development', 'software', 'data']
Topic 3: ['uumlber', 'et', 'de', 'und', 'von', 'zur', 'du', 'fuumlr', 'der', 'surfaces', 'die', 'additional']
Topic 4: ['algorithm', 'algorithms', 'parallel', 'method', 'efficient', 'linear', 'equations', 'fast', 'search', 'solution', 'computing', 'matrix']
Topic 5: ['information', 'review', 'research', 'intelligence', 'artificial', 'technology', 'introduction', 'new', 'science', 'report', 'future', 'engineering']
Topic 6: ['control', 'optimal', 'linear', 'systems', 'model', 'technical', 'problems', 'note', 'time', 'adaptive', 'stochastic', 'estima

Sampling: [10/10]: : 10it [01:27,  8.79s/it]


### From 1990 to 2009:

In [None]:
ctm, tp, training_dataset = get_ctm(path_from_1990_to_2009)

lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset, n_samples=num_ctm_topics)

ctm_pd = pyLDAvis.prepare(**lda_vis_data)
pyLDAvis.display(ctm_pd)




Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/1628 [00:00<?, ?it/s]

Epoch: [7/7]	 Seen Samples: [2278787/2278787]	Train Loss: 38.0975434142404	Time: 0:01:31.535943: : 7it [10:27, 89.65s/it]
Sampling: [20/20]: : 20it [22:42, 68.14s/it]


Topic 0: ['image', 'detection', 'using', 'recognition', 'images', 'estimation', 'based', 'speech', 'classification', 'segmentation', 'feature', 'automatic']
Topic 1: ['information', 'web', 'research', 'technology', 'knowledge', 'management', 'development', 'software', 'case', 'services', 'review', 'user']
Topic 2: ['graphs', 'number', 'graph', 'trees', 'random', 'memory', 'planar', 'minimum', 'size', 'maximum', 'times', 'cortex']
Topic 3: ['special', 'introduction', 'issue', 'editorial', 'guest', 'der', 'editors', 'von', 'und', 'section', 'de', 'conference']
Topic 4: ['fuzzy', 'neural', 'system', 'approach', 'network', 'algorithm', 'based', 'learning', 'algorithms', 'genetic', 'design', 'optimization']
Topic 5: ['networks', 'wireless', 'mobile', 'performance', 'sensor', 'routing', 'protocol', 'distributed', 'communications', 'scheme', 'access', 'communication']
Topic 6: ['systems', 'control', 'robust', 'adaptive', 'nonlinear', 'linear', 'stability', 'feedback', 'estimation', 'timevaryi

Sampling: [10/10]: : 10it [11:45, 70.57s/it]
  default_term_info = default_term_info.sort_values(


### From 2010 onwards:

In [10]:
ctm, tp, training_dataset = get_ctm(path_from_2010)

lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset, n_samples=num_ctm_topics)

ctm_pd = pyLDAvis.prepare(**lda_vis_data)
pyLDAvis.display(ctm_pd)



Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]



Batches:   0%|          | 0/4094 [00:00<?, ?it/s]

Epoch: [7/7]	 Seen Samples: [5730697/5730697]	Train Loss: 45.16894497223578	Time: 0:03:23.013758: : 7it [23:48, 204.14s/it]
Sampling: [20/20]: : 20it [53:53, 161.69s/it]


Topic 0: ['equations', 'differential', 'finite', 'methods', 'approximation', 'equation', 'generalized', 'convergence', 'numerical', 'functions', 'order', 'solutions']
Topic 1: ['computing', 'smart', 'cloud', 'internet', 'special', 'applications', 'security', 'things', 'issue', 'editorial', 'iot', 'challenges']
Topic 2: ['surface', 'imaging', 'temperature', 'measurement', 'mapping', 'magnetic', 'land', 'radar', 'measurements', 'resolution', 'calibration', 'brain']
Topic 3: ['learning', 'machine', 'using', 'neural', 'deep', 'analysis', 'prediction', 'data', 'network', 'model', 'classification', 'approach']
Topic 4: ['window', 'equipment', 'inspired', 'incremental', 'redundancy', 'evolving', 'train', 'weighting', 'selfadaptive', 'employing', 'nets', 'malicious']
Topic 5: ['information', 'social', 'case', 'online', 'study', 'technology', 'media', 'software', 'use', 'development', 'knowledge', 'role']
Topic 6: ['algorithm', 'optimization', 'fuzzy', 'decision', 'swarm', 'problem', 'multiobje

Sampling: [10/10]: : 10it [26:48, 160.81s/it]
  default_term_info = default_term_info.sort_values(


Thank you!