## Learning Title Topics using Latents Dirichlet Allocation.

In [2]:
%matplotlib inline
import re
from collections import defaultdict
import numpy as np
from sklearn.decomposition import PCA, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

### Step 0) Data Cleaning. You can either run the following cell or directly run the script prep.py in terminal with the dataset path and stopwords_english path as arguments.

In [None]:
# rtitle = re.compile(r'^#\*\s*(.+)')
# stemmer = SnowballStemmer('english')
# titles = []
# stopwords = []
# parsed_titles = []

# with open('stopwords_english.txt') as fsw:
#     for word in fsw.readlines():
#         word = word.strip('\n')
#         stopwords.append(word)
# fsw.close()

# with open('publications.txt') as fin:
#     for line in fin.readlines():
#         line = line.strip('\n')
#         mtitle = rtitle.match(line)
#         ## if it is a title string
#         if mtitle:
#             title = mtitle.group(1).lower()
#             titles.append(title[:-1])
# fin.close()

# fout = open('titles_prep.txt', 'w+')
# for i, words in enumerate(titles):
#     words = words.split(" ")
#     words = [re.match('[a-zA-Z0-9]+', stemmer.stem(word)).group() for word in words if re.match('[a-zA-Z0-9]+', stemmer.stem(word)) is not None]
#     words = ['NUM' if re.match('[0-9]+', word) is not None else word for word in words]
#     words = list(filter(None, ["" if word in stopwords else word for word in words]))
#     words = ' '.join(words)
#     fout.write(words + '\n')
#     parsed_titles.append(words)
#     if i % 100000 == 0 :
#         print(i)
# fout.close()

In [2]:
load_and_process_data('publications.txt', 'stopwords_english.txt')

===== raw dataset loaded =====
===== stopwords list loaded =====
===== start to parse text =====
===== 0 titles are parsed successfully =====
===== 100000 titles are parsed successfully =====
===== 200000 titles are parsed successfully =====
===== 300000 titles are parsed successfully =====
===== 400000 titles are parsed successfully =====
===== 500000 titles are parsed successfully =====
===== 600000 titles are parsed successfully =====
===== 700000 titles are parsed successfully =====
===== parsing completed. parsed data was saved in file preprocessed.txt, please load this as the input for your model. End. =====


#### Step 1) Load the preprocessed dataset and convert it to word count vectors.

In [3]:
"""
*** load dataset and convert bag of words represenetations.***
"""
prep_words = []
with open('preprocessed.txt') as f:
    for line in f.readlines():
        line = line.strip('\n')
        prep_words.append(line)
f.close()

In [4]:
vectorizer = CountVectorizer(min_df=800)
X = vectorizer.fit(prep_words)

In [5]:
X_matrix = X.transform(prep_words)

In [8]:
len(X.vocabulary_)

1009

#### Step 2) Now apply LDA to the dataset with different number of topics = 10, 20, 50. In each experimetn, print the top 10 words in each topic.

In [9]:
components_num = [10, 20, 50]

In [14]:
"""
*** LDA ***
"""
lda = LatentDirichletAllocation(n_components=10)
id_topic = lda.fit_transform(X_matrix)
id_topic



array([[0.03333698, 0.03334105, 0.03333333, ..., 0.03333333, 0.03333662,
        0.03334231],
       [0.02      , 0.02      , 0.02000366, ..., 0.02      , 0.22      ,
        0.42      ],
       [0.02      , 0.02000057, 0.02      , ..., 0.02      , 0.02      ,
        0.02      ],
       ...,
       [0.13291227, 0.011112  , 0.01111361, ..., 0.1222222 , 0.25696928,
        0.01111651],
       [0.36666667, 0.03333333, 0.03333333, ..., 0.03333333, 0.36666666,
        0.03333333],
       [0.02000244, 0.02000955, 0.02      , ..., 0.42      , 0.02      ,
        0.02      ]])

In [16]:
vocab = vectorizer.get_feature_names()

In [17]:
topic_words = {}

for topic, comp in enumerate(lda.components_):
    word_idx = np.argsort(comp)[::-1][:10]
    topic_words[topic] = [vocab[i] for i in word_idx]

In [18]:
topic_words

{0: ['algorithm',
  'imag',
  'multi',
  'detect',
  'framework',
  'fuzzi',
  'video',
  'general',
  'logic',
  'pattern'],
 1: ['use',
  'estim',
  'manag',
  'web',
  'system',
  'process',
  'generat',
  'analysi',
  'predict',
  'servic'],
 2: ['method',
  'analysi',
  'evalu',
  'function',
  'program',
  'recognit',
  'select',
  'languag',
  'filter',
  'parallel'],
 3: ['comput',
  'inform',
  'multipl',
  'visual',
  'system',
  'model',
  'technolog',
  'map',
  'environ',
  'approxim'],
 4: ['network',
  'approach',
  'applic',
  'optim',
  'perform',
  'distribut',
  'high',
  'system',
  'scheme',
  'neural'],
 5: ['learn',
  'effect',
  'interact',
  'model',
  'adapt',
  'user',
  'digit',
  'semant',
  'virtual',
  'human'],
 6: ['design',
  'system',
  'dynam',
  'simul',
  'model',
  'architectur',
  'two',
  'search',
  'classif',
  'non'],
 7: ['problem',
  'studi',
  'new',
  'graph',
  'effici',
  'implement',
  'case',
  'set',
  'toward',
  'optim'],
 8: ['bas

In [18]:
lda20 = LatentDirichletAllocation(n_components=20)
lda20.fit_transform(X_matrix)
lda50 = LatentDirichletAllocation(n_components=50)
lda50.fit_transform(X_matrix)



array([[0.00666667, 0.00666667, 0.00666667, ..., 0.00666667, 0.00666667,
        0.00666667],
       [0.004     , 0.004     , 0.004     , ..., 0.004     , 0.004     ,
        0.004     ],
       [0.004     , 0.004     , 0.004     , ..., 0.004     , 0.004     ,
        0.004     ],
       ...,
       [0.00222222, 0.00222222, 0.11333333, ..., 0.00222222, 0.00222222,
        0.00222222],
       [0.00666667, 0.00666667, 0.00666667, ..., 0.00666667, 0.00666667,
        0.00666667],
       [0.004     , 0.004     , 0.004     , ..., 0.004     , 0.004     ,
        0.004     ]])

In [20]:
id_topic20 = lda20.fit_transform(X_matrix)

topic_words20 = {}

for topic, comp in enumerate(lda20.components_):  
    word_idx = np.argsort(comp)[::-1][:10]
    topic_words20[topic] = [vocab[i] for i in word_idx]



In [22]:
topic_words20

{0: ['data',
  'structur',
  'automat',
  'mine',
  'field',
  'impact',
  'text',
  'devic',
  'scalabl',
  'medic'],
 1: ['comput',
  'environ',
  'parallel',
  'activ',
  'collabor',
  'statist',
  'cooper',
  'de',
  'grid',
  'cloud'],
 2: ['process',
  'distribut',
  'visual',
  'map',
  'product',
  'oper',
  'resourc',
  'bound',
  'energi',
  'queri'],
 3: ['applic',
  'detect',
  'recognit',
  'fuzzi',
  'scheme',
  'secur',
  'robust',
  'social',
  'speech',
  'signal'],
 4: ['algorithm',
  'design',
  'problem',
  'dynam',
  'estim',
  'function',
  'graph',
  'linear',
  'filter',
  'pattern'],
 5: ['imag',
  'multi',
  'improv',
  'robot',
  'measur',
  'video',
  'local',
  'space',
  'segment',
  'retriev'],
 6: ['adapt',
  'perform',
  'effici',
  'code',
  'effect',
  'power',
  'mobil',
  'program',
  'interact',
  'complex'],
 7: ['network',
  'studi',
  'wireless',
  'sensor',
  'neural',
  'generat',
  'implement',
  'theori',
  'combin',
  'vector'],
 8: ['servi

In [21]:
id_topic50 = lda50.fit_transform(X_matrix)

topic_words50 = {}

for topic, comp in enumerate(lda50.components_): 
    word_idx = np.argsort(comp)[::-1][:10]
    topic_words50[topic] = [vocab[i] for i in word_idx]



In [23]:
topic_words50

{0: ['time',
  'linear',
  'nonlinear',
  'ad',
  'radio',
  'hoc',
  'industri',
  'packet',
  'regress',
  'receiv'],
 1: ['data',
  'structur',
  'complex',
  'automat',
  'mine',
  'text',
  'stream',
  'medic',
  'market',
  'media'],
 2: ['develop',
  'strategi',
  'resourc',
  'project',
  'chang',
  'valu',
  'educ',
  'valid',
  'china',
  'motor'],
 3: ['distribut',
  'robust',
  'non',
  'product',
  'stochast',
  'memori',
  'cellular',
  'partit',
  'standard',
  'boundari'],
 4: ['object',
  'two',
  'approxim',
  'flow',
  'systems',
  'color',
  'asymptot',
  'augment',
  'move',
  'altern'],
 5: ['environ',
  'constraint',
  'cooper',
  'fade',
  'complet',
  'presenc',
  'consist',
  'urban',
  'transact',
  'multius'],
 6: ['parallel',
  'databas',
  'state',
  'motion',
  'larg',
  'fault',
  'block',
  'depend',
  'numer',
  'spars'],
 7: ['featur',
  'type',
  'condit',
  'evolut',
  'train',
  'respons',
  'mode',
  'dual',
  'higher',
  'strong'],
 8: ['domain',

As we increase the number of components from 10 to 50, what we do is simply adding topics to the model. And we are getting more topics with different words. 

### Problem 3
Now apply PCA to the same dataset with different number of principle components 10, 20, 50. Also print out top 10 words for each component.

In [1]:
def topwords_pca(cv, pca, num_topwords):
    """
    print the top words based on the eigenvectors 
    """
    Topwords = []
    vocab = {v: k for k, v in cv.vocabulary_.items()}
    eigenvectors = pca.components_
    for k,ev in enumerate(eigenvectors):
        ev_indices = np.argsort(- np.abs(ev))[:num_topwords]
        ev_topwords =', '.join([vocab[ind] for ind in ev_indices])
        Topwords.append(ev_topwords)
        print("Component : %d, topwords : %s" % (k, ev_topwords))


In [16]:
def run_svd(num_comp, data, matrix):
    svd = TruncatedSVD(n_components = num_comp)
    svd.fit(matrix)
    return topwords_pca(data, svd, num_topwords=10)

In [17]:
run_svd(10, X, X_matrix)

Component : 0, topwords : system, use, model, network, base, algorithm, analysi, data, design, control
Component : 1, topwords : system, network, use, model, algorithm, wireless, neural, sensor, data, imag
Component : 2, topwords : network, model, use, wireless, system, sensor, neural, imag, mobil, ad
Component : 3, topwords : model, use, network, base, imag, system, algorithm, detect, wireless, applic
Component : 4, topwords : base, algorithm, network, use, model, system, data, analysi, imag, optim
Component : 5, topwords : algorithm, base, problem, optim, data, model, comput, genet, analysi, parallel
Component : 6, topwords : data, analysi, base, applic, algorithm, use, comput, approach, perform, design
Component : 7, topwords : analysi, data, design, perform, control, comput, algorithm, system, mine, manag
Component : 8, topwords : design, analysi, comput, approach, applic, algorithm, data, control, system, base
Component : 9, topwords : comput, design, control, approach, data, imag

In [18]:
run_svd(20, X, X_matrix)

Component : 0, topwords : system, use, model, network, base, algorithm, analysi, data, design, control
Component : 1, topwords : system, network, use, model, algorithm, wireless, neural, sensor, data, imag
Component : 2, topwords : network, model, use, wireless, system, sensor, neural, imag, mobil, ad
Component : 3, topwords : model, use, network, base, imag, system, algorithm, detect, wireless, applic
Component : 4, topwords : base, algorithm, network, use, model, system, data, analysi, method, optim
Component : 5, topwords : algorithm, base, problem, optim, data, model, comput, genet, analysi, parallel
Component : 6, topwords : data, analysi, base, applic, algorithm, use, comput, design, approach, model
Component : 7, topwords : analysi, data, design, algorithm, perform, control, system, applic, mine, comput
Component : 8, topwords : design, analysi, comput, applic, approach, algorithm, control, system, data, base
Component : 9, topwords : comput, design, control, data, applic, imag,

In [19]:
run_svd(50, X, X_matrix)

Component : 0, topwords : system, use, model, network, base, algorithm, analysi, data, design, control
Component : 1, topwords : system, network, use, model, algorithm, wireless, neural, sensor, data, imag
Component : 2, topwords : network, model, use, wireless, system, sensor, neural, imag, mobil, ad
Component : 3, topwords : model, use, network, base, imag, system, algorithm, detect, wireless, applic
Component : 4, topwords : base, algorithm, network, use, model, system, data, analysi, method, optim
Component : 5, topwords : algorithm, base, problem, optim, data, model, comput, genet, analysi, parallel
Component : 6, topwords : data, analysi, base, applic, algorithm, use, comput, design, approach, model
Component : 7, topwords : analysi, data, design, perform, algorithm, control, system, mine, applic, comput
Component : 8, topwords : design, analysi, comput, applic, approach, algorithm, control, system, data, base
Component : 9, topwords : comput, design, control, data, applic, imag,

As we increase the number of components, the result of pca seems better because there are more topics appearing rather than using different combinations of a group of indentical words to form topics. 

### Problem 4

To conclude, I think LDA is much better than PCA because there lots of words that occur in many topics. This result is really confusing as it cannot clearly identify different topic which also makes furthre analysis harder. 