In [127]:
# Splitting text data and storing them in a list (of articles)
import io
docs = io.open("raw_data.txt", mode="r", encoding="utf-8", errors="ignore").read().split('\n') # list of strings 
titles = [docs[i] for i in range(len(docs)) if i % 2 == 0] # list of string titles
contents = [docs[i] for i in range(len(docs)) if i % 2 == 1] # list of string contents
print(contents[500])

In numerical analysis, the Shanks transformation is a non-linear series acceleration method to increase the rate of convergence of a sequence. This method is named after Daniel Shanks, who rediscovered this sequence transformation in 1955. It was first derived and published by R. Schmidt in 1941.[1]Milton D. Van Dyke (1975) Perturbation methods in fluid mechanics, p. 202.As an example, consider the slowly convergent series[3]The generalized kth-order Shanks transformation is given as the ratio of the determinants:[4]The generalized Shanks transformation is closely related to Padé approximants and Padé tables.[4]


In [128]:
# Preprocessing/ cleaning the data
import re
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize

# remove text between parenthesis
# contents = list(map(lambda x: re.sub(r"\(.*\)","",x), contents))

# remove all digits from text
contents = list(map(lambda x: re.sub(r"\d+","",x), contents))

stop = set(stopwords.words('english')) # set of stopwords
lemma = WordNetLemmatizer()
def clean(doc):
    # remove stopwords and words that are too short
    return [lemma.lemmatize(i, 'v') for i in word_tokenize(doc) if i not in stop and len(i) > 2]
cleaned = [clean(page.lower()) for page in contents]

print(cleaned[2])

['concept', 'angle', 'line', 'plane', 'pair', 'two', 'line', 'two', 'plan', 'line', 'plane', 'space', 'generalise', 'arbitrary', 'dimension', 'generalisation', 'first', 'discuss', 'jordan', 'pair', 'flats', 'euclidean', 'space', 'arbitrary', 'dimension', 'one', 'define', 'set', 'mutual', 'angle', 'invariant', 'isometric', 'transformation', 'euclidean', 'space', 'flats', 'intersect', 'shortest', 'distance', 'one', 'invariant', 'angle', 'call', 'canonical', 'principal', 'concept', 'angle', 'generalise', 'pair', 'flats', 'finite-dimensional', 'inner', 'product', 'space', 'complex', 'numbers.andbeing', 'relate', 'canonical', 'coordinate', 'basic', 'vectors', 'may', 'call', 'canonical.the', 'principal', 'angle', 'vectors', 'define', 'recursively', 'viaif', 'largest', 'angle', 'zero', 'one', 'subspace', 'subset', 'other.if', 'smallest', 'angle', 'zero', 'subspaces', 'intersect', 'least', 'line.the', 'number', 'angle', 'equal', 'zero', 'dimension', 'space', 'two', 'subspaces', 'intersect']


In [130]:
# Building word dicitonary
from gensim import corpora
# create the term dictionary of our corpus; terms are unique; each term is assigned an index
dictionary = corpora.Dictionary(cleaned)
print(dictionary)
dictionary.filter_extremes(no_below=3, no_above=0.7)
print(dictionary)
stoplist = set('also use make people know many call include part find become like mean often different usually take wikt come give well get since type list say change see refer actually iii aisne kinds pas ask would way something need things want every str'.split())
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)
print(dictionary)
dictionary.filter_n_most_frequent(50)
print(dictionary)

# This saves the dictionary to the local disk
dictionary.save_as_text('./dictionary.txt')


Dictionary(25741 unique tokens: ['abelian', 'addition', 'also', 'argument', 'article']...)
Dictionary(6621 unique tokens: ['abelian', 'addition', 'also', 'argument', 'article']...)
Dictionary(6583 unique tokens: ['abelian', 'addition', 'argument', 'article', 'attribution/share-alike']...)
Dictionary(6533 unique tokens: ['abelian', 'addition', 'argument', 'article', 'attribution/share-alike']...)


In [131]:
# Creating document-term matrix from vocabulary (dictionary)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in cleaned]
print(len(doc_term_matrix))
print(doc_term_matrix[693])

694
[(1, 1), (7, 1), (11, 1), (14, 1), (22, 2), (24, 2), (30, 3), (34, 29), (39, 1), (40, 2), (43, 1), (46, 1), (61, 2), (64, 1), (65, 1), (74, 2), (79, 1), (84, 2), (86, 2), (100, 1), (104, 1), (111, 1), (117, 2), (123, 3), (125, 1), (126, 1), (129, 1), (132, 3), (134, 7), (137, 1), (138, 1), (141, 3), (154, 4), (164, 1), (165, 1), (172, 3), (175, 1), (176, 2), (183, 1), (198, 1), (214, 3), (217, 1), (222, 1), (228, 1), (233, 1), (253, 1), (255, 2), (262, 2), (270, 1), (273, 2), (275, 2), (281, 1), (288, 2), (292, 3), (303, 1), (321, 1), (324, 1), (329, 1), (339, 2), (342, 1), (356, 6), (357, 1), (376, 1), (381, 1), (385, 1), (391, 1), (392, 1), (395, 2), (399, 1), (404, 6), (409, 1), (419, 4), (423, 4), (438, 2), (459, 1), (461, 1), (464, 2), (467, 1), (490, 1), (491, 2), (500, 4), (511, 24), (518, 3), (519, 1), (525, 2), (526, 1), (529, 1), (533, 1), (535, 1), (539, 2), (548, 3), (554, 3), (561, 1), (564, 1), (578, 1), (595, 1), (617, 1), (627, 1), (631, 1), (654, 1), (665, 1), (668

In [132]:
# Training LDA model
from gensim.models.ldamodel import LdaModel as Lda

ldamodel = Lda(doc_term_matrix, num_topics=15, id2word = dictionary)

# Showing the 15 identified topics after the model is trained, where top 10 key terms are listed for each topic
for topic in ldamodel.print_topics(num_topics=15, num_words=10):
    print(topic[0]+1, " ", topic[1],"\n")

1   0.005*"algorithm" + 0.005*"cordic" + 0.004*"equations" + 0.004*"methods" + 0.004*"root" + 0.004*"vectors" + 0.004*"product" + 0.004*"error" + 0.004*"polynomial" + 0.003*"coordinate" 

2   0.007*"vectors" + 0.006*"methods" + 0.006*"equations" + 0.005*"dual" + 0.005*"..." + 0.004*"orthogonal" + 0.004*"determinant" + 0.004*"map" + 0.004*"matrices" + 0.004*"hilbert" 

3   0.005*"affine" + 0.005*"line" + 0.005*"solution" + 0.005*"model" + 0.004*"methods" + 0.004*"product" + 0.004*"equations" + 0.004*"computer" + 0.003*"coordinate" + 0.003*"data" 

4   0.012*"group" + 0.011*"product" + 0.009*"matrices" + 0.006*"vectors" + 0.005*"line" + 0.004*"equations" + 0.004*"map" + 0.004*"kernel" + 0.004*"transformation" + 0.004*"zero" 

5   0.008*"coordinate" + 0.007*"equations" + 0.006*"line" + 0.006*"matrices" + 0.006*"equation" + 0.006*"ring" + 0.005*"solution" + 0.005*"finite" + 0.005*"row" + 0.005*"vectors" 

6   0.005*"product" + 0.005*"cordic" + 0.004*"group" + 0.004*"line" + 0.004*"manifold

In [133]:
# Clustering documents based on topics extracted from LDA model 
from operator import itemgetter
def cluster(doc_term_matrix, num):
    doc_topics = ldamodel.get_document_topics(doc_term_matrix, minimum_probability=0.20)
    result = [[] for i in range(num)]
    for k,topic in enumerate(doc_topics):
        # Some articles do not have a topic
        if topic:
            topic.sort(key = itemgetter(1), reverse=True)
            result[topic[0][0]].append(k)
    for k in range(len(result)):
        print('Articles(ID) in Cluster ' + str(k+1) + ': ' + ', '.join(map(str, result[k])))
        print()
    return result
cluster_result = cluster(doc_term_matrix, 15)

Articles(ID) in Cluster 1: 7, 49, 89, 103, 105, 111, 114, 123, 130, 131, 136, 137, 156, 167, 175, 179, 181, 182, 184, 185, 188, 195, 196, 204, 210, 212, 216, 222, 226, 231, 233, 245, 246, 251, 274, 282, 284, 293, 296, 300, 307, 308, 323, 324, 333, 334, 339, 347, 358, 361, 364, 370, 372, 394, 398, 414, 430, 437, 457, 464, 498, 499, 513, 519, 523, 526, 527, 529, 541, 547, 589, 596, 602, 614, 621, 627, 631, 632, 653, 661, 668

Articles(ID) in Cluster 2: 17, 46, 51, 56, 58, 66, 80, 83, 118, 121, 129, 166, 238, 244, 253, 287, 294, 312, 337, 353, 368, 406, 411, 469, 478, 532, 542, 548, 551, 565, 579, 591, 593, 617, 622, 630, 633, 643, 687

Articles(ID) in Cluster 3: 26, 35, 60, 84, 99, 113, 155, 164, 165, 170, 171, 180, 193, 243, 272, 306, 316, 317, 345, 360, 378, 410, 431, 441, 481, 503, 585, 607, 620, 625, 640, 641, 659, 674, 678, 682

Articles(ID) in Cluster 4: 2, 9, 27, 43, 67, 82, 90, 104, 124, 125, 141, 169, 172, 174, 177, 183, 197, 199, 209, 220, 227, 241, 250, 259, 263, 266, 275, 280

In [134]:
# Showing the exact document titles in each cluster
for k in range(len(cluster_result)):
    print('Articles in Cluster ' + str(k+1) + ': ' + ', '.join(map(lambda x: titles[x], cluster_result[k])))
    print()

Articles in Cluster 1: Truncation error, Weyr canonical form, CORDIC, Orthant, De Casteljau's algorithm, Remez algorithm, Digital Library of Mathematical Functions, Adaptive stepsize, CORDIC, Trigonometric tables, De Boor's algorithm, Finite volume method, CORDIC, Signal-flow graph, Numerical analysis, The Nine Chapters on the Mathematical Art, Numerical model of the Solar System, Linear programming, Weakened weak form, Movable cellular automaton, Computational complexity, Fangcheng (mathematics), Material point method, Algorithm, Surrogate model, CORDIC, Entanglement-assisted stabilizer formalism, CORDIC, Pseudospectral knotting method, CORDIC, Engineering, Condition number, Legendre pseudospectral method, Computational science, Numerical stability, Chebyshev nodes, Graphics processing unit, Sparse grid, CORDIC, Lattice reduction, Digital Library of Mathematical Functions, CORDIC, Relative change and difference, CORDIC, Numerical methods in fluid mechanics, CORDIC, Minimax approximati

In [135]:
term_topics = ldamodel.get_term_topics('convex', minimum_probability=0.000001)
print(term_topics)

[(0, 0.00057868654), (1, 0.00045104639), (2, 0.00084757595), (3, 0.0002453932), (4, 0.00051466381), (5, 0.0016234117), (6, 0.00060021167), (7, 0.00040596904), (8, 0.00049059215), (9, 9.2905306e-05), (10, 0.00040188272), (11, 0.00017894359), (12, 0.00018584277), (13, 0.00058993214), (14, 0.00035637923)]


In [136]:
# Getting related documents based on a term 
def get_related_documents(term, top, doc_term_matrix):
    print('------- Top', top, 'articles related to',term,'-------')
    related_docs = []
    doc_topics = ldamodel.get_document_topics(doc_term_matrix, minimum_probability=0.20)
    term_topics = ldamodel.get_term_topics(term, minimum_probability=0.000001)
    term_topics.sort(key = itemgetter(1), reverse=True)
    for k,topic in enumerate(doc_topics):
        if topic:
            topic.sort(key = itemgetter(1), reverse=True)
            if topic[0][0] == term_topics[0][0]:
                related_docs.append((k,topic[0][1]))
    related_docs.sort(key = itemgetter(1), reverse=True)
    result = []
    for j,doc in enumerate(related_docs):
        print(titles[doc[0]],"\n",doc[1],"\n")   
        result.append(titles[doc[0]])
        if j == top - 1:
            break
related_docs = get_related_documents('convex', 7, doc_term_matrix)


------- Top 7 articles related to convex -------
Bernstein polynomial 
 0.99451 

Intersection curve 
 0.994167 

Radial basis function 
 0.988477 

Manifold 
 0.986637 

Integer points in convex polyhedra 
 0.984444 

Absolutely convex set 
 0.983626 

Truncation 
 0.973333 



In [138]:
def get_theme(doc, cluster_result):
    doc_id = titles.index(doc)
    if doc_id == -1:
        print('Document not found.')
        return
    for i, cluster in enumerate(cluster_result):
        if doc_id in cluster:
            return i+1
    return 0
cluster_num = get_theme('Absolutely convex set', cluster_result)
print(cluster_num)

6


In [196]:
# Implementing tf-idf model; the only information needed from the previous part is the doc_term_matrix
from gensim.models import TfidfModel, LsiModel
tfidf_model = TfidfModel(doc_term_matrix, dictionary = dictionary)
print(tfidf_model)
vector = tfidf_model[doc_term_matrix[0]]
print(vector[0])


TfidfModel(num_docs=694, num_nnz=169581)
(0, 0.045342656413852538)


In [150]:
# Implementing LSI model; the only information needed from the previous part is the doc_term_matrix
lsi_model = LsiModel(doc_term_matrix, id2word=dictionary)
print(lsi_model)

LsiModel(num_terms=6533, num_topics=200, decay=1.0, chunksize=20000)


In [171]:
# Creating the similarity matrix from simple bag-of-words model (# of documents * # of documents)
from gensim import similarities

index = similarities.MatrixSimilarity(doc_term_matrix, num_features=len(dictionary))
print(len(index[doc_term_matrix[693]])) # 694 * 694 matrix

694


In [172]:
# Training tf-idf model from bag-of-word dataset
model_tfidf = TfidfModel(doc_term_matrix, id2word=dictionary, normalize=False)

In [187]:
# Applying tf-idf model to all vectors
from gensim.corpora import MmCorpus
MmCorpus.serialize('./corpus_tfidf.mm', model_tfidf[doc_term_matrix], progress_cnt=100)

In [188]:
corpus_tfidf = MmCorpus('./corpus_tfidf.mm') # Loading back the corpus file after applying tf-idf
model_lsi = LsiModel(corpus_tfidf, num_topics=15, id2word=dictionary)
# Applying LSI model to all vectors
index = similarities.MatrixSimilarity(model_lsi[corpus_tfidf], num_features=len(dictionary))
print(index)
index.save('./lsi_index.mm') # Saving the similarity matrix to a local matrix market file named './lsi_model.mm'

MatrixSimilarity<694 docs, 6533 features>


In [195]:
# Loading the similarity matrix back from the local file
similarity_matrix = similarities.MatrixSimilarity.load('./lsi_index.mm')
print(len(similarity_matrix))

694
