In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')
content = 'The Seventh-day Adventist Church in Tonga, (Tongan: Siasi ʻAhofitu)\
is one of the smaller religious groups in the South Pacific island \
state of Tonga with a reported 3,853 members as of June 30, 2020, \
started by Seventh-day Adventist missionaries from the United States \
who visited in 1891 and settled in 1895. They set up schools but made \
very little progress in conversion, handicapped by dietary rules that \
prohibited popular local foods such as pork and shellfish, and that \
also banned tobacco, alcohol and kava. The church was revitalized \
in 1912 with renewed emphasis on evangelism. In 1922 it resumed its \
strategy of providing education, which resulted in an increase in \
conversions. After keeping a low profile during World War II (1939–45),\
the church grew quickly from 1950 to the 1970s. However, membership \
subsequently declined due to emigration and competition with other \
churches. The SDA of Tonga is part of the South Pacific Division of \
Seventh-day Adventists. It operates several schools in Tonga, \
and provides opportunities for further studies at \
Adventist institutions abroad.'
doc = nlp(content)
for sents in doc.sents:
    print(sents.text)

  from .autonotebook import tqdm as notebook_tqdm


The Seventh-day Adventist Church in Tonga, (Tongan: Siasi ʻAhofitu)is one of the smaller religious groups in the South Pacific island state of Tonga with a reported 3,853 members as of June 30, 2020, started by Seventh-day Adventist missionaries from the United States who visited in 1891 and settled in 1895.
They set up schools but made very little progress in conversion, handicapped by dietary rules that prohibited popular local foods such as pork and shellfish, and that also banned tobacco, alcohol and kava.
The church was revitalized in 1912 with renewed emphasis on evangelism.
In 1922 it resumed its strategy of providing education, which resulted in an increase in conversions.
After keeping a low profile during World War II (1939–45),the church grew quickly from 1950 to the 1970s.
However, membership subsequently declined due to emigration and competition with other churches.
The SDA of Tonga is part of the South Pacific Division of Seventh-day Adventists.
It operates several schoo

In [2]:
candidate_pos = ['NOUN', 'PROPN', 'VERB']
sentence_list = []

for sent in doc.sents:
    selected_words = []
    for token in sent:
        if token.pos_ in candidate_pos and token.is_stop is False:
            selected_words.append(token)
    sentence_list.append(selected_words)

print(sentence_list)

[[Seventh, day, Adventist, Church, Tonga, Tongan, Siasi, ʻAhofitu)is, groups, South, Pacific, island, state, Tonga, reported, members, June, started, Seventh, day, missionaries, United, States, visited, settled], [set, schools, progress, conversion, handicapped, rules, prohibited, foods, pork, shellfish, banned, tobacco, alcohol, kava], [church, revitalized, renewed, emphasis, evangelism], [resumed, strategy, providing, education, resulted, increase, conversions], [keeping, profile, World, War, II, church, grew, 1970s], [membership, declined, emigration, competition, churches], [SDA, Tonga, South, Pacific, Division, Seventh, day, Adventists], [operates, schools, Tonga, provides, opportunities, studies, institutions]]


In [3]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

class TopicExtraction():
    # Have to extract words from the document
    
    def __init__(s):
        s.d = 0.85 # damping coefficient, usually is .85
        s.min_diff = 1e-5 # convergence threshold
        s.steps = 10 # iteration steps
        s.node_weight = None # save keywords and its weight

    
    def stop_word(s, stop_words):  
        # Set stop words
        for word in STOP_WORDS.union(set(stop_words)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(s, doc, candidate_pos, lower):
        # Store those words only in cadidate_pos
        sentence_list = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentence_list.append(selected_words)
        return sentence_list
        
    def get_vocab(s, sentence_list):
        # Get all tokens
        vocab = OrderedDict()
        i = 0
        for sentence in sentence_list:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(s, window_size, sentence_list):
        # Build token_pairs from windows in sentence_list
        token_pairs = list()
        for sentence in sentence_list:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(s, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(s, vocab, token_pairs):
        # Get normalized matrix
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = s.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) 
        # this is ignore the 0 element in normalization
        
        return g_norm

    
    def get_keywords(s, number = 10):
        # Print top number keywords
        node_weight = OrderedDict(sorted(s.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            print(key + ' - ' + str(value))
            if i > number:
                break
        
        
    def analyze(s, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stop_words=list()):
        # Main function to analyze text
        
        # Set stop words
        s.stop_word(stop_words)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentence_list
        sentence_list = s.sentence_segment(doc, candidate_pos, lower) 
        # list of list of words
        
        # Build vocabulary
        vocab = s.get_vocab(sentence_list)
        
        # Get token_pairs from windows
        token_pairs = s.get_token_pairs(window_size, sentence_list)
        
        # Get normalized matrix
        g = s.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(s.steps):
            pr = (1-s.d) + s.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < s.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        s.node_weight = node_weight

In [4]:
text = 'The Seventh-day Adventist Church in Tonga, (Tongan: Siasi ʻAhofitu)\
is one of the smaller religious groups in the South Pacific island \
state of Tonga with a reported 3,853 members as of June 30, 2020, \
started by Seventh-day Adventist missionaries from the United States \
who visited in 1891 and settled in 1895. They set up schools but made \
very little progress in conversion, handicapped by dietary rules that \
prohibited popular local foods such as pork and shellfish, and that \
also banned tobacco, alcohol and kava. The church was revitalized \
in 1912 with renewed emphasis on evangelism. In 1922 it resumed its \
strategy of providing education, which resulted in an increase in \
conversions. After keeping a low profile during World War II (1939–45),\
the church grew quickly from 1950 to the 1970s. However, membership \
subsequently declined due to emigration and competition with other \
churches. The SDA of Tonga is part of the South Pacific Division of \
Seventh-day Adventists. It operates several schools in Tonga, \
and provides opportunities for further studies at \
Adventist institutions abroad.'

topic_extraction = TopicExtraction()
topic_extraction.analyze(text, candidate_pos = ['NOUN', 'PROPN'], window_size=5, lower=False)
topic_extraction.get_keywords(5)

Tonga - 2.647211032073085
day - 1.6664887547967906
Seventh - 1.6562748411063142
church - 1.6026500000000001
Pacific - 1.4744065187292867
South - 1.3939242286978897
foods - 1.2346345131802723
