In [None]:
# Imports
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string as str
import math

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from google.colab import files
upload = files.upload()

Saving cambodia.txt to cambodia.txt
Saving covid.txt to covid.txt


In [None]:
# Get sentences and words after preprocessing

def get_tokenized_sents(text):

    tokenized_sents = []
    
    # remove frequent words and punctuations
    unwanted_words = stopwords.words('english') + list(str.punctuation)
    
    sents = sent_tokenize(text)
    for s in sents:
        words = word_tokenize(s.lower())
        tokenized_sents.append([w for w in words if w not in unwanted_words])
    
    return sents, tokenized_sents
 

In [None]:
# Get Term frequency 

def get_tf(tokenized_sents):
    tf = {}
    for s in tokenized_sents:
        for w in s:
            tf[w] = tf.get(w,0) + 1
            
    return tf

In [None]:
def word_overlap(s1, s2):
    # There is an error here which needs to be corrected
    #print("intersection:" , set(s1).intersection(set(s2)))
    return len(set(s1).intersection(set(s2)))

In [None]:
def cosine_sim(s1_vector, s2_vector):
    assert len(s1_vector) == len(s2_vector)
    num = sum([s1_vector[sid]*s2_vector[sid] for sid in range(len(s1_vector))])
    den1 = sum([s1_vector[sid]**2 for sid in range(len(s1_vector))])
    den2 = sum([s2_vector[sid]**2 for sid in range(len(s1_vector))])
    
    cosine_sim = num / (math.sqrt(den1)*math.sqrt(den2))
    return cosine_sim

In [None]:
def get_freqsum_summary(text):
    
    original_sentences, tokenized_sentences = get_tokenized_sents(text)
    tf = get_tf(tokenized_sentences)

    scores = {}
    
    # Get best sentences based on term frequency
    for sid, s in enumerate(tokenized_sentences):
        scores[sid] = sum([tf.get(w,0) for w in s])/len(s)
        print(scores)
    
    #sorted_scores = sorted(scores.items(), key = lambda x : x[1], reverse = True)
    sorted_scores = sorted(scores.items(), key = lambda x : x[1], reverse = False)
    return [original_sentences[s[0]] for s in sorted_scores[0:3]]


In [3]:
def get_freqsum_summary1(text):
    
    original_sentences, tokenized_sentences = get_tokenized_sents(text)
    tf = get_tf(tokenized_sentences)

    scores = {}
    words = {"lungs", "heart"}
    # Get best sentences based on term frequency
    for sid, s in enumerate(tokenized_sentences):
        scores[sid] = sum([tf.get(w,0) for w in s])/len(s)   
        print(scores)
    
    #sorted_scores = sorted(scores.items(), key = lambda x : x[1], reverse = True)
    sorted_scores = sorted(scores.items(), key = lambda x : x[1], reverse = False)
    return [original_sentences[s[0]] for s in sorted_scores[0:3]]


In [None]:
def get_sim_matrix(tokenized_sents, threshold=0.3):
    sim_mat = np.zeros((len(tokenized_sents), len(tokenized_sents)))
    for s1_id, s1 in enumerate(tokenized_sents):
        for s2_id, s2 in enumerate(tokenized_sents):
            if word_overlap(s1, s2) >= threshold:
                sim_mat[s1_id, s2_id] = 1
    return sim_mat

In [None]:
def get_degree_centrality_summary(text, threshold = 0.3):
        
    original_sentences, tokenized_sentences = get_tokenized_sents(text)
    tf = get_tf(tokenized_sentences)    

    sim_mat = get_sim_matrix(tokenized_sentences, threshold)
    degree_centrality = sim_mat.sum(axis=1)

    scores = {}

    for id, d in enumerate(degree_centrality):
        scores[id] = d
    
    sorted_scores = sorted(scores.items(), key = lambda x : x[1], reverse = True)
    
    return [original_sentences[s[0]] for s in sorted_scores[0:3]]

In [None]:
def power_method(text, threshold=0.3, lam=0.15, max_num_iter = 100):

    delta = 2
    epsilon = 0.0001
    
    original_sentences, tokenized_sentences = get_tokenized_sents(text)
    tf = get_tf(tokenized_sentences)    
    num_sents = len(original_sentences)
    
    sim_mat = lam/len(original_sentences) + (1-lam)*get_sim_matrix(tokenized_sentences, threshold)
    degree = np.sum(sim_mat, axis=1)

    sim_mat_norm = sim_mat/sim_mat.sum(axis=1)

    original_scores = np.array([1.0/num_sents for _ in original_sentences])

    num_iter = 0
    
    while delta > epsilon:
        #print(original_scores)
        print(sim_mat_norm.sum(axis=0))
        new_scores = np.matmul(sim_mat_norm, original_scores)
        #print(new_scores)
        
        delta = np.mean(abs(new_scores-original_scores))
        original_scores = new_scores
        
        print("Iteration :{}, Delta: {}".format(num_iter, delta))

        num_iter += 1
        if num_iter > max_num_iter:
            # Break if required delta not achieved in fixed iterataions            
            break

    scores = {}
    for id, d in enumerate(new_scores):
        scores[id] = d
    
    sorted_scores = sorted(scores.items(), key = lambda x : x[1], reverse = True)
    
    return [original_sentences[s[0]] for s in sorted_scores[0:3]]  

In [None]:
# Read a custom File
with open('./covid.txt') as f:
    text = f.read()

In [None]:
text

"Coronavirus disease 2019 (COVID-19) is a contagious disease caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). The first case was identified in Wuhan, China, in December 2019. It has since spread worldwide, leading to an ongoing pandemic.\n\nSymptoms of COVID-19 are variable, but often include fever, cough, fatigue, breathing difficulties, and loss of smell and taste. Symptoms begin one to fourteen days after exposure to the virus. Around one in five infected individuals do not develop any symptoms.[9] While most people have mild symptoms, some people develop acute respiratory distress syndrome (ARDS). ARDS can be precipitated by cytokine storms,[10] multi-organ failure, septic shock, and blood clots. Longer-term damage to organs (in particular, the lungs and heart) has been observed. There is concern about a significant number of patients who have recovered from the acute phase of the disease but continue to experience a range of effects—known as long COVID—for m

In [None]:
power_method(text)

Iteration :0, Delta: 0.009536677102536173
Iteration :1, Delta: 0.004118816794532439
Iteration :2, Delta: 0.002324900296814402
Iteration :3, Delta: 0.0017681777797249048
Iteration :4, Delta: 0.001375678336281
Iteration :5, Delta: 0.0010849779838001368
Iteration :6, Delta: 0.0008599845653785319
Iteration :7, Delta: 0.0006864134676947643
Iteration :8, Delta: 0.0005507685584746842
Iteration :9, Delta: 0.00044466962441790694
Iteration :10, Delta: 0.00036102830671455046
Iteration :11, Delta: 0.00029426010482774476
Iteration :12, Delta: 0.0002407723144910499
Iteration :13, Delta: 0.00019776378577883832
Iteration :14, Delta: 0.00016304538850671414
Iteration :15, Delta: 0.00013519104305710213
Iteration :16, Delta: 0.00011253674646943038
Iteration :17, Delta: 9.397055364550157e-05


['It can spread as early as two days before infected persons show symptoms, and from individuals who never experience symptoms.',
 '[9] While most people have mild symptoms, some people develop acute respiratory distress syndrome (ARDS).',
 'Symptoms of COVID-19 are variable, but often include fever, cough, fatigue, breathing difficulties, and loss of smell and taste.']

In [None]:
get_degree_centrality_summary(text)

['It can spread as early as two days before infected persons show symptoms, and from individuals who never experience symptoms.',
 '[9] While most people have mild symptoms, some people develop acute respiratory distress syndrome (ARDS).',
 'Symptoms of COVID-19 are variable, but often include fever, cough, fatigue, breathing difficulties, and loss of smell and taste.']

In [None]:
get_freqsum_summary(text)

{0: 2.2142857142857144}
{0: 2.2142857142857144, 1: 1.1428571428571428}
{0: 2.2142857142857144, 1: 1.1428571428571428, 2: 1.5}
{0: 2.2142857142857144, 1: 1.1428571428571428, 2: 1.5, 3: 2.0}
{0: 2.2142857142857144, 1: 1.1428571428571428, 2: 1.5, 3: 2.0, 4: 2.7142857142857144}
{0: 2.2142857142857144, 1: 1.1428571428571428, 2: 1.5, 3: 2.0, 4: 2.7142857142857144, 5: 2.7142857142857144}
{0: 2.2142857142857144, 1: 1.1428571428571428, 2: 1.5, 3: 2.0, 4: 2.7142857142857144, 5: 2.7142857142857144, 6: 2.727272727272727}
{0: 2.2142857142857144, 1: 1.1428571428571428, 2: 1.5, 3: 2.0, 4: 2.7142857142857144, 5: 2.7142857142857144, 6: 2.727272727272727, 7: 1.0909090909090908}
{0: 2.2142857142857144, 1: 1.1428571428571428, 2: 1.5, 3: 2.0, 4: 2.7142857142857144, 5: 2.7142857142857144, 6: 2.727272727272727, 7: 1.0909090909090908, 8: 1.0}
{0: 2.2142857142857144, 1: 1.1428571428571428, 2: 1.5, 3: 2.0, 4: 2.7142857142857144, 5: 2.7142857142857144, 6: 2.727272727272727, 7: 1.0909090909090908, 8: 1.0, 9: 1.43

['Other people are infected if the virus gets into their mouth, nose or eyes.',
 'It can spread as early as two days before infected persons show symptoms, and from individuals who never experience symptoms.',
 '[9] While most people have mild symptoms, some people develop acute respiratory distress syndrome (ARDS).']

In [None]:
get_freqsum_summary(text)

{0: 2.2142857142857144}
{0: 2.2142857142857144, 1: 1.1428571428571428}
{0: 2.2142857142857144, 1: 1.1428571428571428, 2: 1.5}
{0: 2.2142857142857144, 1: 1.1428571428571428, 2: 1.5, 3: 2.0}
{0: 2.2142857142857144, 1: 1.1428571428571428, 2: 1.5, 3: 2.0, 4: 2.7142857142857144}
{0: 2.2142857142857144, 1: 1.1428571428571428, 2: 1.5, 3: 2.0, 4: 2.7142857142857144, 5: 2.7142857142857144}
{0: 2.2142857142857144, 1: 1.1428571428571428, 2: 1.5, 3: 2.0, 4: 2.7142857142857144, 5: 2.7142857142857144, 6: 2.727272727272727}
{0: 2.2142857142857144, 1: 1.1428571428571428, 2: 1.5, 3: 2.0, 4: 2.7142857142857144, 5: 2.7142857142857144, 6: 2.727272727272727, 7: 1.0909090909090908}
{0: 2.2142857142857144, 1: 1.1428571428571428, 2: 1.5, 3: 2.0, 4: 2.7142857142857144, 5: 2.7142857142857144, 6: 2.727272727272727, 7: 1.0909090909090908, 8: 1.0}
{0: 2.2142857142857144, 1: 1.1428571428571428, 2: 1.5, 3: 2.0, 4: 2.7142857142857144, 5: 2.7142857142857144, 6: 2.727272727272727, 7: 1.0909090909090908, 8: 1.0, 9: 1.43

['Longer-term damage to organs (in particular, the lungs and heart) has been observed.',
 'The standard diagnosis method is by real-time reverse transcription polymerase chain reaction (rRT-PCR) from a nasopharyngeal swab.',
 'ARDS can be precipitated by cytokine storms,[10] multi-organ failure, septic shock, and blood clots.']

In [None]:
get_degree_centrality_summary(text)

intersection: {'acute', 'syndrome', 'disease', '2019', 'severe', 'respiratory', 'covid-19', 'contagious', '2', 'caused', 'sars-cov-2', 'coronavirus'}
intersection: {'2019'}
intersection: set()
intersection: {'covid-19'}
intersection: set()
intersection: set()
intersection: {'syndrome', 'respiratory', 'acute'}
intersection: set()
intersection: set()
intersection: {'disease', 'acute'}
intersection: {'severe'}
intersection: {'covid-19'}
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: {'severe'}
intersection: {'disease'}
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: {'2019'}
intersection: {'wuhan', 'identified', 'case', 'december', '2019', 'first', 'china'}
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: set()
inte

['It can spread as early as two days before infected persons show symptoms, and from individuals who never experience symptoms.',
 '[9] While most people have mild symptoms, some people develop acute respiratory distress syndrome (ARDS).',
 'Symptoms of COVID-19 are variable, but often include fever, cough, fatigue, breathing difficulties, and loss of smell and taste.']

In [4]:
get_freqsum_summary1(text)

NameError: ignored

Answer 3

In [None]:
def word_overlap(s1, s2):
    
    print("intersection:" , set(s1).intersection(set(s2)))
    return len(set(s1).intersection(set(s2)))/(len(s1)+len(s2))

In [None]:
s1= "This is table"
s2= "This is my chair"
word_overlap(word_tokenize(s1), word_tokenize(s2))

intersection: {'is', 'This'}


2

**WORKING WITH CAMBODIA CORPUS**

In [None]:
from google.colab import files
upload = files.upload()

Saving cambodia.txt to cambodia (1).txt


In [None]:

with open('./cambodia.txt') as f:
    text = f.read()

In [None]:
text


"Cambodian leader Hun Sen on Friday rejected opposition parties' demands for talks outside the country, accusing them of trying to ``internationalize'' the political crisis.\nGovernment and opposition parties have asked King Norodom Sihanouk to host a summit meeting after a series of post-election negotiations between the two opposition groups and Hun Sen's party to form a new government failed.\nOpposition leaders Prince Norodom Ranariddh and Sam Rainsy, citing Hun Sen's threats to arrest opposition figures after two alleged attempts on his life, said they could not negotiate freely in Cambodia and called for talks at Sihanouk's residence in Beijing.\nHun Sen, however, rejected that.\n``I would like to make it clear that all meetings related to Cambodian affairs must be conducted in the Kingdom of Cambodia,'' Hun Sen told reporters after a Cabinet meeting on Friday.\n``No-one should internationalize Cambodian affairs.\nIt is detrimental to the sovereignty of Cambodia,'' he said.\nHun 

In [None]:
power_method(text)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
intersection: {'deal'}
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: {'new', 'government'}
intersection: {'parliament'}
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: {'deal'}
intersection: {'parliament'}
intersection: {'parliament'}
intersection: {'parliament'}
intersection: set()
intersection: set()
intersection: set()
intersection: {'parliament'}
intersection: {'new', 'deal', 'government'}
intersection: set()
intersection: {'parliament', 'new'}
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: {'new', 'government'}
intersection: set()
intersection: {'approve', 'parliament', 'vote', 'needed', 'two-thirds', 'new', 'assures', 'deal', 'government'}
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: set()


['After a three-month impasse, they agreed last week to a coalition deal that will make Hun Sen sole prime minister and Ranariddh president of the National Assembly.',
 "Hun Sen's Cambodian People's Party won 64 of the 122 parliamentary seats in July's elections, short of the two-thirds majority needed to form a government on its own.",
 'The assurances were aimed especially at Sam Rainsy, leader of a vocally anti-Hun Sen opposition party, who was forced to take refuge in the U.N. offices in September to avoid arrest after Hun Sen accused him of being behind a plot against his life.']

In [None]:
get_freqsum_summary(text)

{0: 21.736842105263158}
{0: 21.736842105263158, 1: 27.083333333333332}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381, 5: 13.2}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381, 5: 13.2, 6: 21.8}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381, 5: 13.2, 6: 21.8, 7: 31.35}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381, 5: 13.2, 6: 21.8, 7: 31.35, 8: 32.916666666666664}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381, 5: 13.2, 6: 21.8, 7: 31.35, 8: 32.916666666666664, 9: 9.5}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381, 5: 13.2, 6: 21.8, 7: 31.35

['The 75-year-old monarch suffers from a variety ailments and periodically makes extended trips to Beijing.',
 'The monitoring ended Sept. 30.',
 'He was diagnosed with colon cancer in 1993, but it has since gone into remission.']

In [None]:
get_degree_centrality_summary(text)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
intersection: set()
intersection: {'deal'}
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: {'new', 'government'}
intersection: {'parliament'}
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: {'deal'}
intersection: {'parliament'}
intersection: {'parliament'}
intersection: {'parliament'}
intersection: set()
intersection: set()
intersection: set()
intersection: {'parliament'}
intersection: {'new', 'deal', 'government'}
intersection: set()
intersection: {'parliament', 'new'}
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: {'new', 'government'}
intersection: set()
intersection: {'approve', 'parliament', 'vote', 'needed', 'two-thirds', 'new', 'assures', 'deal', 'government'}
intersection: set()
intersection: set()
intersection: set()
intersection: set()


["Hun Sen's Cambodian People's Party won 64 of the 122 parliamentary seats in July's elections, short of the two-thirds majority needed to form a government on its own.",
 'After a three-month impasse, they agreed last week to a coalition deal that will make Hun Sen sole prime minister and Ranariddh president of the National Assembly.',
 'Both Ranariddh and Sam Rainsy have been outside the country since parliament was ceremonially opened on Sep. 24.']

In [None]:
def get_sim_matrix(tokenized_sents, threshold=0.3):
    sim_mat = np.zeros((len(tokenized_sents), len(tokenized_sents)))
    for s1_id, s1 in enumerate(tokenized_sents):
        for s2_id, s2 in enumerate(tokenized_sents):
            if word_overlap(s1, s2) >= threshold:
                sim_mat[s1_id, s2_id] = 1
    return sim_mat

In [None]:
def power_method(text, threshold=0.3, lam=0.15, max_num_iter = 100):

    delta = 2
    epsilon = 0.0001
    
    original_sentences, tokenized_sentences = get_tokenized_sents(text)
    tf = get_tf(tokenized_sentences)    
    num_sents = len(original_sentences)
    
    sim_mat = lam/len(original_sentences) + (1-lam)*get_sim_matrix(tokenized_sentences, threshold)
    degree = np.sum(sim_mat, axis=1)

    sim_mat_norm = sim_mat/sim_mat.sum(axis=1)

    original_scores = np.array([1.0/num_sents for _ in original_sentences])

    num_iter = 0
    
    while delta > epsilon:
        print("original score" ,original_scores)
        #print(sim_mat_norm.sum(axis=0))
        new_scores = np.matmul(sim_mat_norm, original_scores)
        print("new scores" ,new_scores)
        
        delta = np.mean(abs(new_scores-original_scores))
        original_scores = new_scores
        
        print("Iteration :{}, Delta: {}".format(num_iter, delta))

        num_iter += 1
        if num_iter > max_num_iter:
            # Break if required delta not achieved in fixed iterataions            
            break

    scores = {}
    for id, d in enumerate(new_scores):
        scores[id] = d
    
    sorted_scores = sorted(scores.items(), key = lambda x : x[1], reverse = True)
    
    return [original_sentences[s[0]] for s in sorted_scores[0:3]]  

In [None]:
power_method(text)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
intersection: {'deal'}
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: {'new', 'government'}
intersection: {'parliament'}
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: {'deal'}
intersection: {'parliament'}
intersection: {'parliament'}
intersection: {'parliament'}
intersection: set()
intersection: set()
intersection: set()
intersection: {'parliament'}
intersection: {'new', 'deal', 'government'}
intersection: set()
intersection: {'parliament', 'new'}
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: {'new', 'government'}
intersection: set()
intersection: {'approve', 'parliament', 'vote', 'needed', 'two-thirds', 'new', 'assures', 'deal', 'government'}
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: set()


["Cambodian leader Hun Sen on Friday rejected opposition parties' demands for talks outside the country, accusing them of trying to ``internationalize'' the political crisis.",
 "Government and opposition parties have asked King Norodom Sihanouk to host a summit meeting after a series of post-election negotiations between the two opposition groups and Hun Sen's party to form a new government failed.",
 "Opposition leaders Prince Norodom Ranariddh and Sam Rainsy, citing Hun Sen's threats to arrest opposition figures after two alleged attempts on his life, said they could not negotiate freely in Cambodia and called for talks at Sihanouk's residence in Beijing."]

In [None]:
power_method(text)

original score [0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.

["Sam Rainsy, who earlier called Hun Sen's statement ``full of loopholes,'' asked Sihanouk for his help in obtaining a promise from Hun Sen that all members of the Sam Rainsy Party were free from prosecution for their political activities during and after last July's election.",
 "Opposition leaders Prince Norodom Ranariddh and Sam Rainsy, citing Hun Sen's threats to arrest opposition figures after two alleged attempts on his life, said they could not negotiate freely in Cambodia and called for talks at Sihanouk's residence in Beijing.",
 "Worried that party colleagues still face arrest for their politics, opposition leader Sam Rainsy sought further clarification Friday of security guarantees promised by strongman Hun Sen. Sam Rainsy wrote in a letter to King Norodom Sihanouk that he was eager to attend the first session of the new National Assembly on Nov. 25, but complained that Hun Sen's assurances were not strong enough to ease concerns his party members may be arrested upon their 

In [None]:
power_method(text)

original score [0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448 0.00546448
 0.00546448 0.00546448 0.00546448 0.00546448 0.

["Hun Sen's party recently called on Ranariddh to return to the negotiation table and said it was willing to make an ``appropriate concession'' to break the deadlock over forming a government.",
 "Opposition leaders Prince Norodom Ranariddh and Sam Rainsy, citing Hun Sen's threats to arrest opposition figures after two alleged attempts on his life, said they could not negotiate freely in Cambodia and called for talks at Sihanouk's residence in Beijing.",
 "Sam Rainsy, who earlier called Hun Sen's statement ``full of loopholes,'' asked Sihanouk for his help in obtaining a promise from Hun Sen that all members of the Sam Rainsy Party were free from prosecution for their political activities during and after last July's election."]

In [None]:
get_degree_centrality_summary(text, threshold= 0.1)

["Hun Sen's party recently called on Ranariddh to return to the negotiation table and said it was willing to make an ``appropriate concession'' to break the deadlock over forming a government.",
 "Opposition leaders Prince Norodom Ranariddh and Sam Rainsy, citing Hun Sen's threats to arrest opposition figures after two alleged attempts on his life, said they could not negotiate freely in Cambodia and called for talks at Sihanouk's residence in Beijing.",
 "Sam Rainsy, who earlier called Hun Sen's statement ``full of loopholes,'' asked Sihanouk for his help in obtaining a promise from Hun Sen that all members of the Sam Rainsy Party were free from prosecution for their political activities during and after last July's election."]

In [None]:
get_degree_centrality_summary(text, threshold= 0.6)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
intersection: set()
intersection: {'deal'}
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: {'new', 'government'}
intersection: {'parliament'}
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: {'deal'}
intersection: {'parliament'}
intersection: {'parliament'}
intersection: {'parliament'}
intersection: set()
intersection: set()
intersection: set()
intersection: {'parliament'}
intersection: {'new', 'deal', 'government'}
intersection: set()
intersection: {'parliament', 'new'}
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: set()
intersection: {'new', 'government'}
intersection: set()
intersection: {'approve', 'parliament', 'vote', 'needed', 'two-thirds', 'new', 'assures', 'deal', 'government'}
intersection: set()
intersection: set()
intersection: set()
intersection: set()


["Cambodian leader Hun Sen on Friday rejected opposition parties' demands for talks outside the country, accusing them of trying to ``internationalize'' the political crisis.",
 "Government and opposition parties have asked King Norodom Sihanouk to host a summit meeting after a series of post-election negotiations between the two opposition groups and Hun Sen's party to form a new government failed.",
 "Opposition leaders Prince Norodom Ranariddh and Sam Rainsy, citing Hun Sen's threats to arrest opposition figures after two alleged attempts on his life, said they could not negotiate freely in Cambodia and called for talks at Sihanouk's residence in Beijing."]



---



## **ANSWER 7**

In [20]:
# Imports
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string as str
import math

In [21]:
from google.colab import files


uploaded = files.upload()

Saving cambodia.txt to cambodia (1).txt


In [22]:
# Read a custom File
with open('./cambodia.txt') as f:
    text = f.read()

In [23]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [27]:

def get_tf(tokenized_sents):
    tf = {}
    for s in tokenized_sents:
        for w in s:
            tf[w] = tf.get(w,0) + 1
            
    return tf

In [28]:
# Get sentences and words after preprocessing

def get_tokenized_sents(text):

    tokenized_sents = []
    
    # remove frequent words and punctuations
    unwanted_words = stopwords.words('english') + list(str.punctuation)
    
    sents = sent_tokenize(text)
    for s in sents:
        words = word_tokenize(s.lower())
        tokenized_sents.append([w for w in words if w not in unwanted_words])
    
    return sents, tokenized_sents

In [29]:
def get_freqsum_summary2(text):
    
    original_sentences, tokenized_sentences = get_tokenized_sents(text)
    tf = get_tf(tokenized_sentences)

    scores = {}
    words = {"Cambodian", "Rainsy"}
    # Get best sentences based on term frequency
    for sid, s in enumerate(tokenized_sentences):
        scores[sid] = sum([tf.get(w,0) for w in s])/len(s)   
        print(scores)
    
    #sorted_scores = sorted(scores.items(), key = lambda x : x[1], reverse = True)
    sorted_scores = sorted(scores.items(), key = lambda x : x[1], reverse = False)
    return [original_sentences[s[0]] for s in sorted_scores[0:3]]


In [30]:
get_freqsum_summary2(text)

{0: 21.736842105263158}
{0: 21.736842105263158, 1: 27.083333333333332}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381, 5: 13.2}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381, 5: 13.2, 6: 21.8}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381, 5: 13.2, 6: 21.8, 7: 31.35}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381, 5: 13.2, 6: 21.8, 7: 31.35, 8: 32.916666666666664}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381, 5: 13.2, 6: 21.8, 7: 31.35, 8: 32.916666666666664, 9: 9.5}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381, 5: 13.2, 6: 21.8, 7: 31.35

['The 75-year-old monarch suffers from a variety ailments and periodically makes extended trips to Beijing.',
 'The monitoring ended Sept. 30.',
 'He was diagnosed with colon cancer in 1993, but it has since gone into remission.']

**taking another examples**

In [31]:
def get_freqsum_summary2(text):
    
    original_sentences, tokenized_sentences = get_tokenized_sents(text)
    tf = get_tf(tokenized_sentences)

    scores = {}
    words = {"ceremonial", "Paris"}
    # Get best sentences based on term frequency
    for sid, s in enumerate(tokenized_sentences):
        scores[sid] = sum([tf.get(w,0) for w in s])/len(s)   
        print(scores)
    
    #sorted_scores = sorted(scores.items(), key = lambda x : x[1], reverse = True)
    sorted_scores = sorted(scores.items(), key = lambda x : x[1], reverse = False)
    return [original_sentences[s[0]] for s in sorted_scores[0:3]]


In [44]:
get_freqsum_summary2(text)

{0: 21.736842105263158}
{0: 21.736842105263158, 1: 27.083333333333332}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381, 5: 13.2}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381, 5: 13.2, 6: 21.8}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381, 5: 13.2, 6: 21.8, 7: 31.35}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381, 5: 13.2, 6: 21.8, 7: 31.35, 8: 32.916666666666664}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381, 5: 13.2, 6: 21.8, 7: 31.35, 8: 32.916666666666664, 9: 9.5}
{0: 21.736842105263158, 1: 27.083333333333332, 2: 25.0, 3: 46.75, 4: 17.80952380952381, 5: 13.2, 6: 21.8, 7: 31.35

['The 75-year-old monarch suffers from a variety ailments and periodically makes extended trips to Beijing.',
 'The monitoring ended Sept. 30.',
 'He was diagnosed with colon cancer in 1993, but it has since gone into remission.']