In [11]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import numpy as np

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Aditya
[nltk_data]     lakhani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Aditya
[nltk_data]     lakhani\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
def preprocess(text):
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words("english"))
    preprocessed_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        filtered_words = [word for word in words if word not in stop_words]
        preprocessed_sentences.append(filtered_words)
    return preprocessed_sentences

In [13]:
def sentence_similarity(sentence1, sentence2):
    words1 = set(sentence1)
    words2 = set(sentence2)
    if len(words1) == 0 or len(words2) == 0:
        return 0
    return len(words1.intersection(words2)) / (np.log(len(words1)) + np.log(len(words2)))

def build_similarity_matrix(sentences):
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                similarity_matrix[i][j] = sentence_similarity(sentences[i], sentences[j])
    return similarity_matrix

In [14]:
def pagerank(similarity_matrix, damping=0.85, max_iterations=100, tolerance=0.0001):
    scores = np.ones(len(similarity_matrix))
    for _ in range(max_iterations):
        new_scores = (1 - damping) + damping * np.dot(similarity_matrix.T, scores)
        if np.linalg.norm(scores - new_scores) < tolerance:
            break
        scores = new_scores
    return scores

In [15]:
def generate_summary(text, num_sentences=3):
    preprocessed_sentences = preprocess(text)
    similarity_matrix = build_similarity_matrix(preprocessed_sentences)
    scores = pagerank(similarity_matrix)
    ranked_sentences = sorted(((scores[i], i) for i in range(len(scores))), reverse=True)
    top_sentences_indices = [i for _, i in ranked_sentences[:num_sentences]]
    top_sentences_indices.sort()
    summary = ' '.join([sent_tokenize(text)[i] for i in top_sentences_indices])
    return summary

# Example usage:
file = open("test.txt", "r", encoding = "utf8")
text = file.read()

summary = generate_summary(text)
print(summary)


Thus, Huang Xiaolong was led by Su Yan and Huang Peng to the Martial Hall of the Huang Clan Manor. This old man was none other Huang Xiaolong’s grandfather, Huang Qide, the Huang Clan Manor’s lord for the past forty years, the very person who established the Huang Clan. Huang Ming, Huang Peng and Su Yan also moved forward quickly: "Dad!” 
 Huang Xiaolong and Huang Wei both came forward: "Grandpa.” 
 Huang Qide smiled, then nodded his head at Huang Ming’s group before turning to face the rest of Huang Clan Manor’s people-- he smiled and said: "Do not stand on ceremony.
