In [1]:
# Importing all the libraries
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Reading file and preprocessing text data
def read_article(file_name):
    file = open(file_name, "r")
    filedata = file.readlines()
    article = filedata[0].split(". ")
    sentences = []

    for sentence in article:
        print(sentence)
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    sentences.pop()

    return sentences

In [3]:
# sentence similarity
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []

    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    all_words = list(set(sent1 + sent2))

    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)

    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1

    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1

    return 1 - cosine_distance(vector1, vector2)

In [4]:
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))

    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix

In [5]:
def generate_summary(file_name, top_n=5):
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read text anc split it
    sentences =  read_article(file_name)

    # Step 2 - Generate Similary Martix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity martix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
    print("Indexes of top ranked_sentence order are ", ranked_sentence)

    for i in range(top_n):
      summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Offcourse, output the summarize texr
    print("Summarize Text: \n", ". ".join(summarize_text))

In [6]:
# let's begin
generate_summary( "/content/msft.txt", 2)

In an attempt to build an AI-ready workforce, Microsoft announced Intelligent Cloud Hub which has been launched to empower the next generation of students with AI-ready skills
Envisioned as a three-year collaborative program, Intelligent Cloud Hub will support around 100 institutions with AI infrastructure, course content and curriculum, developer support, development tools and give students access to cloud and AI services
As part of the program, the Redmond giant which wants to expand its reach and is planning to build a strong developer ecosystem in India with the program will set up the core AI infrastructure and IoT Hub for the selected campuses
The company will provide AI development tools and Azure AI services such as Microsoft Cognitive Services, Bot Services and Azure Machine Learning.According to Manish Prakash, Country General Manager-PS, Health and Education, Microsoft India, said, "With AI being the defining technology of our time, it is transforming lives and industry and 

In [7]:
# Summary for trump file
generate_summary( r"C:\Users\PULAPA YESHWANTH\Downloads\trump.txt", 2)


WASHINGTON - The Trump administration has ordered the military to start withdrawing roughly 7,000 troops from Afghanistan in the coming months, two defense officials said Thursday, an abrupt shift in the 17-year-old war there and a decision that stunned Afghan officials, who said they had not been briefed on the plans.President Trump made the decision to pull the troops - about half the number the United States has in Afghanistan now - at the same time he decided to pull American forces out of Syria, one official said.The announcement came hours after Jim Mattis, the secretary of defense, said that he would resign from his position at the end of February after disagreeing with the president over his approach to policy in the Middle East.The whirlwind of troop withdrawals and the resignation of Mr
Mattis leave a murky picture for what is next in the United States’ longest war, and they come as Afghanistan has been troubled by spasms of violence afflicting the capital, Kabul, and other i

In [8]:
# summary for fb file
generate_summary(r"C:\Users\PULAPA YESHWANTH\Downloads\fb.txt", 2)

For years, Facebook gave some of the world's largest technology companies more intrusive access to users' personal data than it has disclosed, effectively exempting those business partners from its usual privacy rules, according to internal records and interviews
The special arrangements are detailed in hundreds of pages of Facebook documents obtained by The New York Times
The records, generated in 2017 by the company's internal system for tracking partnerships, provide the most complete picture yet of the social network's data-sharing practices
They also underscore how personal data has become the most prized commodity of the digital age, traded on a vast scale by some of the most powerful companies in Silicon Valley and beyond
The exchange was intended to benefit everyone
Pushing for explosive growth, Facebook got more users, lifting its advertising revenue
Partner companies acquired features to make their products more attractive
Facebook users connected with friends across differen

### 5 articles and it's summary using modified text cleaning

In [9]:
import re

def read_article(file_name):
    file = open(file_name, "r")
    filedata = file.readlines()
    article = filedata[0].split(". ")
    sentences = []

    for sentence in article:
        # Replacing special chracters with space
        specialSentences = re.sub("[^a-zA-Z]", " ", sentence)
        # Uppercase letters to lowercase
        processedsentence = specialSentences.lower().split()
        sentences.append(processedsentence)
    sentences.pop()

    return sentences


In [10]:
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []

    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    all_words = list(set(sent1 + sent2))

    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)

    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1

    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1

    return 1 - cosine_distance(vector1, vector2)

In [11]:
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))

    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix

In [12]:
def generate_summary(file_name, top_n=5):
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read text anc split it
    sentences =  read_article(file_name)

    # Step 2 - Generate Similary Martix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity martix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
    print("Indexes of top ranked_sentence order are ", ranked_sentence)

    for i in range(top_n):
      summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Offcourse, output the summarize texr
    print("Summarize Text: \n", ". ".join(summarize_text))

In [13]:
# Buisiness summary
generate_summary( r"C:\Users\PULAPA YESHWANTH\Downloads\Buisiness.txt", 2)

Indexes of top ranked_sentence order are  [(0.5, ['it', 'encompasses', 'the', 'activities', 'related', 'to', 'the', 'production', 'distribution', 'and', 'exchange', 'of', 'goods', 'and', 'services', 'in', 'order', 'to', 'satisfy', 'the', 'needs', 'and', 'wants', 'of', 'consumers']), (0.5, ['business', 'is', 'a', 'fundamental', 'aspect', 'of', 'modern', 'society', 'driving', 'economic', 'growth', 'innovation', 'and', 'employment', 'opportunities'])]
Summarize Text: 
 it encompasses the activities related to the production distribution and exchange of goods and services in order to satisfy the needs and wants of consumers. business is a fundamental aspect of modern society driving economic growth innovation and employment opportunities


In [14]:
# Education summary
generate_summary( r"C:\Users\PULAPA YESHWANTH\Downloads\Education.txt", 2)

Indexes of top ranked_sentence order are  [(0.5, ['it', 'is', 'the', 'process', 'of', 'acquiring', 'knowledge', 'skills', 'values', 'and', 'attitudes', 'through', 'formal', 'or', 'informal', 'learning', 'experiences']), (0.5, ['education', 'is', 'a', 'fundamental', 'pillar', 'of', 'human', 'development', 'and', 'societal', 'progress'])]
Summarize Text: 
 it is the process of acquiring knowledge skills values and attitudes through formal or informal learning experiences. education is a fundamental pillar of human development and societal progress


In [15]:
# Religion summary
generate_summary(r'C:\Users\PULAPA YESHWANTH\Downloads\Religion.txt',2)

Indexes of top ranked_sentence order are  [(0.5, ['religion', 'is', 'a', 'deeply', 'rooted', 'aspect', 'of', 'human', 'culture', 'and', 'identity', 'encompassing', 'beliefs', 'practices', 'rituals', 'and', 'values', 'that', 'revolve', 'around', 'the', 'existence', 'of', 'a', 'higher', 'power', 'or', 'powers']), (0.5, ['it', 'plays', 'a', 'significant', 'role', 'in', 'shaping', 'individuals', 'worldview', 'moral', 'compass', 'and', 'sense', 'of', 'purpose'])]
Summarize Text: 
 religion is a deeply rooted aspect of human culture and identity encompassing beliefs practices rituals and values that revolve around the existence of a higher power or powers. it plays a significant role in shaping individuals worldview moral compass and sense of purpose


In [16]:
# Sports summary
generate_summary(r'C:\Users\PULAPA YESHWANTH\Downloads\Sports.txt',2)

Indexes of top ranked_sentence order are  [(0.5, ['they', 'encompass', 'a', 'wide', 'range', 'of', 'physical', 'activities', 'and', 'competitive', 'games', 'that', 'involve', 'skill', 'strategy', 'teamwork', 'and', 'athleticism']), (0.5, ['sports', 'play', 'a', 'significant', 'role', 'in', 'society', 'captivating', 'people', 'of', 'all', 'ages', 'and', 'backgrounds'])]
Summarize Text: 
 they encompass a wide range of physical activities and competitive games that involve skill strategy teamwork and athleticism. sports play a significant role in society captivating people of all ages and backgrounds


In [17]:
generate_summary(r'C:\Users\PULAPA YESHWANTH\Downloads\politics.txt',2)

Indexes of top ranked_sentence order are  [(0.5, ['politics', 'plays', 'a', 'crucial', 'role', 'in', 'shaping', 'the', 'governance', 'and', 'decision', 'making', 'processes', 'of', 'a', 'society']), (0.5, ['it', 'encompasses', 'the', 'activities', 'actions', 'and', 'policies', 'through', 'which', 'individuals', 'and', 'groups', 'strive', 'to', 'gain', 'and', 'exercise', 'power', 'in', 'order', 'to', 'influence', 'or', 'control', 'public', 'affairs'])]
Summarize Text: 
 politics plays a crucial role in shaping the governance and decision making processes of a society. it encompasses the activities actions and policies through which individuals and groups strive to gain and exercise power in order to influence or control public affairs
