### Text Summarizer 

In [1]:
# importing libraries
import pandas as pd
import numpy as np

### Loading the data

In [2]:
with open('aa.txt', 'r') as file:
    data = file.read().replace('\n', '')

### Data Cleaning

In [3]:
import re
import nltk
from nltk.corpus import stopwords

In [4]:
# importing the nltk libraries
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

### Creating the Frequency Table

In [5]:
# preprocessing the input data
# tokeinzing the input
# removal of stop_words and then stemming words
# creating the dictionary for the word frequency table
# using words that are not part of stopwords array

def frequency_table(data):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(data)
    ps = PorterStemmer()
    '''Stemmer: an algorithm to bring words to its root word
    args:
            text(str): original text
       yeilds:
            dict: the next token'''
    freqtable = {}
    
    for word in words:
        word = ps.stem(word)
        if word in stop_words:
            continue
        if word in freqtable:
            freqtable[word] += 1
        else:
            freqtable[word] = 1
    return freqtable

### Scoring each sentence

In [6]:
# assigning score to each sentence
# basic algorithm
# adding freq of every non stop_words in a sentence
def sentence_score(sents, freqtable):
    sent_value = {}
    '''long sentences will have an advantage over short sentences.
    Dividing every sentence score by the number of words in the sentence.'''
    for sent in sents:
        word_count = len(word_tokenize(sent))
        for value in freqtable:
            if value in sent.lower():
                if sent[:10] in sent_value:
                    sent_value[sent[:10]] += freqtable[value]
                else:
                    sent_value[sent[:10]] = freqtable[value]
                
        sent_value[sent[:10]] = sent_value[sent[:10]] // word_count
    return sent_value


### Creating the average value of the summary

In [7]:
# creating Average value of a sentence from original summary_text
# average itself is the best threshold

def averages(sent_value):
    sumvalue = 0
    for ele in sent_value:
        sumvalue += sent_value[ele]
        
    average = int(sumvalue / len(sent_value))
    
    return average

### Generating the summary

In [8]:
# generating the summary

def summary_generate(sents, sent_value, threshold):
    sent_count = 0
    summary = ''
    
    for sent in sents:
        if sent[:10] in sent_value and sent_value[sent[:10]] > (threshold):
            summary += ' ' + sent
            sent_count += 1
            
    return summary

In [9]:
# creating the word frequency table
freq_table = frequency_table(data)

# tokenizing the sentences
'''Sentence tokenizer is already present, for creating the array of sentences, sent_tokenize() is used'''
sents = sent_tokenize(data)

# determining the score of each sentences
sent_scores = sentence_score(sents, freq_table)

#average sentence score is set as threshold, ..can try other too
threshold = averages(sent_scores)

# generating the summary
# for shorter summary the threshold i use is 1 and 1/2 times the average
summary = summary_generate(sents, sent_scores, 1.5*threshold)

# printing the result
print("Original Text: \n\t", data)
print("\nSummarized Text: \n\t", summary)

Original Text: 
	 Britain boosted by Holmes doubleAthletics fans endured a year of mixed emotions in 2004 as stunning victories went hand-in-hand with disappointing defeats and more drugs scandals.Kelly Holmes finally fulfilled her potential by storming to double gold on the track at the Olympic Games. Holmes helped erase the gloom hanging over Team GB after their biggest medal hope, Paula Radcliffe, dropped out of the marathon and then the 10,000m. Britain's men's 4x100m relay team also did their bit by taking a shock gold. Holmes had started the year in disappointing style, falling over in the final of 1500m at the World Indoor Championships where she was favourite. Her Olympic build-up was clouded by self doubt but that proved unfounded as she overhauled rival Maria Mutola to win the 800m - her first global title. Just five days later, the 34-year-old made it double gold in the 1500m. It was the first time in 84 years a Briton has achieved the Olympic middle-distance double. While H