In [1]:
import math
import pandas as pd
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

df=pd.read_csv("news.csv")
df= df.dropna()
df.drop(columns=['published_at','topic','title','source'],axis=0,inplace=True)
df.head()
x=df['content']
x_train, x_test= train_test_split(x,test_size=0.1, random_state=42)


def _create_frequency_table(text_string) -> dict:
   
    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable


def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix


def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix


def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table


def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix


def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()): 
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix


def _score_sentences(tf_idf_matrix) -> dict:
    
    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue


def _find_average_score(sentenceValue) -> int:
    
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    average = (sumValues / len(sentenceValue))

    return average


def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1
    return summary


def run_summarization(text):

    sentences = sent_tokenize(text)
    total_documents = len(sentences)

    freq_matrix = _create_frequency_matrix(sentences)

    tf_matrix = _create_tf_matrix(freq_matrix)

    count_doc_per_words = _create_documents_per_words(freq_matrix)

    idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)

    tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)

    sentence_scores = _score_sentences(tf_idf_matrix)

    threshold = _find_average_score(sentence_scores)
    
    summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)
    return summary


result = run_summarization(x_test.iloc[10])
print(result)
print(x_test.iloc[10])

 BTS is known to keep their private lives closely guarded. Others are discussing that he is such an eternal romantic and a hopeless one at that. But the affair was rather short-lived.
BTS is known to keep their private lives closely guarded. BTS leader RM aka Kim Namjoon has said that when he thinks of love, he does not only think of romantic love but likes to see it in a more all-encompassing manner. He is soon coming on the show, The Mysterious Dictionary Of Useless Human Knowledge. He is one of the anchors for the show that comes on MC TVN channel in South Korea. Kim Namjoon aka BTS RM is seen having a chat where he says that when he thinks of love he thinks of the huge umbrella of emotions that also include dislike and jealousy Another person tells him that once two people get too familiar in love, all the sensations like butterflies in the stomach go missing. BTS RM aka Kim Namjoon looks disappointed, and says it is so sad. On seeing the clip, BTS ARMY has taken to Twitter to expl