In [57]:
from torchmetrics.text.rouge import ROUGEScore
rouge = ROUGEScore()
from pprint import pprint

In [58]:
import spacy

In [59]:

from spacy.lang.en.stop_words import STOP_WORDS
import pandas as pd

In [60]:
from summarization_algorithm import * 

In [61]:
df = pd.read_csv("dataset.csv")
df=df.iloc[:,1:]#remove first column(unnamed col)
df.dropna(inplace=True)
columns_titles = ["original","summary"]
df=df.reindex(columns=columns_titles)
df

Unnamed: 0,original,summary
0,The Daman and Diu administration on Wednesday ...,The Administration of Union Territory Daman an...
1,"From her special numbers to TV?appearances, Bo...",Malaika Arora slammed an Instagram user who tr...
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Hotels in Mumbai and other Indian cities are t...,Hotels in Maharashtra will train their staff t...
4,An alleged suspect in a kidnapping case was fo...,A 32-year-old man on Wednesday was found hangi...
...,...,...
4510,"Mumbai, Feb 23 (PTI) Fruit juice concentrate m...",Fruit juice concentrate maker Rasna is eyeing ...
4511,Former cricketer Sachin Tendulkar was spotted ...,Former Indian cricketer Sachin Tendulkar atten...
4512,"Aamir Khan, whose last film Dangal told the st...","Aamir Khan, while talking about reality shows ..."
4513,Maharahstra Power Minister Chandrashekhar Bawa...,The Maharashtra government has initiated an in...


In [62]:
def preprocessing(text):
    sentences = text

    # Load the model (English) into spaCy
    nlp = spacy.load('en_core_web_sm')

    # Adding 'sentencizer' component to the pipeline
    nlp.add_pipe('sentencizer')

    # Tokenization & Lemmatization
    lemmatized_sentences = []

    doc = nlp(sentences)

    sentences = []
    for sentence in doc.sents:
        sentences.append(sentence.text)
        lemmatized_sentences.append([token.lemma_ for token in sentence])


    # Removing Stop Words & Punctuation 
    filtered_sentences = []

    for sentences_group in lemmatized_sentences:
        filtered = ""

        for sentence in sentences_group:
            sentence_doc = nlp(sentence)
            words_of_sentence = [token.text for token in sentence_doc]

            for token in sentence_doc:
                if token.is_stop == False and token.text.isalpha() == True:
                    filtered += token.text + " "

        filtered_sentences.append(filtered)

    return sentences, filtered_sentences

In [63]:
sentences, filtered_sentences = preprocessing(df.iloc[0,0])

# print("--- Sentences")
# for sentence in sentences:
#     print(sentence, "\n")
    
# print("=================================================================================================================\n")    
    
# print("--- Filtered Sentences")
# for sentence in filtered_sentences:
#     print(sentence, "\n")

In [64]:
type(sentences[0])

str

In [65]:
def calculate_efficiency(predicted_summary,original_summary):
    from torchmetrics.text.rouge import ROUGEScore
    rouge = ROUGEScore()
    from pprint import pprint
    return rouge(predicted_summary, original_summary)

In [66]:
def summarize_with(df, summary_algorithm,size = 2):
    rows, columns = df.shape
    #rows = 200#uncomment this for runtime speed
    summarized_text = []
    for row in range(rows):
        original_text = df.iloc[row,0]
        sentences, filtered_sentences = preprocessing(original_text)#tokenization here
                                  #(filtered_sentences,sentence)
        summary = summary_algorithm(filtered_sentences,sentences,size)
        summarized_text.append(summary)
    summary_df = pd.DataFrame (summarized_text, columns = [f'{summary_algorithm.__name__} summary'])
    return summary_df

In [67]:
luhn = summarize_with(df,luhn_algorithm).reset_index(drop=True)


KeyboardInterrupt: 

In [None]:
text_matching = summarize_with(df,text_matching_algorithm).reset_index(drop=True)


In [None]:
text_matching

Unnamed: 0,text_matching_algorithm summary
0,The union territory?s administration was force...
1,A post shared by Malaika Arora Khan (@malaikaa...
2,IGIMS medical superintendent Dr Manish Mandal ...
3,The group behind the initiative is also develo...
4,A team was sent to Kumar?s village but when he...
...,...
4391,The company has invested Rs 100 crore so far o...
4392,Former cricketer Sachin Tendulkar was spotted ...
4393,"Aamir Khan, whose last film Dangal told the st..."
4394,Also read |Â Maha assures action in hefty pow...


In [68]:
text_matching.to_csv('text_matching.csv') # False: not include index


In [None]:
lsa = summarize_with(df,lsa_summarization).reset_index(drop=True)


In [None]:
LexRank = summarize_with(df,LexRank_algorithm).reset_index(drop=True)

In [None]:
horizontal_concat_df = pd.concat(
    [
        df.reset_index(drop=True),
        luhn.reset_index(drop=True),
        lsa.reset_index(drop=True),
        text_matching.reset_index(drop=True),
        LexRank.reset_index(drop=True)
    ],
    axis=1,
)

horizontal_concat_df

Unnamed: 0,original,summary,luhn_algorithm summary,lsa_summarization summary,text_matching_algorithm summary,LexRank_algorithm summary
0,The Daman and Diu administration on Wednesday ...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...,The RSS is the ideological parent of the rulin...,The union territory?s administration was force...,The union territory?s administration was force...
1,"From her special numbers to TV?appearances, Bo...",Malaika Arora slammed an Instagram user who tr...,A post shared by Malaika Arora Khan (@malaikaa...,(read alimony) money to wear ?short clothes an...,A post shared by Malaika Arora Khan (@malaikaa...,The details of the alimony are only known to M...
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...,"In its response, the management of the autonom...",on the marital declaration form be immediately...,IGIMS medical superintendent Dr Manish Mandal ...,The previous version of the marital declaratio...
3,Hotels in Mumbai and other Indian cities are t...,Hotels in Maharashtra will train their staff t...,Human trafficking is the world's fastest growi...,RESCUE ME APPThe Rescue Me app - to be launche...,The group behind the initiative is also develo...,Hotels in Mumbai and other Indian cities are t...
4,An alleged suspect in a kidnapping case was fo...,A 32-year-old man on Wednesday was found hangi...,"A native of Kasganj in UP, Kumar was unmarried...","Kumar was one of them,?Their relationship ende...",A team was sent to Kumar?s village but when he...,"said a police officer.Kumar was one of them,?"
...,...,...,...,...,...,...
4391,"Mumbai, Feb 23 (PTI) Fruit juice concentrate m...",Fruit juice concentrate maker Rasna is eyeing ...,,,,
4392,Former cricketer Sachin Tendulkar was spotted ...,Former Indian cricketer Sachin Tendulkar atten...,,,,
4393,"Aamir Khan, whose last film Dangal told the st...","Aamir Khan, while talking about reality shows ...",,,,
4394,Maharahstra Power Minister Chandrashekhar Bawa...,The Maharashtra government has initiated an in...,,,,


In [None]:
horizontal_concat_df[horizontal_concat_df.isnull().any(axis=1)]

Unnamed: 0,original,summary,luhn_algorithm summary,lsa_summarization summary,text_matching_algorithm summary,LexRank_algorithm summary
200,Kamal Haasan and Bigg Boss Tamil have been pla...,Political party Pudhiya Thamilagam has filed ?...,,,,
201,"A 14-year-old boy in Andheri, a suburb of Mumb...",A 14-year-old Mumbai boy killed himself on Sat...,,,,
202,The braids of three women in an outer Delhi vi...,After reports of approximately a dozen such ca...,,,,
203,"Los Angeles, Jul 30 (PTI) Actor TJ Miller says...","Actor TJ Miller, who played the superhero's be...",,,,
204,A fire broke out on Monday at Shastri Bhawan i...,A fire broke out on the seventh floor of Delhi...,,,,
...,...,...,...,...,...,...
4391,"Mumbai, Feb 23 (PTI) Fruit juice concentrate m...",Fruit juice concentrate maker Rasna is eyeing ...,,,,
4392,Former cricketer Sachin Tendulkar was spotted ...,Former Indian cricketer Sachin Tendulkar atten...,,,,
4393,"Aamir Khan, whose last film Dangal told the st...","Aamir Khan, while talking about reality shows ...",,,,
4394,Maharahstra Power Minister Chandrashekhar Bawa...,The Maharashtra government has initiated an in...,,,,


In [None]:
def df_sentences_efficiency(df,summary_df):
    sentences_efficiency = []
    columns, rows = summary_df.shape
    for column in range(columns):
        #print(column)
        predicted_summary = summary_df.iloc[column,0]
        original_summary = df.iloc[column,0]
        efficiency_dict = calculate_efficiency(predicted_summary,original_summary)
        #pd.DataFrame(data, columns=['Name','Age','Hat Color'])
        sentences_efficiency.append(efficiency_dict)
        #print(sentences_efficiency)
            
    dataframe = pd.DataFrame(sentences_efficiency)
    return dataframe

In [None]:
luhn_with_scores = pd.concat([luhn,df_sentences_efficiency(df,luhn)],axis = 1)
text_matching_with_scores = pd.concat([text_matching,df_sentences_efficiency(df,text_matching)],axis = 1)
lsa_with_scores = pd.concat([lsa,df_sentences_efficiency(df,lsa)],axis = 1)
LexRank_with_scores = pd.concat([LexRank,df_sentences_efficiency(df,LexRank)],axis = 1)

KeyboardInterrupt: 

In [None]:
lsa_with_scores

Unnamed: 0,luhn_algorithm summary,rouge1_fmeasure,rouge1_precision,rouge1_recall,rouge2_fmeasure,rouge2_precision,rouge2_recall,rougeL_fmeasure,rougeL_precision,rougeL_recall,rougeLsum_fmeasure,rougeLsum_precision,rougeLsum_recall
0,The Daman and Diu administration on Wednesday ...,tensor(0.3370),tensor(1.),tensor(0.2027),tensor(0.3341),tensor(1.),tensor(0.2005),tensor(0.3370),tensor(1.),tensor(0.2027),tensor(0.3370),tensor(1.),tensor(0.2027)
1,A post shared by Malaika Arora Khan (@malaikaa...,tensor(0.3783),tensor(1.),tensor(0.2333),tensor(0.3758),tensor(1.),tensor(0.2313),tensor(0.3783),tensor(1.),tensor(0.2333),tensor(0.3783),tensor(1.),tensor(0.2333)
2,"In its response, the management of the autonom...",tensor(0.3081),tensor(1.),tensor(0.1821),tensor(0.2998),tensor(0.9839),tensor(0.1768),tensor(0.2152),tensor(0.6984),tensor(0.1272),tensor(0.3081),tensor(1.),tensor(0.1821)
3,Human trafficking is the world's fastest growi...,tensor(0.2613),tensor(1.),tensor(0.1503),tensor(0.2557),tensor(0.9875),tensor(0.1468),tensor(0.1645),tensor(0.6296),tensor(0.0946),tensor(0.2613),tensor(1.),tensor(0.1503)
4,"A native of Kasganj in UP, Kumar was unmarried...",tensor(0.3333),tensor(1.),tensor(0.2000),tensor(0.3263),tensor(0.9872),tensor(0.1954),tensor(0.2532),tensor(0.7595),tensor(0.1519),tensor(0.3333),tensor(1.),tensor(0.2000)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,Talking about the controversies surrounding th...,tensor(0.2898),tensor(1.),tensor(0.1694),tensor(0.2816),tensor(0.9833),tensor(0.1643),tensor(0.2898),tensor(1.),tensor(0.1694),tensor(0.2898),tensor(1.),tensor(0.1694)
196,A post shared by Bhumi Pednekar (@psbhumi) on ...,tensor(0.3567),tensor(1.),tensor(0.2171),tensor(0.3497),tensor(0.9880),tensor(0.2124),tensor(0.2463),tensor(0.6905),tensor(0.1499),tensor(0.3567),tensor(1.),tensor(0.2171)
197,RULES TO PROBE CORRUPTION CASES AMENDED Changi...,tensor(0.5978),tensor(1.),tensor(0.4264),tensor(0.5902),tensor(0.9908),tensor(0.4202),tensor(0.4783),tensor(0.8000),tensor(0.3411),tensor(0.4837),tensor(0.8091),tensor(0.3450)
198,Opposition Congress said in Rajya Sabha on Mon...,tensor(0.4070),tensor(1.),tensor(0.2555),tensor(0.4035),tensor(1.),tensor(0.2527),tensor(0.4070),tensor(1.),tensor(0.2555),tensor(0.4070),tensor(1.),tensor(0.2555)
