In [1]:
%run summarizationBackbone.ipynb

Loading Big File...
Finished Loading Big File.


In [2]:
import os

from preprocessing_algorithms import *
from tqdm import tqdm

In [3]:
# Load the DUC dataset CSV; later cells read its "Original Article" and
# "Original Summary" columns.
ds = pd.read_csv(filepath_or_buffer="Duc_dataset_first_ref_summary.csv")

# Summarization Settings

In [60]:
# --- Dataset column names ---
articleCol = "Original Article"
summaryCol = "Original Summary"

# --- Documents to be summarized: row indices start .. end - 1 ---
start, end = 0, len(ds)

# --- Summary length: a value in [0, 1] is a percentage of the document;
#     a value > 1 acts as a number of sentences ---
percentage = 5

# --- Preprocessing switches ---
lemmatization, remove_stopwords = True, True

# --- Output file names [MAKE SURE TO CHANGE THEM SO EARLIER RESULTS ARE NOT OVERWRITTEN] ---
outFileName = "DUC_Weighted_1st_5Sent_lemT_swT.csv"
ppFileName = "PP_DUC_1st_5Sent_lemT_swT.csv"
# Separator inserted between sentences when they are joined into a single CSV field
ppSpecial = "#!@!#"

# --- Weight settings (one weight per algorithm key) ---
useWeights = True
weights = {
    "tm": 0.92,
    "luhn": 1,
    "lsa": 0.44,
    "tr": 0.84,
    "lex": 0.88,
    "lda": 0.8,
}

In [90]:
# Algorithm labels; later cells also use them as score-column names.
Algorithms = ["Tm", "Lex", "Luhn", "Lsa", "Tr", "LDA"]
# get_combinations is defined outside this notebook section.
lstCols = get_combinations(Algorithms)

# Output header: the two text columns followed by one column per combination.
cols = ["Original Article", "Original Summary"]
cols = cols + lstCols

# Write a header-only CSV; summary rows are appended to this file later.
df3 = pd.DataFrame(columns=cols)
df3.to_csv(path_or_buf=outFileName, index=False)

## Preprocessing

In [16]:
# Run the shared preprocessing helper over the article column.
# (The recorded run took roughly 7.5 minutes for 49 documents.)
sentences, processed_sentences = process_one_column_df(
    ds[articleCol],
    lemmatization,
    remove_stopwords,
)

100%|██████████| 49/49 [07:24<00:00,  9.08s/it]


In [62]:
# Build one record per document: article, reference summary, and the raw and
# processed sentences, each flattened into one string with the ppSpecial separator.
collected = []
for i in range(len(sentences)):
    articleText = ds[articleCol][i]
    referenceSummary = ds[summaryCol][i]
    joinedSentences = ppSpecial.join(sentences[i])
    joinedFiltered = ppSpecial.join(processed_sentences[i])
    collected.append([articleText, referenceSummary, joinedSentences, joinedFiltered])

In [68]:
# Column layout of the preprocessing cache built from the collected records.
ppColumns = [articleCol, summaryCol, "Sentences", "Filtered Sent"]
df = pd.DataFrame(data=collected, columns=ppColumns)

In [69]:
# Persist the preprocessed documents so later sections can reload them
# instead of repeating the preprocessing step.
df.to_csv(path_or_buf=ppFileName, index=False)

In [70]:
# Reload the preprocessing cache that was just written.
data = pd.read_csv(filepath_or_buffer=ppFileName)

In [74]:
# Preview the first five cached rows.
data.head(n=5)

Unnamed: 0,Original Article,Original Summary,Sentences,Filtered Sent
0,\nCambodian leader Hun Sen on Friday rejected ...,Prospects were dim for resolution of the polit...,\nCambodian leader Hun Sen on Friday rejected ...,cambodian leader hun sen friday reject opposit...
1,\nHonduras braced for potential catastrophe Tu...,Hurricane Mitch approached Honduras on Oct. 27...,\nHonduras braced for potential catastrophe Tu...,honduras brace potential catastrophe tuesday h...
2,\nCuban President Fidel Castro said Sunday he ...,Britain caused international controversy and C...,\nCuban President Fidel Castro said Sunday he ...,cuban president fidel castro sunday disagree a...
3,"\nMUNICH, Germany (AP) _ U.S. prosecutors have...",After the bombing of U.S. embassies in East Af...,"\nMUNICH, Germany (AP) _#!@!#U.S. prosecutors ...",munich germany ap#!@!#prosecutor ask 20 - day ...
4,\nIn a critical ruling for the North American ...,In a dispute over a new collective bargaining ...,\nIn a critical ruling for the North American ...,critical rule north american national basketba...


# Summarizing Documents One by One

In [80]:
def onebyoneSummarization(ds):
    """Summarize documents ``start`` .. ``end - 1`` of ``ds`` one at a time.

    Each article is preprocessed, scored with ``buildDF`` and summarized once
    per key of the resulting frame. The finished row is appended to
    ``outFileName`` immediately (without a header), so rows written before a
    failure are kept.

    Relies on the notebook-level settings ``start``, ``end``, ``articleCol``,
    ``summaryCol``, ``lemmatization``, ``remove_stopwords``, ``useWeights``,
    ``weights``, ``percentage`` and ``outFileName``.

    Args:
        ds: Object indexable as ``ds[column][row]`` that holds the articles
            and reference summaries (the notebook passes a pandas DataFrame).

    Returns:
        None. Results are written to ``outFileName``; a document that raises
        is reported with ``print`` and skipped.
    """
    for i in tqdm(range(start, end, 1)):
        try:
            sentences, filtered_sentences = preprocessing_text_with_spacy(
                ds[articleCol][i], lemmatization, remove_stopwords
            )
            df = buildDF(filtered_sentences, sentences, useWeights, weights)

            summarizedRow = {"Original Article": ds[articleCol][i], "Original Summary": ds[summaryCol][i]}
            for key in df.keys():
                summarizedRow[key] = summarizeWith(sentences, df, key, percentage)

            # NOTE(review): values are appended in df.keys() order with no header;
            # presumably this matches the header written to outFileName earlier - confirm.
            pd.DataFrame.from_dict(summarizedRow, orient='index').T.to_csv(outFileName, mode='a', header=False, index=False)
        except Exception as e:
            # Best effort: report the failing document index and move on to the next one.
            print("Error", i, e)

In [76]:
def predict(text, percentage):
    """Summarize a single text once per scoring column produced by ``buildDF``.

    Args:
        text: Raw article text to summarize.
        percentage: Forwarded unchanged to ``summarizeWith`` as the summary size.

    Returns:
        A dict with ``"Original Article"`` mapped to ``text`` plus one entry
        per key of the ``buildDF`` frame holding that key's summary, or
        ``None`` if any step raised (the error is printed).
    """
    try:
        # The two flags sit where other calls pass lemmatization and
        # remove_stopwords, i.e. lemmatization on, stop-word removal off.
        sentences, filtered_sentences = preprocessing_text_with_spacy(text, True, False)
        df = buildDF(filtered_sentences, sentences)
        # Store the text that was actually passed in, not a row of the global dataset.
        summarizedRow = {"Original Article": text}
        for key in df.keys():
            summarizedRow[key] = summarizeWith(sentences, df, key, percentage)
        return summarizedRow
    except Exception as e:
        # No document index exists in this function; report only the error itself.
        print("Error", e)
        return None

## Document-Level Scoring

In [96]:
%run summarizationBackboneDocLvl.ipynb

Loading Big File...
Finished Loading Big File.


In [97]:
# Reload the cached preprocessing output for the document-level pass.
data = pd.read_csv(filepath_or_buffer=ppFileName)
# lstCols now refers to the plain algorithm list (same list object),
# replacing the combinations computed earlier.
lstCols = Algorithms

In [98]:
# Show only the first cached row as a sanity check.
data.head(n=1)

Unnamed: 0,Original Article,Original Summary,Sentences,Filtered Sent
0,\nCambodian leader Hun Sen on Friday rejected ...,Prospects were dim for resolution of the polit...,\nCambodian leader Hun Sen on Friday rejected ...,cambodian leader hun sen friday reject opposit...


In [105]:
    # Score every cached document, save its score table to its own CSV and
    # record the per-algorithm maximum score of that document.
    scoreDir = "DUC Sentence Score by Document"
    # Create the output folder up front; otherwise each to_csv below would fail
    # only after its document has already been scored.
    os.makedirs(scoreDir, exist_ok=True)
    maxes = []
    for i in tqdm(range(start,end,1)):
        try:
            # The cache stores each document's sentences joined with ppSpecial.
            sentences = data["Sentences"][i].split(ppSpecial)
            filtered_sentences = data["Filtered Sent"][i].split(ppSpecial)
            df = buildDF(filtered_sentences, sentences, useWeights, weights)
            df.to_csv(scoreDir + "/Document " + str(i) + ".csv", index=False)
            # One maximum per algorithm column, in Algorithms order.
            maxes.append([df[name].max() for name in Algorithms])
        except Exception as e:
            # Best effort: a failing document is reported and contributes no row to maxes.
            print("Error",i, e)

100%|██████████| 49/49 [42:22<00:00, 51.88s/it] 


In [109]:
# One row of per-algorithm maxima for each successfully scored document;
# saved next to the per-document score files.
df = pd.DataFrame(data=maxes, columns=Algorithms)
df.to_csv(path_or_buf="DUC Sentence Score by Document/max.csv", index=False)

In [110]:
# Largest score of each algorithm across all documents, in Algorithms order.
# Iterating over Algorithms keeps this correct if the list changes length.
absmax = [df[name].max() for name in Algorithms]
absmax

[575.9200000000001,
 0.014408788634744468,
 293,
 0.35690590121592536,
 0.006081142725024658,
 0.7871403290503924]

## Normalization