In [1]:
%run summarizationBackbone.ipynb

Loading Big File...
Finished Loading Big File.


In [2]:
from preprocessing_algorithms import *
from tqdm import tqdm

In [3]:
ds = pd.read_csv("Duc_dataset_first_ref_summary.csv")

# Summarization Settings

In [60]:
# Dataset Column Names
articleCol = "Original Article"
summaryCol = "Original Summary"
# Documents to be summarized
start = 0
end = len(ds)
# Summarization Percentage [0-1]% if > 1 acts as number of sentences
percentage = 5
# Pre processing settings
lemmatization = True
remove_stopwords = True
# output file name [MAKE SURE TO CHANGE TO NOT OVERWRITE]
outFileName = "DUC_Weighted_1st_5Sent_lemT_swT.csv"
ppFileName = "PP_DUC_1st_5Sent_lemT_swT.csv"
ppSpecial = "#!@!#"
# Weights Settings
useWeights = True
weights = {"tm":0.92,"luhn":1,"lsa":0.44,"tr":0.84,"lex":0.88,"lda":0.8}

In [90]:
Algorithms = ["Tm","Lex","Luhn","Lsa","Tr","LDA"]
lstCols = get_combinations(Algorithms)
cols = ["Original Article","Original Summary"] + lstCols
df3 = pd.DataFrame(columns = cols)
df3.to_csv(outFileName, index=False)

## Pre Processing

In [16]:
sentences, processed_sentences = process_one_column_df(ds[articleCol],lemmatization,remove_stopwords)

100%|██████████| 49/49 [07:24<00:00,  9.08s/it]


In [62]:
collected = []
for i in range(len(sentences)):
    collected.append([ds[articleCol][i],ds[summaryCol][i],ppSpecial.join(sentences[i]),ppSpecial.join(processed_sentences[i])])

In [68]:
df = pd.DataFrame(collected, columns =[articleCol,summaryCol,"Sentences","Filtered Sent"]) 

In [69]:
df.to_csv(ppFileName, index=False)

In [70]:
data = pd.read_csv(ppFileName)

In [74]:
data.head()

Unnamed: 0,Original Article,Original Summary,Sentences,Filtered Sent
0,\nCambodian leader Hun Sen on Friday rejected ...,Prospects were dim for resolution of the polit...,\nCambodian leader Hun Sen on Friday rejected ...,cambodian leader hun sen friday reject opposit...
1,\nHonduras braced for potential catastrophe Tu...,Hurricane Mitch approached Honduras on Oct. 27...,\nHonduras braced for potential catastrophe Tu...,honduras brace potential catastrophe tuesday h...
2,\nCuban President Fidel Castro said Sunday he ...,Britain caused international controversy and C...,\nCuban President Fidel Castro said Sunday he ...,cuban president fidel castro sunday disagree a...
3,"\nMUNICH, Germany (AP) _ U.S. prosecutors have...",After the bombing of U.S. embassies in East Af...,"\nMUNICH, Germany (AP) _#!@!#U.S. prosecutors ...",munich germany ap#!@!#prosecutor ask 20 - day ...
4,\nIn a critical ruling for the North American ...,In a dispute over a new collective bargaining ...,\nIn a critical ruling for the North American ...,critical rule north american national basketba...


# one by one docs

In [80]:
def onebyoneSummarization(ds):
    summarizedDataset = []
    for i in tqdm(range(start,end,1)):
        try:
            sentences, filtered_sentences = preprocessing_text_with_spacy(ds[articleCol][i],lemmatization,remove_stopwords)
            df = buildDF(filtered_sentences, sentences, useWeights, weights)

            summarizedRow = {"Original Article": ds[articleCol][i],"Original Summary": ds[summaryCol][i]}
            for key in df.keys():
                element = summarizeWith(sentences, df, key, percentage)
                summarizedRow[key] = element
            pd.DataFrame.from_dict(summarizedRow, orient='index').T.to_csv(outFileName,mode='a',header=False,index=False)
            # break
        except Exception as e:
            print("Error",i, e)

In [76]:
def predict(text, percentage):
    try:
        sentences, filtered_sentences = preprocessing_text_with_spacy(text,True,False)
        df = buildDF(filtered_sentences, sentences)
        summarizedRow = {"Original Article": ds[articleCol][i]}
        for key in df.keys():
            element = summarizeWith(sentences, df, key, percentage)
            summarizedRow[key] = element
        return summarizedRow
    except Exception as e:
        print("Error",i, e)

## Doc Level

In [96]:
%run summarizationBackboneDocLvl.ipynb

Loading Big File...
Finished Loading Big File.


In [97]:
lstCols = Algorithms
data = pd.read_csv(ppFileName)

In [98]:
data.head(1)

Unnamed: 0,Original Article,Original Summary,Sentences,Filtered Sent
0,\nCambodian leader Hun Sen on Friday rejected ...,Prospects were dim for resolution of the polit...,\nCambodian leader Hun Sen on Friday rejected ...,cambodian leader hun sen friday reject opposit...


In [99]:
    for i in tqdm(range(start,end,1)):
        try:
            sentences = data["Sentences"][i].split(ppSpecial)
            filtered_sentences = data["Filtered Sent"][i].split(ppSpecial)
            df = buildDF(filtered_sentences, sentences, useWeights, weights)
            df.to_csv("Document " + i, index=False)
            break
        except Exception as e:
            print("Error",i, e)

  2%|▏         | 1/49 [00:22<17:43, 22.15s/it]

                                                  Sent      Tm       Lex  \
0    \nCambodian leader Hun Sen on Friday rejected ...   86.48  0.007123   
1    Government and opposition parties have asked \...  230.00  0.010135   
2    Opposition leaders Prince Norodom \nRanariddh ...   68.08  0.007310   
3                     Hun Sen, however, rejected that.    0.00  0.005800   
4    ``I would like to make \nit clear that all mee...   15.64  0.003379   
..                                                 ...     ...       ...   
183  The deal, which will make Hun Sen \nprime mini...  125.12  0.010797   
184  Sihanouk, recalling \nprocedures used in a pas...   57.96  0.002221   
185  The remaining senators, \nhe said, should be s...   39.56  0.003788   
186  Hun Sen said Monday that the CPP and FUNCINPEC...    1.84  0.007425   
187  Other details of the Senate, including how muc...   17.48  0.003327   

     Luhn       Lsa        Tr       LDA  
0     102 -0.072331  0.004755  0.757687  
1  

  4%|▍         | 2/49 [00:40<15:42, 20.05s/it]

                                                  Sent     Tm       Lex  Luhn  \
0    \nHonduras braced for potential catastrophe Tu...  23.92  0.005200   131   
1    President Carlos Flores Facusse declared a sta...  30.36  0.004171   102   
2    At 0900 GMT Tuesday, Mitch was 95 \nmiles (152...   8.28  0.005641    60   
3    With \nwinds near 180 mph (289 kph), and even ...  28.52  0.003787    40   
4      the highest, most dangerous rating for a storm.  19.32  0.004351    24   
..                                                 ...    ...       ...   ...   
169  Two British ships that were in the area on \na...  16.56  0.003780    68   
170  ``It's a coincidence that the ships \nare ther...  10.12  0.004395    85   
171  Nicaragua said Friday it will accept Cuba's of...  12.88  0.005621    79   
172  Nicaraguan leaders previously had refused \nCu...   5.52  0.002473    75   
173  Nicaragua's leftist Sandinistas, \nwho maintai...  10.12  0.003475   123   

          Lsa        Tr    

  4%|▍         | 2/49 [00:51<20:05, 25.66s/it]


KeyboardInterrupt: 