# BASE

In [1]:
# %run summarizationBackbone.ipynb
%run summarizationBackboneDocLvl.ipynb

Loading Big File...
Finished Loading Big File.


In [2]:
from preprocessing_algorithms import *
from tqdm import tqdm

In [None]:
def predict(text, percentage):
    """Summarize one article with every scoring column produced by ``buildDF``.

    Parameters
    ----------
    text : str
        Raw article text to summarize.
    percentage : float or int
        Summary length, forwarded unchanged to ``summarizeWith``.

    Returns
    -------
    dict or None
        A mapping with the key ``"Original Article"`` (the ``text`` that was
        passed in) plus one entry per key of the object returned by
        ``buildDF``. ``None`` is returned when any step raises; the error is
        printed instead of propagated (best-effort behaviour kept from the
        original).
    """
    try:
        # Pre-processing flags are fixed here (True, False); they are not read
        # from the notebook's settings cell.
        sentences, filtered_sentences = preprocessing_text_with_spacy(text, True, False)
        df = buildDF(filtered_sentences, sentences)
        # Record the text that was actually summarized. The previous version
        # looked up ds[articleCol][i] through notebook globals, which stored an
        # unrelated article (or failed when ``i`` was not defined).
        summarizedRow = {"Original Article": text}
        for key in df.keys():
            summarizedRow[key] = summarizeWith(sentences, df, key, percentage)
        return summarizedRow
    except Exception as e:
        # Do not reference the global loop index here: it may not exist, and a
        # NameError inside the handler would hide the real error.
        print("Error", e)
        return None

In [4]:
# Source dataset; later cells read the columns named by articleCol and summaryCol from it.
# (File name suggests DUC articles paired with their first reference summary.)
ds = pd.read_csv("Duc_dataset_first_ref_summary.csv")

# Summarization Settings

In [5]:
# Dataset column names used to read the article and the reference summary from ds
articleCol = "Original Article"
summaryCol = "Original Summary"
# Documents to be summarized: the loops below iterate over range(start, end, 1)
start = 0
end = len(ds)
# Summary length passed to summarizeWith: a value in [0, 1] is a fraction of the document,
# a value > 1 acts as a number of sentences (interpretation happens inside summarizeWith, not shown here)
percentage = 5
# Pre-processing switches passed to process_one_column_df
lemmatization = True
remove_stopwords = True
# Normalization scope: True -> scale with the min/max over all documents (absmin/absmax),
# False -> scale each document with its own min/max
isNormOnDataset = True 
# Output file names [MAKE SURE TO CHANGE THEM SO EARLIER RESULTS ARE NOT OVERWRITTEN]
# outFileName receives the summaries, ppFileName the pre-processed sentences
outFileName = "DUC_1st_5Sent_lemT_swT.csv"
ppFileName = "PP_DUC_1st_5Sent_lemT_swT.csv"
# Separator used to pack a document's sentence list into a single CSV cell;
# the same token is used to split the cell back into sentences
ppSpecial = "#!@!#"
# Weight settings forwarded to buildDF
# NOTE(review): keys are lower-case ("tm", "lda") while Algorithms uses "Tm", "LDA" - confirm buildDF expects these spellings
useWeights = True
weights = {"tm":0.92,"luhn":1,"lsa":0.44,"tr":0.84,"lex":0.88,"lda":0.8}

In [6]:
# Names of the individual scoring algorithms; later cells use them as column names of the score tables
Algorithms = ["Tm","Lex","Luhn","Lsa","Tr","LDA"]
# Column labels returned by get_combinations for the algorithm list (helper defined elsewhere)
lstCols = get_combinations(Algorithms)
cols = ["Original Article","Original Summary"] + lstCols
# Write a header-only CSV; summary rows are appended to this file later with header=False.
# NOTE: with to_csv's default write mode this replaces any existing file at outFileName,
# so re-running this cell discards previously appended rows.
df3 = pd.DataFrame(columns = cols)
df3.to_csv(outFileName, index=False)

## Pre Processing

In [6]:
# Pre-process every article in one pass. Both results are indexed per document, and each
# entry is a sequence of strings (they are joined with ppSpecial in the next cell).
# This is a slow step; see the progress output below.
sentences, processed_sentences = process_one_column_df(ds[articleCol],lemmatization,remove_stopwords)

100%|██████████| 49/49 [08:33<00:00, 10.48s/it]


In [7]:
# Build one row per document: article, reference summary, then the raw and the
# pre-processed sentence lists, each packed into a single ppSpecial-separated string.
collected = []
for i in range(len(sentences)):
    row = [ds[articleCol][i], ds[summaryCol][i]]
    row.append(ppSpecial.join(sentences[i]))
    row.append(ppSpecial.join(processed_sentences[i]))
    collected.append(row)

In [8]:
# Tabulate the collected rows; "Sentences" and "Filtered Sent" hold the ppSpecial-joined strings
df = pd.DataFrame(collected, columns =[articleCol,summaryCol,"Sentences","Filtered Sent"]) 

In [9]:
# Persist the pre-processed sentences so later cells can read them back from ppFileName
df.to_csv(ppFileName, index=False)

In [10]:
# Read the pre-processed file back from disk
data = pd.read_csv(ppFileName)

In [11]:
# Preview the first rows to check the round-trip through ppFileName
data.head()

Unnamed: 0,Original Article,Original Summary,Sentences,Filtered Sent
0,\nCambodian leader Hun Sen on Friday rejected ...,Prospects were dim for resolution of the polit...,\nCambodian leader Hun Sen on Friday rejected ...,cambodian leader hun sen friday reject opposit...
1,\nHonduras braced for potential catastrophe Tu...,Hurricane Mitch approached Honduras on Oct. 27...,\nHonduras braced for potential catastrophe Tu...,honduras brace potential catastrophe tuesday h...
2,\nCuban President Fidel Castro said Sunday he ...,Britain caused international controversy and C...,\nCuban President Fidel Castro said Sunday he ...,cuban president fidel castro sunday disagree a...
3,"\nMUNICH, Germany (AP) _ U.S. prosecutors have...",After the bombing of U.S. embassies in East Af...,"\nMUNICH, Germany (AP) _#!@!#U.S. prosecutors ...",munich germany ap#!@!#prosecutor ask 20 - day ...
4,\nIn a critical ruling for the North American ...,In a dispute over a new collective bargaining ...,\nIn a critical ruling for the North American ...,critical rule north american national basketba...


# One-by-one document summarization

In [27]:
# def onebyoneSummarization(ds):
#     summarizedDataset = []
#     for i in tqdm(range(start,end,1)):
#         try:
#             # sentences, filtered_sentences = preprocessing_text_with_spacy(ds[articleCol][i],lemmatization,remove_stopwords)
#             # Read Pre Processed Data
#             data = pd.read_csv(ppFileName)
#             sentences = data["Sentences"][i].split(ppSpecial)
#             filtered_sentences = data["Filtered Sent"][i].split(ppSpecial)
#             # build combination dataframe
#             df = buildDF(filtered_sentences, sentences, useWeights, weights)

#             summarizedRow = {"Original Article": ds[articleCol][i],"Original Summary": ds[summaryCol][i]}
#             for key in df.keys():
#                 element = summarizeWith(sentences, df, key, percentage)
#                 summarizedRow[key] = element
#             pd.DataFrame.from_dict(summarizedRow, orient='index').T.to_csv(outFileName,mode='a',header=False,index=False)
#             # break
#         except Exception as e:
#             print("Error",i, e)

In [None]:
# onebyoneSummarization(ds)

100%|██████████| 49/49 [59:45<00:00, 73.17s/it]   


# Doc scoring

In [12]:
# From here on lstCols refers to the single-algorithm list itself (same list object, not a copy)
lstCols = Algorithms
# Reload the pre-processed sentences; the scoring loop splits "Sentences" and "Filtered Sent" on ppSpecial.
# NOTE(review): read_csv parses empty cells as NaN by default, so a document whose joined string is
# empty would come back as a float and fail on .split - confirm no such rows exist.
data = pd.read_csv(ppFileName)

In [14]:
    # Score every document in [start, end): write one CSV of per-sentence scores per
    # document and remember each algorithm's max/min for dataset-level normalization.
    import os

    # Create the output folder up front. Without it every to_csv below fails and is
    # swallowed by the except, after the expensive buildDF call has already run.
    os.makedirs("DUC Sentence Score by Document", exist_ok=True)

    maxes = []
    mines = []
    for i in tqdm(range(start,end,1)):
        try:
            # Unpack the ppSpecial-joined strings back into sentence lists
            sentences = data["Sentences"][i].split(ppSpecial)
            filtered_sentences = data["Filtered Sent"][i].split(ppSpecial)
            df = buildDF(filtered_sentences, sentences, useWeights, weights)
            df.to_csv("DUC Sentence Score by Document/Document " + str(i) + ".csv", index=False)
            # One value per algorithm, in Algorithms order. Iterating over the list
            # (instead of six hard-coded positions) keeps each row aligned with the
            # columns=Algorithms used when these lists are tabulated.
            maxes.append([df[name].max() for name in Algorithms])
            mines.append([df[name].min() for name in Algorithms])
        except Exception as e:
            # Best effort: report the failing document and continue. A failed document
            # contributes no row to maxes/mines and has no score file on disk.
            print("Error",i, e)

100%|██████████| 49/49 [52:37<00:00, 64.44s/it] 


In [15]:
# Persist the per-document maxima and minima, one column per algorithm.
# Documents that raised during scoring contributed no row, so row position may not equal document index.
df = pd.DataFrame(maxes, columns = Algorithms) 
df.to_csv("DUC Sentence Score by Document/max.csv", index=False)
df = pd.DataFrame(mines, columns = Algorithms) 
df.to_csv("DUC Sentence Score by Document/min.csv", index=False)

In [16]:
# Reload the saved extremes from disk (presumably so normalization can be re-run
# without repeating the slow scoring loop - confirm)
maxdf = pd.read_csv("DUC Sentence Score by Document/max.csv")
mindf = pd.read_csv("DUC Sentence Score by Document/min.csv")

In [17]:
# Dataset-wide extremes per algorithm, in Algorithms order: the largest per-document
# maximum and the smallest per-document minimum. Iterating over Algorithms instead of
# six hard-coded positions keeps these lists in step with the algorithm list.
absmax = [maxdf[name].max() for name in Algorithms]
absmin = [mindf[name].min() for name in Algorithms]

# Normalization

In [8]:
def normalize(values, min_value, max_value):
    """Min-max scale ``values`` using the given bounds.

    Each element ``x`` becomes ``(x - min_value) / (max_value - min_value)``.

    Parameters
    ----------
    values : iterable of numbers
        Scores to rescale.
    min_value, max_value : number
        Bounds of the source range. They need not be the extremes of
        ``values`` (dataset-wide bounds may be passed), in which case results
        can fall outside [0, 1].

    Returns
    -------
    list
        The rescaled values, in input order. When ``max_value == min_value``
        the range is degenerate and every element maps to ``0.0`` instead of
        raising ZeroDivisionError (plain numbers) or yielding NaN/inf (numpy
        values).
    """
    span = max_value - min_value
    if span == 0:
        # All scores are identical, so no sentence can be ranked above another.
        return [0.0 for _ in values]
    return [(x - min_value) / span for x in values]

Main function for dataset-level normalization (merged into the next cell)

In [152]:
    # # dataset = []
    # for i in tqdm(range(start,end,1)):
    #     # try:
    #         # Normalization
    #         df = pd.read_csv("DUC Sentence Score by Document/Document "+ str(i) +".csv")
    #         norm2dlist = pd.DataFrame()
    #         norm2dlist["Sent"] = df["Sent"]
    #         for key in range(len(Algorithms)):
    #             norm2dlist[Algorithms[key]] = normalize(df[Algorithms[key]],absmin[key],absmax[key])
    #         # Combinations  
    #         combdf = allCombsDf(norm2dlist)
    #         # Summarization
    #         summarizedRow = {"Original Article": ds[articleCol][i],"Original Summary": ds[summaryCol][i]}
    #         for key in combdf.keys():
    #             element = summarizeWith(df["Sent"], combdf, key, percentage)
    #             summarizedRow[key] = element
    #         pd.DataFrame.from_dict(summarizedRow, orient='index').T.to_csv(outFileName,mode='a',header=False,index=False)
    #     # except Exception as e:
    #     #     print("Error",i, e)
    #     #     break
    #     # dataset.append(norm2dlist)

# Main Function

In [11]:
    # Summarize every document in [start, end) and append one row per document to outFileName.
    # The per-document try/except is left disabled, so the first failure stops the run.
    for i in tqdm(range(start, end, 1)):
        # Per-sentence scores saved under this path by the scoring loop
        df = pd.read_csv("DUC Sentence Score by Document/Document " + str(i) + ".csv")

        # Normalized copy: same sentences, one rescaled column per algorithm
        norm2dlist = pd.DataFrame()
        norm2dlist["Sent"] = df["Sent"]
        for key in range(len(Algorithms)):
            # Dataset-wide bounds when isNormOnDataset is set, otherwise this document's own
            normMax = absmax[key] if isNormOnDataset else df[Algorithms[key]].max()
            normMin = absmin[key] if isNormOnDataset else df[Algorithms[key]].min()
            norm2dlist[Algorithms[key]] = normalize(df[Algorithms[key]], normMin, normMax)

        # Combination table built by allCombsDf from the normalized scores (helper defined elsewhere)
        combdf = allCombsDf(norm2dlist)

        # Output row: article, reference summary, then one summary per key of combdf
        summarizedRow = {"Original Article": ds[articleCol][i], "Original Summary": ds[summaryCol][i]}
        for key in combdf.keys():
            element = summarizeWith(df["Sent"], combdf, key, percentage)
            summarizedRow[key] = element

        # Append the row as a single header-less CSV line
        pd.DataFrame.from_dict(summarizedRow, orient='index').T.to_csv(outFileName, mode='a', header=False, index=False)

100%|██████████| 49/49 [00:13<00:00,  3.67it/s]
