<h1> Imports </h1>

In [1]:
#file_name = "summary Input2.txt"
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import numpy as np
import tqdm as tqdm

In [2]:
import math
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components
from lexrank.utils.text import tokenize
from lexrank.mappings.stopwords import STOPWORDS

In [3]:
from datetime import datetime
def timeNow(text):
    """Print ``text`` followed by the current wall-clock time as HH:MM:SS.

    Used as a lightweight progress marker; nothing is returned.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    print(text, "Current Time =", stamp)

<h1> Pre Processing </h1>

In [4]:
def tokenization(text):
    """Split ``text`` into sentences and build their stop-word-filtered twins.

    Returns:
        A ``(filtered_sentences, sentences)`` tuple. ``sentences`` is the raw
        output of ``sent_tokenize``; ``filtered_sentences`` is what
        ``remove_stop_words`` produces for that same list, so both lists are
        index-aligned.
    """
    raw_sentences = sent_tokenize(text)
    return remove_stop_words(raw_sentences), raw_sentences

In [5]:
def remove_stop_words(sentences):
    """Return one stop-word-free string per input sentence.

    Each sentence is tokenized with ``word_tokenize``; tokens that appear in
    NLTK's English stop-word list are dropped and the survivors are re-joined
    with single spaces. The membership test is an exact string match, so it is
    case-sensitive (presumably the NLTK list is lower-case — verify if
    capitalised stop words such as "The" are expected to be removed).
    """
    english_stops = set(stopwords.words('english'))

    def strip_stops(sentence):
        kept = (token for token in word_tokenize(sentence) if token not in english_stops)
        return ' '.join(kept)

    return [strip_stops(sentence) for sentence in sentences]

<h1> Text Matching </h1>

In [7]:
def text_matching(filtered_sentences, sentences):
    """Score every sentence by the summed corpus frequency of its tokens.

    Token counts are gathered from ``filtered_sentences`` (the stop-word-free
    text); each entry of ``sentences`` is then scored by adding up the counts
    of the tokens it contains (tokens never seen in the filtered text add 0).

    Args:
        filtered_sentences: stop-word-filtered sentence strings.
        sentences: the original sentence strings to be scored.

    Returns:
        dict mapping sentence index -> ``[sentence, normalized_score, rank]``.
        ``normalized_score`` is the score divided by the highest score
        (``0.0`` for every sentence when the highest score is 0) and ``rank``
        is the 0-based position of the score in descending order, with tied
        scores sharing the rank of their first occurrence. An empty input
        yields an empty dict.
    """
    # Corpus-wide token counts taken from the filtered text.
    word_frequencies = {}
    for sentence in filtered_sentences:
        for word in nltk.word_tokenize(sentence):
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    sent_scores = [
        sum(word_frequencies.get(word, 0) for word in nltk.word_tokenize(sentence))
        for sentence in sentences
    ]
    if not sent_scores:
        # Nothing to rank; max() below would raise on an empty list.
        return {}

    max_score = max(sent_scores)

    # One pass over the sorted scores replaces a list.index() call per sentence.
    rank_of = {}
    for position, score in enumerate(sorted(sent_scores, reverse=True)):
        rank_of.setdefault(score, position)

    sentence_map = {}
    for index, (sentence, score) in enumerate(zip(sentences, sent_scores)):
        # Guard against a 0 maximum (no scored token anywhere).
        normalized = score / max_score if max_score else 0.0
        sentence_map[index] = [sentence, normalized, rank_of[score]]
    return sentence_map

<h1> Luhn </h1>

In [8]:
def luhn_algorithm(filtered_sentences, sentences):
    """Score sentences by the total length of their filtered words.

    The score of sentence ``i`` is the sum of ``len(word)`` over the
    whitespace-separated words of ``filtered_sentences[i]``; the entry is
    labelled with the original text ``sentences[i]``.

    Args:
        filtered_sentences: stop-word-filtered sentence strings.
        sentences: original sentence strings, index-aligned with
            ``filtered_sentences``.

    Returns:
        dict mapping sentence index -> ``[sentence, normalized_score, rank]``.
        ``normalized_score`` is the score divided by the highest score
        (``0.0`` everywhere when the highest score is 0) and ``rank`` is the
        0-based position of the score in descending order, with tied scores
        sharing the rank of their first occurrence. An empty input yields an
        empty dict.
    """
    sentence_scores = [
        sum(len(word) for word in sentence.split())
        for sentence in filtered_sentences
    ]
    if not sentence_scores:
        # max() below would raise on an empty list.
        return {}

    max_score = max(sentence_scores)

    # First position of every distinct score in the descending ordering.
    rank_of = {}
    for position, score in enumerate(sorted(sentence_scores, reverse=True)):
        rank_of.setdefault(score, position)

    sentence_map = {}
    for index, score in enumerate(sentence_scores):
        # All filtered sentences may be empty, which makes the maximum 0.
        normalized = score / max_score if max_score else 0.0
        sentence_map[index] = [sentences[index], normalized, rank_of[score]]
    return sentence_map

<h1> Latent Semantic Analysis </h1>

In [9]:
def create_tf_idf(filtered_sentences):
    """Fit a default ``TfidfVectorizer`` on the sentences and return dense weights.

    Returns the ``.toarray()`` form of the fitted matrix (presumably one row
    per sentence and one column per vocabulary term — confirm against the
    scikit-learn documentation).
    """
    vectorizer = TfidfVectorizer()
    sparse_weights = vectorizer.fit_transform(filtered_sentences)
    return sparse_weights.toarray()

In [10]:
def lsa_algorithm(X, n_components=2, random_state=0):
    """Project ``X`` onto its leading latent components with ``TruncatedSVD``.

    Args:
        X: matrix to decompose (the TF-IDF matrix in this notebook).
        n_components: number of components to keep; defaults to the value the
            notebook has always used.
        random_state: seed handed to ``TruncatedSVD`` so that repeated runs
            give the same projection; pass ``None`` to restore the unseeded
            behaviour.

    Returns:
        The transformed matrix returned by ``TruncatedSVD.transform``.
    """
    svdmodel = TruncatedSVD(n_components=n_components, random_state=random_state)
    svdmodel.fit(X)
    return svdmodel.transform(X)

In [11]:
def lsa_summarization(filtered_sentences, sentences):
    """Rank sentences by their weight on the second LSA component.

    The filtered sentences are turned into TF-IDF rows (``create_tf_idf``),
    reduced with ``lsa_algorithm`` and column 1 of the result is used as the
    raw score, which is then min-max normalised.

    Args:
        filtered_sentences: stop-word-filtered sentence strings.
        sentences: original sentence strings, index-aligned with
            ``filtered_sentences``.

    Returns:
        dict mapping sentence index -> ``[sentence, normalized_score, rank]``
        where ``rank`` is the 0-based position in descending score order (ties
        share the rank of their first occurrence). When every raw score is
        identical the normalised scores are all ``0.0`` instead of NaN. An
        empty ``sentences`` yields an empty dict.
    """
    if len(sentences) == 0:
        return {}

    X = create_tf_idf(filtered_sentences)
    result = lsa_algorithm(X)
    scores = np.asarray(result[:, 1], dtype=float)

    lowest = scores.min()
    span = scores.max() - lowest
    if span > 0:
        normalized = (scores - lowest) / span
    else:
        # Constant scores: min-max scaling would divide by zero.
        normalized = np.zeros_like(scores)

    # rank = number of strictly larger scores, i.e. the first index the value
    # would have in a descending sort.
    ascending = np.sort(normalized)
    ranks = len(ascending) - np.searchsorted(ascending, normalized, side="right")

    sentence_map = {}
    for i in range(len(normalized)):
        sentence_map[i] = [sentences[i], normalized[i], int(ranks[i])]
    return sentence_map

<h1> Text Rank </h1>

In [12]:
# Cache of word -> embedding vector; starts empty and is filled by the loader cell.
word_embeddings = dict()

In [13]:
# Load the embedding file only when the cache is still empty, so re-running
# this cell does not re-read the file.
if not word_embeddings:
    print("Loading Big File...")
    # `with` closes the handle even if a line fails to parse.
    with open('glove.6B.100d.txt', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0].
                continue
            # First field is the word, the remaining fields are its coefficients.
            word_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    print("Finished Loading Big File.")
def textRank(filtered_sentences, sentences):
    """Rank sentences with PageRank over embedding-based cosine similarity.

    Every filtered sentence becomes the sum of its word vectors (looked up in
    the module-level ``word_embeddings``) divided by ``word_count + 0.001``.
    The pairwise cosine similarities, with the diagonal zeroed, form a weighted
    graph whose PageRank scores are divided by the highest score.

    Words missing from ``word_embeddings`` are retried in lower case before
    falling back to a zero vector (the ``glove.6B`` file this notebook loads is
    presumably lower-cased — confirm against the embedding source).

    Args:
        filtered_sentences: stop-word-filtered sentence strings.
        sentences: original sentence strings, index-aligned with
            ``filtered_sentences``.

    Returns:
        dict mapping sentence index -> ``[sentence, normalized_score, rank]``
        where ``rank`` is the 0-based position in descending score order (ties
        share the rank of their first occurrence). An empty ``sentences``
        yields an empty dict.
    """
    count = len(sentences)
    if count == 0:
        return {}

    # Imported locally and aliased: this notebook also defines its own
    # four-argument cosine_similarity helper at module level.
    import networkx as nx
    from sklearn.metrics.pairwise import cosine_similarity as pairwise_cosine

    # Take the vector size from the loaded embeddings; 100 matches the file
    # the notebook loads and is used when nothing is loaded.
    dim = len(next(iter(word_embeddings.values()))) if word_embeddings else 100
    zero_vector = np.zeros(dim)

    def embed(word):
        vector = word_embeddings.get(word)
        if vector is None:
            vector = word_embeddings.get(word.lower(), zero_vector)
        return vector

    sentence_vectors = np.zeros((count, dim))
    for row, text in zip(range(count), filtered_sentences):
        words = text.split()
        if words:
            total = np.sum([embed(word) for word in words], axis=0)
            sentence_vectors[row] = total / (len(words) + 0.001)

    # One vectorised call instead of one sklearn call per sentence pair.
    sim_mat = pairwise_cosine(sentence_vectors)
    np.fill_diagonal(sim_mat, 0.0)

    nx_graph = nx.from_numpy_array(sim_mat)
    page_scores = nx.pagerank(nx_graph)

    raw = np.array([page_scores[index] for index in range(count)], dtype=float)
    top = raw.max()
    scaled = raw / top if top > 0 else np.zeros_like(raw)

    # rank = number of strictly larger scores.
    ascending = np.sort(scaled)
    ranks = count - np.searchsorted(ascending, scaled, side="right")

    sent_map = {}
    for index in range(count):
        sent_map[index] = [sentences[index], float(scaled[index]), int(ranks[index])]
    return sent_map

Loading Big File...
Finished Loading Big File.


In [14]:
def createEnsembledic(tm_dic,luhn_dic,lsa_dic,tr_dic,lex_dic):
    """Combine the five per-algorithm score maps into one ensemble map.

    Each argument maps a sentence index to a list whose item 0 is the sentence
    and item 1 is that algorithm's score. The keys of ``tm_dic`` drive the
    iteration, so the other maps must contain the same keys.

    Returns:
        ``(ensemble_dic, finalScores)`` where ``ensemble_dic[key]`` is
        ``[summed_score, [tm, luhn, lsa, tr, lex], sentence]`` and
        ``finalScores`` lists the summed scores in iteration order.
    """
    ensemble_dic = {}
    finalScores = []
    for key, tm_entry in tm_dic.items():
        tm = tm_entry[1]
        luhn = luhn_dic[key][1]
        lsa = lsa_dic[key][1]
        tr = tr_dic[key][1]
        lex = lex_dic[key][1]
        total = tm + luhn + lsa + tr + lex
        finalScores.append(total)
        ensemble_dic[key] = [total, [tm, luhn, lsa, tr, lex], tm_entry[0]]
    return ensemble_dic, finalScores

In [15]:
def ensembleSummary(ensemble_dic,finalScores,size):
    """Return the sentences holding the ``size`` highest ensemble scores.

    Args:
        ensemble_dic: map of sentence index -> ``[score, per_algorithm_scores,
            sentence]``.
        finalScores: the ensemble scores; the ``size`` largest of them decide
            which sentences are selected.
        size: number of sentences wanted in the summary.

    Returns:
        List of selected sentences in ``ensemble_dic`` iteration order. When
        several sentences tie on a selected score, only as many of them as the
        score occurs among the top ``size`` values are taken (earliest first),
        so the summary never grows beyond the selected score count.
    """
    top_scores = sorted(finalScores, reverse=True)[:size]

    # How many sentences may still be picked for each selected score value.
    remaining = {}
    for score in top_scores:
        remaining[score] = remaining.get(score, 0) + 1

    summary = []
    for key in ensemble_dic:
        score = ensemble_dic[key][0]
        if remaining.get(score, 0) > 0:
            remaining[score] -= 1
            summary.append(ensemble_dic[key][2])
    return summary

<h1> Lex Rank </h1>

In [16]:
def LexRank_algorithm(filtered_sentences,sentences,size=5,threshold = 0.095):
    """Rank sentences with a LexRank-style centrality over TF-IDF similarity.

    A TF-IDF matrix is built from ``filtered_sentences``, the symmetric
    similarity matrix is filled with the notebook's ``cosine_similarity``
    helper, ``rank_sentences`` turns it into raw scores, and the scores are
    min-max normalised.

    Args:
        filtered_sentences: stop-word-filtered sentence strings.
        sentences: original sentence strings, index-aligned with
            ``filtered_sentences``.
        size: only validated (must be a positive ``int``); it does not change
            the returned map.
        threshold: similarity threshold forwarded to ``rank_sentences``.

    Returns:
        dict mapping sentence index -> ``[sentence, normalized_score, rank]``
        where ``rank`` is the 0-based position in descending score order (ties
        share the rank of their first occurrence). When every raw score is
        identical the normalised scores are all ``0.0`` instead of NaN.

    Raises:
        ValueError: if ``size`` is not a positive integer.
    """
    if not isinstance(size, int) or size < 1:
        raise ValueError('\'summary_size\' should be a positive integer')

    tf_idf = TfidfVectorizer().fit_transform(filtered_sentences).toarray()
    length = len(tf_idf)
    similarity_matrix = np.zeros([length] * 2)

    # Only the upper triangle is computed; the matrix is mirrored as it is filled.
    for i in range(length):
        for j in range(i, length):
            similarity = cosine_similarity(tf_idf[i], tf_idf[j], i, j)
            similarity_matrix[i, j] = similarity
            similarity_matrix[j, i] = similarity

    scores = np.asarray(rank_sentences(sentences, similarity_matrix, threshold), dtype=float)
    if scores.size == 0:
        return {}

    lowest = scores.min()
    span = scores.max() - lowest
    if span > 0:
        normalized = (scores - lowest) / span
    else:
        # Constant scores: min-max scaling would divide by zero.
        normalized = np.zeros_like(scores)

    # rank = number of strictly larger scores.
    ascending = np.sort(normalized)
    ranks = len(ascending) - np.searchsorted(ascending, normalized, side="right")

    sentence_map = {}
    for i in range(len(normalized)):
        sentence_map[i] = [sentences[i], normalized[i], int(ranks[i])]
    return sentence_map

def connected_nodes(matrix):
    """Group node indices by the connected component they belong to.

    Args:
        matrix: adjacency/weight matrix accepted by
            ``scipy.sparse.csgraph.connected_components``.

    Returns:
        List with one index array per component label, in ascending label
        order.
    """
    _, labels = connected_components(matrix)
    return [np.where(labels == tag)[0] for tag in np.unique(labels)]

def cosine_similarity(list_1, list_2,i,j):
    """Cosine similarity of two vectors, short-circuited by their indices.

    Args:
        list_1, list_2: the two vectors.
        i, j: positions of the vectors in the caller's collection; equal
            positions return ``1`` without looking at the vectors.

    Returns:
        ``1`` when ``i == j``, ``0`` when the dot product passes
        ``math.isclose(dot, 0)``, otherwise the dot product divided by the
        product of the two Euclidean norms.
    """
    if i == j:
        return 1

    inner = np.dot(list_1, list_2)
    if math.isclose(inner, 0):
        return 0

    norm_product = np.linalg.norm(list_1) * np.linalg.norm(list_2)
    return inner / norm_product
    
    
def stationary_distribution(transition_matrix,normalized=True):
    """Assemble per-component power-method vectors into one score vector.

    The matrix is split into connected components (``connected_nodes``); for
    each component the matching sub-matrix is passed to ``_power_method`` and
    the result is written into the component's positions.

    Args:
        transition_matrix: square 2-D array.
        normalized: when true, the assembled vector is divided by the number
            of rows.

    Raises:
        ValueError: if the matrix is not square.
    """
    rows, cols = transition_matrix.shape
    if rows != cols:
        raise ValueError('\'transition_matrix\' should be square')

    distribution = np.zeros(rows)
    for component in connected_nodes(transition_matrix):
        sub_matrix = transition_matrix[np.ix_(component, component)]
        distribution[component] = _power_method(sub_matrix)

    if not normalized:
        return distribution
    distribution /= rows
    return distribution

def _power_method(transition_matrix):
    sentences_count = len(transition_matrix)
    eigenvector = np.ones(sentences_count)
    if len(eigenvector) == 1:
        return eigenvector
    transposed_matrix = transition_matrix.T
    lambda_val = 1.0

    while np.allclose(lambda_val, eigenvector):
        eigenvector_next = np.dot(transposed_matrix, eigenvector)
        lambda_val = np.linalg.norm(np.subtract(eigenvector_next, eigenvector))
        eigenvector = eigenvector_next
    return eigenvector


def create_markov_matrix(weights_matrix):
    """Divide every row of a square weight matrix by its row sum.

    Raises:
        ValueError: if ``weights_matrix`` is not square.
    """
    rows, cols = weights_matrix.shape
    if rows != cols:
        raise ValueError('\'weights_matrix\' should be square')

    # keepdims keeps the sums as a column so the division broadcasts per row.
    row_totals = weights_matrix.sum(axis=1, keepdims=True)
    normalized_rows = weights_matrix / row_totals
    return normalized_rows

def create_markov_matrix_discrete(weights_matrix, threshold):
    """Set weights at or above ``threshold`` to 1, then row-normalise.

    The work is done on a copy, so the caller's ``weights_matrix`` is left
    untouched.

    NOTE(review): weights below ``threshold`` keep their original value rather
    than being zeroed, so the matrix is not strictly 0/1 — confirm this is the
    intended variant.

    Args:
        weights_matrix: square 2-D array of pairwise weights.
        threshold: weights greater than or equal to this value become 1.

    Returns:
        The row-normalised matrix produced by ``create_markov_matrix``.
    """
    discrete_weights_matrix = np.array(weights_matrix, dtype=float)
    discrete_weights_matrix[discrete_weights_matrix >= threshold] = 1
    return create_markov_matrix(discrete_weights_matrix)

def degree_centrality_scores(similarity_matrix,threshold=None,increase_power=True):
    """Turn a similarity matrix into centrality scores.

    Args:
        similarity_matrix: square matrix of pairwise similarities.
        threshold: ``None`` to row-normalise the matrix as it is, or a
            ``float`` in ``[0, 1)`` to go through
            ``create_markov_matrix_discrete`` first.
        increase_power: accepted but not used by this implementation.

    Returns:
        The vector returned by ``stationary_distribution`` (normalised).

    Raises:
        ValueError: if ``threshold`` is neither ``None`` nor a float in
            ``[0, 1)``.
    """
    if threshold is None:
        markov_matrix = create_markov_matrix(similarity_matrix)
    else:
        in_range = isinstance(threshold, float) and 0 <= threshold < 1
        if not in_range:
            raise ValueError(
                "'threshold' should be a floating-point number from the interval [0, 1) or None")
        markov_matrix = create_markov_matrix_discrete(similarity_matrix, threshold)

    return stationary_distribution(markov_matrix, normalized=True)

def rank_sentences(sentences,similarity_matrix,threshold=0.03):
    """Return ``degree_centrality_scores`` for the given similarity matrix.

    ``sentences`` is accepted for signature compatibility but is not used.
    """
    return degree_centrality_scores(similarity_matrix, threshold)

<h1> Summarizing the dataset </h1>

In [17]:
def extractSummaryFromDic(dic, size):
    """Pick the ``size`` highest-scoring sentences from a score map.

    Args:
        dic: map with keys ``0 .. len(dic) - 1`` whose values hold the
            sentence at item 0 and its score at item 1.
        size: number of sentences wanted; values larger than ``len(dic)`` are
            clamped and values below 1 give an empty list.

    Returns:
        List of selected sentences. The first ``size`` sentences seed the
        selection; every later sentence whose score is strictly greater than
        the current lowest selected score takes over that slot, so the output
        order follows the slots rather than score or document order.
    """
    size = max(0, min(size, len(dic)))
    summaryScores = [dic[i][1] for i in range(size)]
    summarySent = [dic[i][0] for i in range(size)]
    if size == 0:
        return summarySent

    for i in range(size, len(dic)):
        lowest = min(summaryScores)
        if dic[i][1] > lowest:
            slot = summaryScores.index(lowest)
            summaryScores[slot] = dic[i][1]
            summarySent[slot] = dic[i][0]
    return summarySent

In [18]:
def getTextRanks(filtered_sentences, sentences):
    """Run all five ranking algorithms and merge their scores.

    The rankers are called in the order text matching, Luhn, LSA, TextRank,
    LexRank, and their maps are combined by ``createEnsembledic``.

    Returns:
        Only the ensemble map; the list of summed scores that
        ``createEnsembledic`` also returns is discarded.
    """
    rankers = (text_matching, luhn_algorithm, lsa_summarization, textRank, LexRank_algorithm)
    per_algorithm = [rank(filtered_sentences, sentences) for rank in rankers]
    ensemble_dic, _ = createEnsembledic(*per_algorithm)
    return ensemble_dic

In [19]:
def allCombs(ensemble_dic):
    """Tabulate the score of every algorithm combination for every sentence.

    Args:
        ensemble_dic: map of sentence index -> ``[ensemble_score,
            [tm, luhn, lsa, tr, lex], sentence]``.

    Returns:
        DataFrame with one row per sentence and 31 columns: the ensemble
        score, the five single scores, all pairwise sums, all three-way sums
        and the five four-way scores (computed as the ensemble score minus the
        algorithm left out). Column names are the space-joined algorithm tags.
    """
    tags = ['tm', 'luhn', 'lsa', 'tr', 'lex']
    n = len(tags)
    pairs = [(a, b) for a in range(n) for b in range(a + 1, n)]
    triples = [(a, b, c) for a in range(n) for b in range(a + 1, n) for c in range(b + 1, n)]

    columns = ['ensemble'] + tags
    columns += [' '.join(tags[k] for k in combo) for combo in pairs]
    columns += [' '.join(tags[k] for k in combo) for combo in triples]
    # Four-way columns are named after the four tags that remain.
    columns += [' '.join(tag for pos, tag in enumerate(tags) if pos != left_out)
                for left_out in range(n)]

    rows = []
    for entry in ensemble_dic.values():
        ensemble = entry[0]
        s = [entry[1][k] for k in range(n)]
        row = [ensemble] + s
        row += [s[a] + s[b] for a, b in pairs]
        row += [s[a] + s[b] + s[c] for a, b, c in triples]
        row += [(ensemble - s[left_out]) for left_out in range(n)]
        rows.append(row)

    return pd.DataFrame(rows, columns=columns)

In [20]:
def buildDF(filtered_sentences, sentences):
    """Score the sentences with every algorithm and return the combination table.

    Chains ``getTextRanks`` (ensemble map) into ``allCombs`` (DataFrame).
    """
    return allCombs(getTextRanks(filtered_sentences, sentences))

In [45]:
def summarizeWith(sentences, df, algorithm, percentage):
    """Build a summary from the top-scoring sentences of one score column.

    Args:
        sentences: sentence strings, addressed by the index labels of ``df``.
        df: score table with one row per sentence.
        algorithm: name of the column of ``df`` to rank by.
        percentage: a value ``>= 1`` is taken as an absolute sentence count
            (truncated to ``int``); a value below 1 is taken as the fraction
            of ``len(df)`` to keep.

    Returns:
        The selected sentences joined with single spaces, ordered from the
        highest score downwards (not in document order).
    """
    if percentage >= 1:
        count = int(percentage)
    else:
        count = int(percentage * len(df))

    top_rows = df.nlargest(n=count, columns=[algorithm]).index
    # Join with a space so consecutive sentences do not run into each other.
    return " ".join(sentences[i] for i in top_rows)

In [None]:
# Column names of the combination table, grouped by how many algorithms each
# column combines.
onesCols = [
    'ensemble', 'tm', 'luhn', 'lsa', 'tr', 'lex',
]
twosCols = [
    'tm luhn', 'tm lsa', 'tm tr', 'tm lex',
    'luhn lsa', 'luhn tr', 'luhn lex',
    'lsa tr', 'lsa lex',
    'tr lex',
]
threesCols = [
    'tm luhn lsa', 'tm luhn tr', 'tm luhn lex',
    'tm lsa tr', 'tm lsa lex', 'tm tr lex',
    'luhn lsa tr', 'luhn lsa lex', 'luhn tr lex',
    'lsa tr lex',
]
foursCols = [
    'luhn lsa tr lex',
    'tm lsa tr lex',
    'tm luhn tr lex',
    'tm luhn lsa lex',
    'tm luhn lsa tr',
]