In [13]:
import csv
from operator import itemgetter
import pandas as pd
from rouge_score import rouge_scorer
import numpy as np
import networkx as nx
# Initialize Rouge Scorer
scorer = rouge_scorer.RougeScorer(['rougeL'])
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
data_encoding = 'utf-8'

data_test_ = []
with open('data/validation.csv', 'r', encoding=data_encoding) as file:
    reader = csv.reader(file)
    for row in reader:
        data_test_.append(row)

data_test = data_test_[1:]

# First idea : use the first sentence

In [9]:
rouge_scores = []

# Function that generates summaries using LEAD-N
def lead_summary(text):
    sentences = text.split('.')
    return sentences[0] + '.'

for i in range(len(data_test)):
    row = data_test[i][0]
    summary = lead_summary(row)
    scores = scorer.score(summary, data_test[i][1])['rougeL'][2]
    rouge_scores.append(scores)

print("Average Rouge-L Score: ", sum(rouge_scores)/len(rouge_scores))

Average Rouge-L Score:  0.1535873817959459


# Second idea : use td-idf to find an important sentence

In [11]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import re

def reformulate_sentences(text, top_words, window_size=2):
    # Tokenizing the sentences
    sentences = re.split(r'[.!?]', text)
    
    reformulated_sentences = []
    sentence_positions = []
    total_length = 0
    
    for sentence in sentences:
        # Checking if the sentence contains any of the important words
        if any(word in sentence.lower() for word, _ in top_words):
            # Extracting words from the sentence
            words = sentence.split()
            # Finding indices of important words in the sentence
            important_word_indices = [i for i, word in enumerate(words) if word.lower() in [word for word, _ in top_words]]
            # Reformulating the sentence by highlighting the important words
            for index in important_word_indices:
                start_index = max(0, index - window_size)
                end_index = min(len(words), index + window_size + 1)
                reformulated_sentence = ' '.join(words[start_index:end_index])
                reformulated_sentences.append(reformulated_sentence)
                sentence_positions.append(total_length + index)
            # Removing words already processed
            top_words = [(word, score) for word, score in top_words if word.lower() not in sentence.lower()]

            total_length += len(words)

    # Cleaning to avoid redundancies
    fusion_pre = ' '.join(reformulated_sentences)
    words = fusion_pre.split()
    words = [word.lower() for word in words]
    words = list(set(words))
    words_flag = {word: 0 for word in words}
    fusion = ''
    for word in fusion_pre.split():
        if words_flag[word.lower()] == 0:
            fusion += word + ' '
            words_flag[word.lower()] = 1
    
    return fusion

def gather_previous_code(data_train, window_size=2, number_words=10):
    score_average = 0
    vectorizer = TfidfVectorizer()
    vectorizer.fit([x[0] for x in data_train])
    for text, summary in data_train:
        tokens = word_tokenize(text.lower()) 
        tokens = [word for word in tokens if word.isalnum() and word not in stopwords.words('english')]
        cleaned_text = ' '.join(tokens)
        tfidf_matrix = vectorizer.transform([cleaned_text])
        feature_names = vectorizer.get_feature_names_out()
        tfidf_scores = tfidf_matrix.toarray().flatten()
        word_scores = list(zip(feature_names, tfidf_scores))
        sorted_word_scores = sorted(word_scores, key=lambda x: x[1], reverse=True)
        top_words = sorted_word_scores[:number_words]
        fusion = reformulate_sentences(text, top_words, window_size)
        score = scorer.score(fusion, summary)['rougeL'][2]
        score_average += score
        best_score = 0
        
        for sentence in re.split(r'[.!?]', text):
            score = scorer.score(sentence, summary)['rougeL'][2]
            if score > best_score:
                best_score = score
    return score_average / len(data_train)

gather_previous_code(data_test[:100], window_size=4, number_words=15)

0.16814216784723096

# third idea : use TextRank

In [None]:
# Extract word vectors
word_embeddings = {}
f = open('glove.6B/glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [None]:
def remove_stopwords(sen):
    stop_words = stopwords.words('french')
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

def rank_sentences(text, word_embeddings):
    sentences = re.split(r'[.]', text)

    # remove punctuations, numbers and special characters
    clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

    # make alphabets lowercase
    clean_sentences = [s.lower() for s in clean_sentences]
    clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

    sentence_vectors = []
    for i in clean_sentences:
        if len(i) != 0:
            v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
        else:
            v = np.zeros((100,))
        sentence_vectors.append(v)
    # similarity matrix
    sim_mat = np.zeros([len(sentences), len(sentences)])

    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]
    try:
        nx_graph = nx.from_numpy_array(sim_mat)
        scores = nx.pagerank(nx_graph, max_iter=1000)
        #ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
        ranked_sentences = ((scores[i],s) for i,s in enumerate(sentences))
        # Extract top 10 sentences as the summary
        return ranked_sentences, True
    except:
        return [], False
    
for i in range(5):
    text = data_test[i][0]
    title = data_test[i][1]
    ranked_sentences = rank_sentences(text, word_embeddings)
    for i in range(5):
        if len(ranked_sentences) > i:
            score_ranked = scorer.score(title, ranked_sentences[i][1])['rougeL'][2]
            print(score_ranked)
    print()