# First idea: label words as important if they appear in the title

The goal of the two ideas presented here is to turn the generation problem into a classification problem (see the report).

In [1]:
import re
from nltk.corpus import stopwords
import numpy as np
import csv
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils import data
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import unicodedata

from __future__ import unicode_literals, print_function, division
from io import open
import string
import re
import random
from rouge_score import rouge_scorer

# Pick CUDA when it is available ...
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# ... but this second assignment unconditionally overrides the choice above,
# so everything that uses `device` from here on runs on the CPU.
device = torch.device("cpu")

In [None]:
data_encoding = 'utf-8'

# newline='' hands newline handling over to the csv module, as its
# documentation requires: otherwise line breaks embedded in quoted fields
# are not interpreted correctly.
with open('data/train.csv', 'r', encoding=data_encoding, newline='') as file:
    data_train_ = list(csv.reader(file))

with open('data/validation.csv', 'r', encoding=data_encoding, newline='') as file:
    data_test_ = list(csv.reader(file))

# Drop the first row of each file (assumed to be the CSV header - confirm
# against the data files).
data_train = data_train_[1:]
data_test = data_test_[1:]

In [None]:
stop_words = stopwords.words('french')
# Frozen-set copy of the stop-word list so that membership tests inside
# preprocess() are constant-time instead of a linear scan of the list.
_stop_words_lookup = frozenset(stop_words)


def preprocess(text):
    """Normalise a raw text for the word-classification pipeline.

    Steps, in order:
      1. lower-case the text;
      2. drop whitespace-separated tokens that are French stop words;
      3. remove each parenthesised group together with its content;
      4. replace every character that is not an ASCII letter, a digit, a
         space or one of the listed accented letters by a space (this
         removes all punctuation, full stops included);
      5. collapse runs of whitespace into a single space.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string. It may still start or end with a single space.
    """
    text = text.lower()
    text = ' '.join(
        word for word in text.split() if word not in _stop_words_lookup
    )
    # `[^)]*` stops at the first closing parenthesis. The previous greedy
    # `.*` erased everything between the first "(" and the last ")" of the
    # whole text, i.e. also the regular words between two groups.
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'[^a-zA-Z0-9 àâäéèêëîïôöùûüç]', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    return text

In [None]:
# Clean both columns (index 0 and index 1) of every row in each split.
data_train_preprocessed = [(preprocess(row[0]), preprocess(row[1])) for row in data_train]
data_test_preprocessed = [(preprocess(row[0]), preprocess(row[1])) for row in data_test]

MAX_LENGTH_input = 300

# Truncate every training input so that, once the SOS/EOS markers are added,
# it holds at most MAX_LENGTH_input tokens. The second element is untouched.
for idx, (article, title) in enumerate(data_train_preprocessed):
    tokens = ['SOS'] + article.split()[:MAX_LENGTH_input - 2] + ['EOS']
    data_train_preprocessed[idx] = (' '.join(tokens), title)

print(data_train_preprocessed[0])

In [None]:
SOS_token = 0
EOS_token = 1
PAD_token = 2
UNK_token = 3


class Lang:
    """Word <-> index vocabulary with frequency-based pruning.

    The four special tokens SOS, EOS, PAD and UNK always occupy the indices
    0, 1, 2 and 3 (matching the module-level ``*_token`` constants).

    Attributes:
        name: label given at construction time.
        word2index: mapping word -> integer id.
        index2word: mapping integer id -> word.
        word2count: number of times each word was passed to ``addWord``.
        n_words: size of the vocabulary (ids are ``0 .. n_words - 1``).
        threshold_input: default pruning threshold used by callers.
    """

    # Words that are never pruned, listed in id order.
    _SPECIAL_TOKENS = ("SOS", "EOS", "PAD", "UNK")

    def __init__(self, name):
        self.name = name
        self.word2index = {"SOS": 0, "EOS": 1, "PAD": 2, "UNK": 3}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2: "PAD", 3: "UNK"}
        # Very large artificial counts for the special tokens.
        self.word2count["SOS"] = 1000000000
        self.word2count["EOS"] = 100000000
        self.word2count["PAD"] = 10000000
        self.word2count["UNK"] = 1000000
        self.n_words = 4  # Count SOS and EOS and PAD and UNK
        self.threshold_input = 10

    def addSentence(self, sentence):
        """Register every token of ``sentence`` (split on single spaces)."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Add ``word`` to the vocabulary or increment its count."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

    def smaller_vocab(self, threshold):
        """Keep only words seen strictly more than ``threshold`` times.

        Surviving words are renumbered contiguously, in their previous id
        order, right after the special tokens. Without the renumbering the
        survivors kept their old, sparse ids: some ids were >= ``n_words``
        (out of range for an embedding of size ``n_words``) and a later
        ``addWord`` could hand out an id that was already in use.

        The special tokens are kept whatever the threshold, so that the ids
        0-3 stay valid. ``word2count`` is left unchanged.
        """
        special = self._SPECIAL_TOKENS
        survivors = sorted(
            (
                word for word in self.word2index
                if word not in special and self.word2count[word] > threshold
            ),
            key=self.word2index.__getitem__,
        )
        kept = list(special) + survivors
        self.word2index = {word: index for index, word in enumerate(kept)}
        self.index2word = dict(enumerate(kept))
        self.n_words = len(kept)

def readLangs(text):
    """Create an empty ``Lang`` vocabulary whose ``name`` is ``text``.

    Only prints a progress message and builds the object; no word is added.
    """
    print("Reading lines...")
    return Lang(text)

def prepareData(texts):
    """Build a pruned vocabulary from an iterable of sentences.

    Every sentence of ``texts`` is fed to ``Lang.addSentence``; the
    vocabulary is then reduced with ``smaller_vocab`` using the vocabulary's
    own ``threshold_input``.

    Returns:
        The resulting ``Lang`` instance.
    """
    vocabulary = readLangs(texts)
    print("Counting words...")
    for sentence in texts:
        vocabulary.addSentence(sentence)

    vocabulary.smaller_vocab(vocabulary.threshold_input)
    return vocabulary

def indexesFromSentence(lang, sentence):
    """Return the vocabulary ids of the tokens of ``sentence``.

    The sentence is split on single spaces. Tokens missing from
    ``lang.word2index`` are skipped (they are not mapped to UNK), so the
    result can be shorter than the number of tokens.
    """
    vocabulary = lang.word2index
    indexes = []
    for token in sentence.split(' '):
        if token in vocabulary:
            indexes.append(vocabulary[token])
    return indexes
    

In [None]:
# Separate the preprocessed training pairs into their first and second elements.
x = [pair[0] for pair in data_train_preprocessed]
summary = [pair[1] for pair in data_train_preprocessed]

# The vocabulary is built from the first elements (the inputs) only.
input_lang = prepareData(x)

print(input_lang.n_words)
print(data_train_preprocessed[2])

In [None]:
# Convert every training input to vocabulary ids; any id that falls outside
# the vocabulary size is replaced by the UNK id.
data_train_index_x = [
    [
        UNK_token if index >= input_lang.n_words else index
        for index in indexesFromSentence(input_lang, pair[0])
    ]
    for pair in data_train_preprocessed
]

# Right-pad every sequence with PAD up to MAX_LENGTH_input ids.
data_train_index_x = [
    ids + [PAD_token] * (MAX_LENGTH_input - len(ids))
    for ids in data_train_index_x
]

# Binary target per input position: 1 when the word at that position is one
# of the words of the corresponding title, 0 otherwise.
y = []
for token_ids, title in zip(data_train_index_x, summary):
    # Compare against the set of title *words*. Testing `word in title` on the
    # raw string is a substring test, which also flagged e.g. "le" as
    # important whenever the title contained "soleil".
    title_words = set(title.split())
    y.append([
        1 if input_lang.index2word[token_id] in title_words else 0
        for token_id in token_ids
    ])

print(len(data_train_index_x[0]))
print(len(y[0]))

# Inputs are stored as long tensors, the 0/1 targets as float tensors.
data_train_index_x_t = torch.tensor(data_train_index_x, device=device, dtype=torch.long)
data_train_index_y_t = torch.tensor(y, device=device, dtype=torch.float)

# Shuffled mini-batches of 32 (input, target) pairs.
trainset = data.TensorDataset(data_train_index_x_t, data_train_index_y_t)
trainloader = data.DataLoader(dataset=trainset, batch_size=32, shuffle=True)

In [None]:
class WordClassifier(nn.Module):
    """Bidirectional-GRU encoder followed by a two-layer classifier head.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each word embedding.
        hidden_dim: hidden size of the GRU, per direction.
        output_dim: number of values produced for each input sequence.
        dropout: dropout probability applied before each linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: the inputs are (batch, seq_len) id tensors, so the
        # embedded tensor is (batch, seq_len, embedding_dim). With the default
        # layout the GRU treated the batch axis as time and mixed the samples
        # of a mini-batch together.
        self.gru = nn.GRU(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)  # 2x: both directions
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map a (batch, seq_len) tensor of token ids to (batch, output_dim) raw scores.

        The scores are unnormalised (no sigmoid is applied here).
        """
        embedded = self.embedding(x)
        _, hidden = self.gru(embedded)
        # hidden is (num_directions, batch, hidden_dim): hidden[-2] is the final
        # forward state and hidden[-1] the final backward state. Together they
        # summarise the whole sequence, whereas the output at a single position
        # only contains a backward state that has seen one token.
        sequence_summary = torch.cat((hidden[-2], hidden[-1]), dim=1)
        sequence_summary = self.dropout(sequence_summary)
        output = F.relu(self.fc1(sequence_summary))
        output = self.dropout(output)
        output = self.fc2(output)
        return output

In [None]:
# One output value per input position (output_dim == MAX_LENGTH_input).
model = WordClassifier(
    vocab_size=input_lang.n_words,
    embedding_dim=30,
    hidden_dim=20,
    output_dim=MAX_LENGTH_input,
).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
def train(model, trainloader, optimizer, criterion, epochs=10):
    """Train ``model`` for ``epochs`` passes over ``trainloader``.

    The raw model outputs are treated as logits and passed through a sigmoid
    before being given to ``criterion``, which is therefore expected to take
    probabilities (e.g. ``nn.BCELoss``). The mean batch loss is printed after
    every epoch.

    Args:
        model: module called as ``model(inputs)``.
        trainloader: sized iterable of ``(inputs, labels)`` batches.
        optimizer: optimizer bound to the parameters of ``model``.
        criterion: loss called as ``criterion(probabilities, labels)``.
        epochs: number of passes over ``trainloader``.
    """
    model.train()
    # Guard against a division by zero when the loader is empty.
    n_batches = max(len(trainloader), 1)
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in trainloader:
            optimizer.zero_grad()
            # Sigmoid instead of clamp(0, 1): clamping has a zero gradient for
            # every value outside [0, 1], so those outputs could never be
            # corrected by the optimizer.
            outputs = torch.sigmoid(model(inputs))
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print('[%d] loss: %.3f' % (epoch + 1, running_loss / n_batches))

In [None]:
# Run the training loop for 1000 epochs.
train(
    model=model,
    trainloader=trainloader,
    optimizer=optimizer,
    criterion=criterion,
    epochs=1000,
)

In [None]:
def evaluate(model, tensor, input_lang):
    """Run ``model`` on ``tensor`` in evaluation mode, without gradients.

    ``input_lang`` is accepted but not used. The model is left in eval mode.

    Returns:
        Whatever ``model(tensor)`` returns.
    """
    model.eval()
    with torch.no_grad():
        return model(tensor)

# Visual sanity check on the first training batch only: print the model
# outputs next to the expected labels.
# The batch is unpacked directly rather than through a loop variable named
# `data`, which at module level would overwrite the `torch.utils.data`
# module imported under that name.
for inputs, labels in trainloader:
    outputs = evaluate(model, inputs, input_lang)
    print(outputs)
    print(labels)
    break

# Second idea: label a sentence as important if it is the closest (in terms of ROUGE score) to the title

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils import data
import torch.nn.functional as F

In [None]:
import csv
data_encoding = 'utf-8'

# newline='' hands newline handling over to the csv module, as its
# documentation requires: otherwise line breaks embedded in quoted fields
# are not interpreted correctly.
with open('data/train.csv', 'r', encoding=data_encoding, newline='') as file:
    data_train_ = list(csv.reader(file))

with open('data/validation.csv', 'r', encoding=data_encoding, newline='') as file:
    data_test_ = list(csv.reader(file))

# Drop the first row of each file (assumed to be the CSV header - confirm
# against the data files).
data_train = data_train_[1:]
data_test = data_test_[1:]

In [None]:
import re
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rougeL'])


def find_best_sentence(data_train):
    """Return the sentence of a sample that scores highest against its title.

    Args:
        data_train: one sample; index 0 is the text, index 1 the title.

    Returns:
        ``(best_sentence, idx)``: the fragment of the text (split on ".")
        with the highest ROUGE-L value (element 2 of the score tuple) and its
        position. ``('', 0)`` when no fragment scores above 0. On ties the
        first fragment wins.
    """
    article, title = data_train[0], data_train[1]
    best_sentence, best_score, idx = '', 0, 0
    for position, candidate in enumerate(re.split(r'[.]', article)):
        score = scorer.score(candidate, title)['rougeL'][2]
        if score > best_score:
            best_sentence, best_score, idx = candidate, score, position
    return best_sentence, idx

In [None]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained model used to embed sentences
model_encodding = SentenceTransformer('distiluse-base-multilingual-cased')

In [None]:
import re
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rougeL'])


def find_best_sentence(data_train):
    """Return the sentence of a sample that scores highest against its title.

    Args:
        data_train: one sample; index 0 is the text, index 1 the title.

    Returns:
        ``(best_sentence, idx)``: the fragment of the text (split on ".")
        with the highest ROUGE-L value (element 2 of the score tuple) and its
        position. ``('', 0)`` when no fragment scores above 0. On ties the
        first fragment wins.
    """
    article, title = data_train[0], data_train[1]
    best_sentence, best_score, idx = '', 0, 0
    for position, candidate in enumerate(re.split(r'[.]', article)):
        score = scorer.score(candidate, title)['rougeL'][2]
        if score > best_score:
            best_sentence, best_score, idx = candidate, score, position
    return best_sentence, idx


In [None]:
# Build one (text embedding, sentence embedding) example per sentence of the
# first 10000 training samples. The label is 1 for the sentence selected by
# find_best_sentence and 0 for every other sentence of the same text.
x = []
y = []
for i, sample in enumerate(data_train[:10000]):
    if i % 100 == 0:
        print('Processing', i, 'of', len(data_train))
    sentences = re.split(r'[.]', sample[0])
    best_sentence, idx = find_best_sentence(sample)
    text_encode = model_encodding.encode(sample[0])
    for j, sentence in enumerate(sentences):
        y.append(1 if j == idx else 0)
        sentence_encode = model_encodding.encode(sentence)
        x.append([text_encode, sentence_encode])

In [None]:
# Stack the embeddings into a single ndarray first: building a tensor directly
# from a nested Python list of ndarrays is the very slow path that PyTorch
# warns about.
x_t = torch.tensor(np.array(x))
y_t = torch.tensor(y)

# Shuffled mini-batches of 32 (embedding pair, label) examples.
dataset = data.TensorDataset(x_t, y_t)
dataloader = data.DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
# Model that takes a text embedding and a sentence embedding and returns the
# probability that the sentence is the best one for that text.
class Model(nn.Module):
    """Two 1-D convolutional branches (text, sentence) feeding an MLP.

    Each branch copies its input vector onto 3 channels, applies a Conv1d
    (3 -> 128 channels, kernel size 3) and max-pools over the whole length.
    The two 128-d results are concatenated and passed to an MLP ending with
    a sigmoid.
    """

    def __init__(self):
        super().__init__()
        self.cnn_text = nn.Conv1d(3, 128, 3)
        self.cnn_sentence = nn.Conv1d(3, 128, 3)
        self.mlp = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid(),
        )

    @staticmethod
    def _encode(conv, embedding):
        """Apply one convolutional branch to a (batch, dim) tensor -> (batch, 128)."""
        # Repeat the vector 3 times along a new channel dimension (dim 1).
        channels = embedding.unsqueeze(1).repeat(1, 3, 1)
        features = conv(channels)
        # Global max pooling over the length dimension.
        return F.max_pool1d(features, features.size(2)).squeeze(2)

    def forward(self, text, sentence):
        """Return a (batch, 1) tensor of values in [0, 1]."""
        text_features = self._encode(self.cnn_text, text)
        sentence_features = self._encode(self.cnn_sentence, sentence)
        return self.mlp(torch.cat((text_features, sentence_features), 1))

In [None]:
def train(model, train_loader, optimizer, criterion, device):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: module called as ``model(text, sentence)``.
        train_loader: sized iterable of ``(x, y)`` batches, where ``x[:, 0]``
            holds the text vectors and ``x[:, 1]`` the sentence vectors.
        optimizer: optimizer bound to the parameters of ``model``.
        criterion: loss called as ``criterion(output, target)`` with a float
            target of shape (batch, 1).
        device: device the batches are moved to.

    Returns:
        The sum of the batch losses divided by ``len(train_loader)``.
    """
    model.train()
    batch_losses = []
    for pairs, targets in train_loader:
        text, sentence = pairs[:, 0].to(device), pairs[:, 1].to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        predictions = model(text, sentence)
        loss = criterion(predictions, targets.float().view(-1, 1))
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    return sum(batch_losses) / len(train_loader)

def test_validation(model, data_test, device, n_samples=1000):
    """Score the model on the first samples of ``data_test``.

    For every sample the text is split on ".", each fragment is scored by the
    model against the embedding of the whole text, and the fragment with the
    highest score is compared with the title using ROUGE-L (element 2 of the
    score tuple).

    Args:
        model: module called as ``model(text, sentence)`` on (1, dim) tensors.
        data_test: samples; index 0 is the text, index 1 the title.
        device: device the embeddings are moved to.
        n_samples: maximum number of samples to evaluate (default 1000, the
            value that used to be hard-coded). Capped at ``len(data_test)`` so
            that a shorter validation set no longer raises ``IndexError``.

    Returns:
        The mean score over the evaluated samples, or 0.0 when there are none.
    """
    model.eval()
    n_evaluated = min(n_samples, len(data_test))
    if n_evaluated <= 0:
        return 0.0

    mean_score = 0
    with torch.no_grad():
        for i in range(n_evaluated):
            sentences = re.split(r'[.]', data_test[i][0])
            text_encode = model_encodding.encode(data_test[i][0])
            text_encode = torch.tensor(text_encode).to(device)
            text_encode = text_encode.unsqueeze(0)  # add the batch dimension
            predictions = []
            for sentence in sentences:
                sentence_encode = model_encodding.encode(sentence)
                sentence_encode = torch.tensor(sentence_encode).to(device)
                sentence_encode = sentence_encode.unsqueeze(0)
                output = model(text_encode, sentence_encode)
                predictions.append(output.cpu().detach().numpy())
            # argmax over the flattened predictions == index of the best fragment
            idx_prediction = np.argmax(predictions)
            mean_score += scorer.score(sentences[idx_prediction], data_test[i][1])['rougeL'][2]
    return mean_score / n_evaluated

In [None]:
# Model, loss and optimizer for the sentence-selection approach.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Model().to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train for 1000 epochs; every 100 epochs evaluate on the validation data and
# save the weights whenever the mean score improves.
best_mean_score = 0
for epoch in range(1000):
    loss = train(model, dataloader, optimizer, criterion, device)
    print('Epoch', epoch, 'Loss', loss)
    if epoch % 100 != 0:
        continue
    mean_score = test_validation(model, data_test, device)
    print('Mean score', mean_score)
    if mean_score > best_mean_score:
        best_mean_score = mean_score
        torch.save(model.state_dict(), 'model.pth')