In [1]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from rouge_score import rouge_scorer
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

To make this code work, the Google word2vec model must be downloaded : GoogleNews-vectors-negative300.bin.gz : https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g

### Pretreatment

In [2]:
train = pd.read_csv('data/train.csv')
val = pd.read_csv('data/validation.csv')


In [3]:
big_train = pd.concat([train, val])


In [5]:
# Load pre-trained word2vec model
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)


In [6]:
article_vectors = big_train['text'].apply(lambda x: np.mean([model[word] for word in x.split() if word in model.key_to_index], axis=0))
article_vectors_2d = np.vstack(article_vectors)
title_vectors = big_train['titles'].apply(lambda x: np.mean([model[word] for word in x.split() if word in model.key_to_index], axis=0))

In [7]:
X_train = train['text'].apply(lambda x: np.mean([model[word] for word in x.split() if word in model.key_to_index], axis=0))
y_train = train['titles'].apply(lambda x: np.mean([model[word] for word in x.split() if word in model.key_to_index], axis=0))
X_test = val['text'].apply(lambda x: np.mean([model[word] for word in x.split() if word in model.key_to_index], axis=0))
y_test = val['titles'].apply(lambda x: np.mean([model[word] for word in x.split() if word in model.key_to_index], axis=0))

In [8]:
#reg = LinearRegression().fit(np.vstack(X_train.values), np.vstack(y_train.values))
reg = MultiOutputRegressor(SVR()).fit(np.vstack(X_train.values), np.vstack(y_train.values))


In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from math import sqrt

# Prédire les valeurs pour le jeu de test
y_pred = reg.predict(np.vstack(X_test.values))

# Calculer les métriques d'évaluation
mae = mean_absolute_error(np.vstack(y_test), y_pred)
mse = mean_squared_error(np.vstack(y_test), y_pred)
rmse = sqrt(mse) # ou np.sqrt(mse)
r2 = r2_score(np.vstack(y_test), y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² score: {r2}")

Mean Absolute Error (MAE): 0.026490344669608454
Mean Squared Error (MSE): 0.0011706317107680038
Root Mean Squared Error (RMSE): 0.034214495623463514
R² score: -0.008839216376161859


### Best sentence prediction

In [10]:
title_embeddings_pred = reg.predict(np.vstack(X_test.values))

In [11]:
best_sentences = []
for i, text in enumerate(val['text']):
    # Calculer l'embedding pour chaque phrase dans l'article
    sentence_embeddings = []
    for sentence in text.split('.'):
        words = [model[word] for word in sentence.split() if word in model.key_to_index]
        if words:  # Vérifier que la liste n'est pas vide
            embedding = np.mean(words, axis=0)
            # Normaliser l'embedding par la longueur de la phrase
            normalized_embedding = embedding / len(sentence)
            sentence_embeddings.append(normalized_embedding)
    
    if sentence_embeddings:  # Vérifier que la liste n'est pas vide
        # Calculer la similarité cosinus entre l'embedding du titre prédit et l'embedding de chaque phrase
        similarities = cosine_similarity(title_embeddings_pred[i].reshape(1, -1), sentence_embeddings)
        
        # Sélectionner l'index de la phrase qui a la plus grande similarité cosinus
        best_sentence_index = np.argmax(similarities)
        
        # Ajouter la meilleure phrase à la liste
        best_sentences.append(text.split('.')[best_sentence_index])
    else:
        # Si aucune phrase ne contient de mots dans model.key_to_index, ajouter une chaîne de caractères vide
        best_sentences.append('')

# Ajouter les meilleures phrases au dataframe de validation
val['best_sentence'] = best_sentences

In [12]:
scorer = rouge_scorer.RougeScorer(['rougeL'])

In [13]:
scores = [scorer.score(title, best_sentence) for title, best_sentence in zip(val['titles'], val['best_sentence'])]

# Calculer les scores moyens
mean_scores = {key: np.mean([score[key].fmeasure for score in scores]) for key in scores[0].keys()}

print(mean_scores)

{'rougeL': 0.1550764405516925}


In [14]:
def get_embedding(a_ngram):
    words = a_ngram.split()
    embeddings = [model[word] for word in words if word in model]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)


In [15]:
val['text'].values

array(["Sur les réseaux sociaux, les images sont impressionnantes. Dimanche matin à Venise, l'équipage du MSC Opéra a perdu le contrôle du paquebot, à son arrivée dans le port de la cité des Doges. Le navire, qui peut contenir plus de 2.600 passagers, est venu heurter le quai auquel il voulait s'arrimer. Le paquebot a raclé le quai sur plusieurs mètres, suscitant la panique des personnes à terre, avant de percuter un autre bateau touristique, le Michelangelo, stoppant ainsi sa course. Des témoins ont filmé la scène. Les vidéos montrent des touristes courant pour tenter de fuir le paquebot, qui ne semble pas vouloir s'arrêter. Quatre personnes ont été blessées dans cet accident : deux légèrement, tandis que les deux autres ont été transportées à l'hôpital pour des examens. L'incident s'est produit à San Basilio-Zaterre, dans le canal de la Giudecca, où de nombreux navires de croisière s'arrêtent pour permettre à leurs passagers de visiter Venise.Selon le quotidien italien Corriere della

### Grid search

In [16]:
from nltk.util import ngrams


In [17]:
def get_embedding(ngram):
    words = ngram
    embeddings = [model[word] for word in words if word in model]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

In [18]:
def traiter(liste,n):
    new_list = [liste[0]]
    for candidat in liste[1:]:
        if abs(candidat-new_list[-1])>n:
            new_list.append(candidat)
    return new_list

In [19]:
traiter([1,2,55],3)

[1, 55]

In [20]:
def k_meilleurs_ngrams(k,n):
    n = 4  # pour les bigrammes
    articles_words = [[sentence.split() for sentence in article.split('.')] for article in val['text'].values]
    article_ngrams = [[[ngram for ngram in ngrams(words, n)] for words in article] for article in articles_words]
    title_embeddings = reg.predict(np.vstack(X_test.values))
    article_ngram_embeddings = [[[get_embedding(ngram) for ngram in sentence] for sentence in article] for article in article_ngrams]
    similarities = [[[cosine_similarity([ngram_embedding], [title_embedding]) for ngram_embedding in sentence] for sentence in article] for article, title_embedding in zip(article_ngram_embeddings, title_embeddings)]
    flat_similarities = [np.array([sim for sentence in article for sim in sentence]) for article in similarities]
    flat_similarities2=[]
    for article in flat_similarities:
        flat_similarity2 = []
        for the_list in article:
            flat_similarity2.append(the_list[0][0])
        flat_similarities2.append(flat_similarity2)
    flat_articles_ngrams = [np.array([sim for sentence in article for sim in sentence]) for article in article_ngrams]
    best_ngrams_indices = []
    i=0
    for article_sim in flat_similarities2 :
        indices = np.argsort(article_sim)[-k:]
        best_ngrams_indices.append(indices)
    concatenation = []
    for i in range(0,len(best_ngrams_indices)):
        les_indices = best_ngrams_indices[i]
        concat = ''
        les_indices = traiter(les_indices,n)
        for indice in les_indices :
            concat += ' '.join(flat_articles_ngrams[i][indice])
            concat += '. '
        concatenation.append(concat)
    return(concatenation)
    


In [21]:
def score(k,n):
    scores = [scorer.score(title, best_sentence) for title, best_sentence in zip(val['titles'], k_meilleurs_ngrams(k,n))]
    mean_scores = {key: np.mean([score[key].fmeasure for score in scores]) for key in scores[0].keys()}
    return(mean_scores['rougeL'])


In [35]:
best_score = -float('inf')
best_k, best_n = None, None

for k in range(1, 7):
    for n in range(1, 7):
        current_score = score(k, n)
        if current_score > best_score:
            print(current_score)
            best_score = current_score
            best_k, best_n = k, n

print(f"La meilleure valeur de score est {best_score} avec k = {best_k} et n = {best_n}")

0.07448655981309939
0.10407554334497263
0.12282542913020117
0.13361844033354958
0.14130069967109968
0.14606026356860674
La meilleure valeur de score est 0.14606026356860674 avec k = 6 et n = 1
