# Data Preparation

Prepare data for modeling

### Import librairies

In [1]:
import pandas as pd # for data manipulation
import plotly.express as px # for plotting

import spacy # for lemming, parsing etc
import re # for regex

# sklearn
from sklearn.model_selection import train_test_split

# for dl
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim
import pytorch_lightning as pl


### Configurations

In [2]:
DATA_FOLDER = '../data'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Loading data

In [5]:
# load data and drop na
data = pd.read_csv(f'{DATA_FOLDER}/orangesum.csv')
data = data.dropna()

In [6]:
data.head()

Unnamed: 0,article,date,heading,title
0,"Cette enveloppe inclut 8,5 millions du ministè...",2020-07-02,L'État va mobiliser en 2020 environ 10 million...,Pastoralisme et ours: 10 millions d'euros en 2...
1,"205 personnes sont décédées sur les routes, so...",2020-06-15,Le nombre de personnes tuées sur les routes a ...,"Sécurité routière: baisse de 15,6% du nombre d..."
2,La police privilégie l'accident domestique.Con...,2017-11-18,Un petit garçon d'un an a trouvé la mort chez ...,Rennes : un accident domestique consternant
3,"Appelés pour un feu sur un parking, ils ont ex...",2017-11-22,FAIT DIVERS. Les pompiers de l'Aude ont fait u...,Aude : un cadavre en feu découvert sur un parking
4,La société italienne Spice-X s'est fixée pour ...,2018-08-07,7 août,Spice-X SX1 : la voiture de course électrique ...


In [7]:
# some exploration
INDEX = 777
print( 'Article ======> ', data.iloc[INDEX]['article'])
print( '\n Headline ======> ', data.iloc[INDEX]['heading'])
print( '\nTitle ======> ', data.iloc[INDEX]['title'])





#### Show distribution of length article

In [9]:
data['article_length'] = data['article'].apply(lambda x: len(x.split()))
data['heading_length'] = data['heading'].apply(lambda x: len(x.split()))
data['title_length'] = data['title'].apply(lambda x: len(x.split()))

In [14]:
fig = px.histogram(data, x = 'article_length')
fig.show()

In [15]:
fig = px.histogram(data, x = 'title_length')
fig.show()

In [16]:
fig = px.histogram(data, x = 'heading_length')
fig.show()

In [17]:
# we will use heading as the input and summarize it to title

data['heading'].sample(n =10).to_numpy().tolist()

['Le Premier ministre est revenu jeudi soir sur le plateau de TF1 sur la réforme des retraites, dont il avait précisé le calendrier quelques heures plus tôt. "Il faut dire la vérité aux Français : compte tenu de la réalité démographique, nous allons travailler un peu plus longtemps que ce soit avec la durée de cotisation ou l\'âge pivot", a-t-il insisté.',
 'Les gérants du Pont du Gard ont décidé de venir en aide à la cathédrale Notre-Dame de Paris. Durant deux jours, la totalité des entrées payées par les visiteurs seront reversées à un organisme chargé de reconstruire la cathédrale parisienne.',
 'La Corée du Nord a discuté de nouvelles mesures visant à renforcer sa "dissuasion nucléaire" au cours d\'une réunion présidée par son dirigeant Kim Jong Un, a annoncé dimanche l\'agence de presse officielle KCNA.',
 "Le site reopen.europa.eu est mis en ligne ce lundi par l'Union européenne pour permettre aux Européens de savoir dans quel pays ils peuvent voyager.",
 "Gérald Thomassin a disp

In [18]:
# subset only heading and title of certain length

orange_df = data[ (data['heading_length'] >= 5) & (data['title_length'] >= 5) & (data['heading_length'] < 80) ]
orange_df.shape

(29171, 7)

### Preprocessing

The steps are the following:

* Parse with spacy
* lower all the text
* remove special characters and punctuation
* remove digits
* remove stopwords
* remove short words

In [3]:
# load the spacy french parser
french_spacy = spacy.load('fr_core_news_md')
stopwords = french_spacy.Defaults.stop_words

In [75]:
def clean_text(text):

    # remove punctuation
    text = re.sub(r'[^\w\s]', ' ', text)

    # remove digits
    text = re.sub('\d', '', text)

    # remove multiple spaces
    text = re.sub(' +', ' ', text)

    # strip leading and trailing spaces
    text = text.strip()

    return text



In [76]:
# first cleaning
orange_df['heading_clean'] = orange_df['heading'].apply(clean_text)

orange_df['title_clean'] = orange_df['title'].apply(clean_text)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [77]:
# create a col tokens in the dataframe with the tokens parsed by spacy
orange_df['heading_spacy_tokens'] = orange_df['heading_clean'].apply(lambda x : french_spacy(x))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [78]:
orange_df['title_spacy_tokens'] = orange_df['title_clean'].apply(lambda x : french_spacy(x))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [80]:
# save
orange_df.to_csv(f'{DATA_FOLDER}/orange_sum_spacy.csv', index = False)

#### Clean the text

In [110]:
def post_process_text(text, remove_stop_words = True, short_words_length = 2):


    # remove stopwords
    if remove_stop_words:
        text = [ token for token in text if token.lemma_ not in stopwords ]


    # remove short words
    text = [ token for token in text if len(token.lemma_) > short_words_length ]

    # replace entity
    for token in text:

        if token.ent_type_:
            token.lemma_ = token.ent_type_

    return ' '.join([ token.lemma_ for token in text ])





In [111]:
s = clean_text(orange_df['heading_clean'].iloc[7017])
print(s)
post_process_text(french_spacy(s))

Ces glaces baptisées La Mémère seront origine France garantie comme le sont les pots de miel produits par l ancien ministre de l Économie socialiste


'glace baptiser MISC origine LOC garantir pot miel produire ancien ORG ORG ORG'

In [113]:
s = orange_df['title_clean'].iloc[7017]
print(s)
post_process_text(french_spacy(s))

Après le miel l ancien ministre Arnaud Montebourg va lancer ses glaces au lait


'miel ancien ministre PER PER aller lancer glace lait'

In [115]:
orange_df['title_preprocessed'] = orange_df['title_spacy_tokens'].apply(lambda x : post_process_text(x))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [116]:
orange_df['heading_preprocessed'] = orange_df['heading_spacy_tokens'].apply(lambda x : post_process_text(x))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [117]:
# save to csv
orange_df.to_csv(f'{DATA_FOLDER}/orange_preprocessed.csv', index = False)

In [118]:
# read from file
#orange_df = pd.read_csv(f'{DATA_FOLDER}/orange_preprocessed.csv')

# subset on 5000 items
N_ITEMS = 5000
orange_df = orange_df.sample(N_ITEMS)

### Build Languages dictionary

In [286]:
UNKNOW_TOKEN = '_UNK_'
SOS_TOKEN = '_SOS_'
EOS_TOKEN = '_EOS_'


class Lang:

    def __init__(self, name : str):
        self.word_index = {UNKNOW_TOKEN : 0, SOS_TOKEN : 1, EOS_TOKEN : 2}
        self.index_word = {0 : UNKNOW_TOKEN, 1 : SOS_TOKEN, 2 : EOS_TOKEN}
        self.word_document_frequency = {}
        self.word_sentence_frequency = []
        self.word_count = {}
        self.name = name
        self.vocab = {UNKNOW_TOKEN : 0, SOS_TOKEN : 1, EOS_TOKEN : 2}

    def add_sentence(self, sentence):



        # add word index to dictionnary
        for word in sentence.split():

            # update the word index
            if word not in self.word_index:
                self.word_index[word] = len(self.word_index)
                self.index_word[len(self.word_index)] = word
                self.word_count[word] = 1
            else:
                self.word_count[word] = self.word_count[word] + 1


        # update document frequency dict
        for word in set(sentence.split()):
            if word not in self.word_document_frequency:
                self.word_document_frequency[word] = 1
            else:
                self.word_document_frequency[word] += 1

    def read_data_set(self, sentence_list):

        for sent in sentence_list:
            self.add_sentence(sent)

    def set_max_vocab_size(self, max_vocab_size):
        df = pd.DataFrame( self.word_count.items() )
        df = df.sort_values( by = 1, ascending = False )[:max_vocab_size][0].tolist()
        vocab_length = len(self.vocab)
        self.vocab = {UNKNOW_TOKEN : 0, SOS_TOKEN : 1, EOS_TOKEN : 2}
        tmp_vocab = { x :(i+vocab_length) for i,x in enumerate(df) }
        self.vocab.update(tmp_vocab)

In [287]:
# heading language
heading_lang = Lang('heading_lang')

# title language
title_lang = Lang('title_lang')

In [288]:

# build heading vocab
heading_lang.read_data_set(orange_df['heading_preprocessed'])

# build title vocab
title_lang.read_data_set(orange_df['title_preprocessed'])

##### Inspect the vocabulary

In [289]:
df = pd.DataFrame(title_lang.word_document_frequency.items()).sort_values(by = 1, ascending = False)
df

Unnamed: 0,0,1
2,LOC,2095
26,PER,2082
7,MISC,923
19,ORG,666
32,faire,167
...,...,...
3293,interopérabilité,1
3297,apaiser,1
3300,électrocuté,1
3301,bain,1


In [290]:
df[ df[1] > 5].shape # 1000 for title vocab size is reasonable

(918, 2)

In [291]:
df = pd.DataFrame(heading_lang.word_document_frequency.items()).sort_values(by = 1, ascending = False)
df

Unnamed: 0,0,1
11,LOC,2670
30,PER,2199
8,ORG,1758
9,MISC,1255
81,faire,537
...,...,...
5419,broder,1
5420,finement,1
315,moniteur,1
5424,dessu,1


In [292]:
df[ df[1] > 7].shape # 1500 for heading vocab size is reasonable

(1701, 2)

In [293]:
# set the vocabulary size
heading_lang.set_max_vocab_size(1500)
title_lang.set_max_vocab_size(1000)

In [295]:
len(title_lang.vocab)

1003

### Prepare data for training

In [296]:
def prepare_data(df, input_vocab = 1000, output_vocab = 512):

    heading_lang = Lang('headline_lang')
    title_lang = Lang('title_lang')

    heading_lang.read_data_set(df['heading_preprocessed'])
    heading_lang.set_max_vocab_size(input_vocab)

    title_lang.read_data_set(df['title_preprocessed'])
    title_lang.set_max_vocab_size(output_vocab)

    df['tokens_pairs'] = df[['heading_preprocessed', 'title_preprocessed']].apply(lambda x : ( [ heading_lang.vocab[el] if el in heading_lang.vocab else heading_lang.vocab[UNKNOW_TOKEN] for el in x[0].split()] , [title_lang.vocab[SOS_TOKEN]] + [ title_lang.vocab[el] if el in title_lang.vocab else title_lang.vocab[UNKNOW_TOKEN] for el in x[1].split() ] + [ title_lang.vocab[EOS_TOKEN] ] ), axis = 1)

    return orange_df, heading_lang, title_lang

In [297]:
orange_df, heading_lang, title_lang = prepare_data(orange_df)
orange_df.sample(10)[['heading_preprocessed','tokens_pairs']]

Unnamed: 0,heading_preprocessed,tokens_pairs
15711,gouvernement américain tenter lundi renforcer ...,"([25, 38, 165, 12, 765, 269, 224, 4, 4, 4, 4, ..."
2420,wow chemisier vraiment transparent vêtement as...,"([0, 0, 0, 0, 0, 0, 276, 21, 0, 0, 0, 0, 337, ..."
28539,jeune campeur prendre élu frapper fêtard année...,"([42, 0, 29, 173, 332, 0, 33, 27, 331, 58, 4, ..."
29505,police écossais tuer vendredi suspect attaque ...,"([59, 0, 111, 14, 328, 196, 4, 7, 385, 267, 13..."
3050,ORG ORG prendre mercredi distance quota immigr...,"([5, 5, 29, 11, 992, 0, 0, 284, 70, 30, 118, 2..."
30821,ORG annoncer jeudi décision faire travailler s...,"([5, 10, 8, 200, 7, 478, 531, 0, 930, 39, 0], ..."
25723,youtubeur spécialiser fitnes accuser harceler ...,"([0, 0, 0, 127, 0, 74, 0, 0, 15, 0, 0, 176, 17..."
14069,dépense mairie LOC frais avocat huissier élevé...,"([0, 259, 4, 0, 362, 0, 954, 80, 69, 3, 3, 41,..."
11363,jour grève transport commun francilien situati...,"([26, 464, 430, 898, 0, 219, 693, 359, 4, 4], ..."
23737,monde hypnotiser coronavirus maladie infectieu...,"([96, 0, 22, 212, 0, 585, 111, 93, 78, 50, 0],..."


In [298]:

HEADING_VOCAB_SIZE = len(heading_lang.vocab)
TITLE_VOCAB_SIZE = len(title_lang.vocab)

### Dataset and data module

In [351]:
# dataset and datamodule
class SummarizationDataset(Dataset):

    def __init__(self, df):
        super().__init__()
        self.pairs = df['tokens_pairs'].tolist()

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return torch.tensor(self.pairs[idx][0]).view(1,-1), torch.tensor(self.pairs[idx][1]).view(1,-1)

class SummarizationDataModule(pl.LightningDataModule):

    def __init__(self, df, test_size = 0.2, batch_size = 512, num_workers = 2, random_seed = 77):
        super().__init__()

        self.batch_size = batch_size
        self.num_workers = num_workers


        self.train_df, self.test_df = train_test_split(df, test_size=test_size, random_state=random_seed)

        self.test_df, self.val_df = train_test_split(self.test_df, test_size=0.5, random_state=random_seed)

    def prepare_data(self):
        pass

    def setup(self, stage=None):

        if stage == 'fit' or stage is None:
            self.train_ds = SummarizationDataset(self.train_df)
            self.val_ds = SummarizationDataset(self.val_df)

        if stage == 'test' or stage is None:
            self.test_ds = SummarizationDataset(self.test_df)

    def train_dataloader(self):
        return DataLoader(self.train_ds, batch_size=self.batch_size, num_workers = self.num_workers)


    def val_dataloader(self):
        return DataLoader(self.val_ds, batch_size=self.batch_size, num_workers = self.num_workers)

    def test_dataloader(self):
        return DataLoader(self.test_ds, batch_size=self.batch_size, num_workers = self.num_workers)



In [419]:
# model

class EncoderSummarizer(nn.Module):

    def __init__(self, input_dim, hidden_dim, embedding_dim, batch_size = 64):
        super().__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)


    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.gru(embedded)

        return output, hidden

    def initHidden(self, batch_size = None):

        if batch_size is None:
            batch_size = self.batch_size

        return torch.zeros(batch_size,1,self.hidden_dim)



class DecoderSummarizer(nn.Module):

    def __init__(self, hidden_dim, output_dim, batch_size = 64):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.batch_size = batch_size

        self.embedding = nn.Embedding(self.output_dim, hidden_dim)
        self.gru = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim = 1)



    def forward(self, x, hidden):

        embedded = self.embedding(x)
        output, hidden = self.gru(embedded, hidden)
        result = self.fc(output[0])
        result = self.softmax(result)

        return result, hidden

    def initHidden(self, batch_size = None):

        if batch_size is None:
            batch_size = self.batch_size

        return torch.zeros(batch_size,1,self.hidden_dim)

In [383]:
class SummarizationModel(pl.LightningModule):

    def __init__(self, input_dim, hidden_dim, output_dim, embedding_dim, learning_rate = 1e-2, batch_size = 64, teacher_forcing_ratio = 0.5):

        super().__init__()

        # save some variables
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.learning_rate = learning_rate
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.teacher_forcing_ratio = teacher_forcing_ratio

        # models
        self.encoder = EncoderSummarizer(input_dim, hidden_dim, embedding_dim, batch_size = batch_size)
        self.decoder = DecoderSummarizer(hidden_dim, output_dim, batch_size = batch_size)

    def forward(self, x, y):
        
        pass

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat, loss = self(x, y)







In [384]:
# experiments
ds = SummarizationDataset(orange_df)

In [385]:
t_i, t_o = next(iter(ds))

print(f'shape of t_i = {t_i.size()},  shape of t_o = {t_o.size()}')

shape of t_i = torch.Size([1, 26]),  shape of t_o = torch.Size([1, 8])


In [386]:
encoder = EncoderSummarizer(input_dim=HEADING_VOCAB_SIZE, embedding_dim = 32, hidden_dim =8)

In [395]:
h = encoder(t_i)[1]

In [388]:
t_o

tensor([[ 1,  4,  0,  0, 47,  0,  0,  2]])

In [389]:
decoder = DecoderSummarizer(hidden_dim=8, output_dim= TITLE_VOCAB_SIZE )

In [391]:
tmp = decoder.embedding(t_o)

In [410]:
h_tmp = decoder.gru(tmp[:, 0, :].view(1, 1, -1), h)[1]

In [429]:
topv, topi = decoder.softmax( decoder.fc(h_tmp) ).topk(1)

In [431]:
topi.squeeze()

tensor(0)

In [427]:
decoder.fc(h_tmp)

tensor([[[ 8.5158e-02,  8.8458e-02,  2.1126e-01,  4.5799e-01, -3.6502e-01,
          -4.8931e-01, -1.6441e-01, -2.1019e-01, -1.6783e-01,  9.2363e-02,
          -6.6026e-02, -1.1161e-01,  1.2115e-01,  1.1413e-01,  9.6218e-02,
          -6.1996e-01, -7.8686e-02,  5.3014e-02, -2.7082e-01, -8.0322e-02,
           3.5447e-01, -1.9184e-02, -3.5998e-01, -1.1638e-01, -2.4305e-01,
          -7.0112e-01, -3.3669e-01, -1.4689e-01,  3.3988e-01, -2.5753e-01,
          -1.5439e-03,  2.4062e-01, -3.0499e-01, -7.5307e-02,  1.7059e-02,
           5.3581e-01,  2.4768e-01,  6.5554e-02, -2.7929e-02,  3.0515e-01,
          -2.0098e-02, -3.2828e-01,  1.6068e-02, -2.3718e-01, -2.0594e-01,
           2.9161e-01, -1.6416e-01,  2.9794e-01, -1.9010e-01, -5.9632e-01,
          -4.8052e-01, -2.0198e-01,  1.4822e-01,  3.4160e-03, -3.2066e-02,
           4.5537e-01, -3.8199e-01, -1.4097e-02,  3.1434e-01,  5.0808e-01,
           8.9550e-03,  4.2642e-01, -1.8845e-01,  8.5760e-01, -2.3655e-01,
           5.5712e-02, -1