# Data Preparation

## Load Packages

In [1]:
# It takes 2 minutes to run this cell 
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0-py3-none-any.whl (777.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m777.4/777.4 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.2.0
You should consider upgrading via the '/Users/alexandredias/.pyenv/versions/3.9.13/envs/sdg-classifier/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [3]:
import numpy as np
import pandas as pd
import glob
import os
import time
# import gdown

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras

# For advanced NLP Processing
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import en_core_web_lg
import re

from unicodedata import normalize, combining
from tqdm import tqdm

In [4]:
!python --version

Python 3.9.13


In [5]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alexandredias/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexandredias/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
os.chdir(parent_dir)

## Load data

`load_data` ler todos os arquivos individuais de todas as SDGs e os concatena, retornando um dataframe.

In [None]:
# Download CSV files
# url = "https://drive.google.com/drive/folders/1-cwm0B2kVXpbTT4qbLfEJlyRFMtD1gDm"
# gdown.download_folder(url)

In [10]:
# Load data of all SDGs from every source file 
def load_data():
    files = glob.glob("./data/csv/sdg/*.csv")
    print(files)
    datasets = []
    for file in files:
        datasets.append(pd.read_csv(file, sep="\t"))
        
    for i, dataset in enumerate(datasets):
        mlb = MultiLabelBinarizer()
        targets = mlb.fit_transform(dataset["Sustainable Development Goals (2021)"].\
                                    str.replace(" ", "").\
                                    str.split("|"))
        targets_dataframe = pd.DataFrame(targets, columns=mlb.classes_, dtype=np.float32)

        datasets[i] = pd.concat([datasets[i], targets_dataframe], axis=1)
        datasets[i] = datasets[i].drop(columns=["Sustainable Development Goals (2021)"])
        
    samples = []
    for dataset in datasets:
        samples.append(dataset)
        
    data = pd.concat(samples)
    data = data.rename(columns={"Title": "text"})
    data = data.reset_index(drop=True)
    return data

**Balanceamento de dataset para tarefa de classifição multilabel**

A ocorrência dos rótulos (SDGs) é severamente desbalanceada. Considere, por exemplo, a quantidade de ocorrências tirada do dataset geral para cada SDG:

|SDG1|SDG2|SDG3|SDG4|SDG5|SDG6|SDG7|SDG8|SDG9|SDG10|SDG11|SDG12|SDG13|SDG14|SDG15|SDG16|
|----|----|----|----|----|----|----|----|----|-----|-----|-----|-----|-----|-----|-----|
|83470|181140|249191|151681|120897|238204|343391|278841|351881|222361|297820|256995|264076|135624|167590|159497|

Note que algumas SDGs têm mais de 300k ocorrências, enquanto algumas outras tem menos de 200k ocorrências. Problemas de classificação com datasets desbalanceadas são ainda mais desafiadores. Para nossa conveniências, e levando em conta que temos muitos dados, podemos tentar balancear esse conjunto de dados para só então iniciar o processo de treinamento de fato.

Pensando nisso, foi desenvolvido um algoritmo para balancear o dataset. A ideia geral é realizada em 5 passos:

1. Identificar qual classe tem a menor ocorrência no dataset geral (`data`). Vamos chamar essa classe de `base_class`.
2. Coletar do dataset geral todas as amostras com ocorrência da `base_class`. Criar novo dataset com essas amostras, chamado `keeper`.
3. Após a remover todas as ocorrências da `base_class` do `data`: Identificar quais classes restaram tal que a sua quantidade de ocorrências sejam menores que a quantidade de ocorrências da `base_class`. Vamos chamar o conjunto das classes identificadas de `compromised`. O complemento da classe compromised, é o conjunto de classes com ocorrências maiores que a `base_class`, vamos chamá-los de `intransigent`.

4. Para cada classe `i` do conjunto `compromised`:
- Usando o dataset `data`, colete todas as amostras com ocorrências da classe `i` de `compromised` tais que nessas amostras não hajam ocorrências das classes de `intransigent`. O conjunto dessas amostras coletadas será chamado de `concession`.
- Adicione `n_samples` amostras do conjunto `concession` ao conjunto `keeper`, onde N = (_Número de ocorrências da `base_class`_ - _Número de ocorrências da classe `i` no conjunto_ `compromised`)

5. Repita o item 3 e 4 até que _o conjunto `compromised` pare de mudar ou fique vazio.

In [11]:
def balance_dataframe(data):
    # step 1
    counts = data.sum(axis=0)
    base_class_count, base_class_idx = np.min(counts), np.argmin(counts)
    
    # step 2
    # initiliaze keeper dataset
    keeper = data[data.iloc[:, base_class_idx] == 1]

    # remove records added to the keeper dataset
    data = data[data.iloc[:, base_class_idx] == 0]
    
    # step 3
    # identify classes from keeper that have more instances than base_class_count
    intransigent = np.sum(keeper, axis=0) >= base_class_count
    
    while True:
        
        compromised = np.sum(keeper, axis=0) < base_class_count
        
        # step 5.1: check if compromised stopped changing
        if np.all(intransigent == compromised):
            return keeper
        
        # step 4
        # step 4.1
        intransigent_classes_idx = np.concatenate(np.argwhere(np.array(~compromised)))
        
        balance_mask = np.full((data.shape[0],), True)
        for j in intransigent_classes_idx:
            balance_mask = balance_mask & (data.iloc[:, j] == 0)
        
        concession = data.loc[balance_mask, :]
        data = data.loc[balance_mask, :]
        
        # step 5.1: check if concession only have 0's (is empty)
        if sum(np.sum(concession)) == 0:
            return keeper
        
        # step 4.2
        compromised_classes_idx = np.array(compromised).nonzero()[0]

        if len(compromised_classes_idx) > 0:
            compromised_class = np.array(compromised).nonzero()[0][0]
            
            n_sampleable = np.sum(concession.iloc[:, compromised_class])

            n_samples = base_class_count - np.sum(keeper, axis=0)[compromised_classes_idx[0]]
            
            if n_samples > n_sampleable:
                n_samples = n_sampleable
                
            data = data.loc[concession.iloc[:, compromised_class] == 0, :]
            concession = concession[concession.iloc[:, compromised_class] == 1][:n_samples]
        
        # update keeper and intransigent sets for the next iteration
        keeper = pd.concat([keeper, concession])
        intransigent = compromised

In [12]:
data = load_data()

['./data/csv/sdg/sdg_02.csv', './data/csv/sdg/sdg_16.csv', './data/csv/sdg/sdg_03.csv', './data/csv/sdg/sdg_15.csv', './data/csv/sdg/sdg_01.csv', './data/csv/sdg/sdg_14.csv', './data/csv/sdg/sdg_10.csv', './data/csv/sdg/sdg_04.csv', './data/csv/sdg/sdg_05.csv', './data/csv/sdg/sdg_11.csv', './data/csv/sdg/sdg_07.csv', './data/csv/sdg/sdg_13.csv', './data/csv/sdg/sdg_12.csv', './data/csv/sdg/sdg_06.csv', './data/csv/sdg/sdg_08.csv', './data/csv/sdg/sdg_09.csv']


In [13]:
# Remove duplicate titles
data = data.drop_duplicates()

# Remove duplicate titles with different targets
counts = data["text"].value_counts()
titles, counts = list(counts.index), list(counts)

for title, count in zip(titles, counts):
    if count > 1:
        data = data.loc[data["text"] != title, :]
    
    # since the list of counts is ordered, if it gets to count == 1, then we can break the loop
    if count == 1:
        break

In [14]:
titles = data[["text"]]
unbalanced_targets = data.iloc[:, 1:]

# get balanced dataset based on SDGs occurrences
unbalanced_targets = unbalanced_targets.astype(np.int64)
targets = balance_dataframe(unbalanced_targets).astype(np.float32)

# Set indices as a column for further dataframe merging
targets["index"] = targets.index
titles["index"] = titles.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titles["index"] = titles.index


In [15]:
# Join targets and titles dataframe
data = targets.merge(titles, how="left", on="index")
data = data.set_index("index")

columns = ["text", "SDG1", "SDG2", "SDG3", "SDG4", "SDG5", "SDG6", "SDG7", "SDG8", "SDG9", 
           "SDG10", "SDG11", "SDG12", "SDG13", "SDG14", "SDG15", "SDG16"]
data = data[columns]

## Preparations

### Split train, validation, and test sets

In [None]:
sdg_columns = list(data.columns[1:])
X_train, X_test, y_train, y_test = train_test_split(np.array(data["text"]), np.array(data[sdg_columns]),
                                                    test_size=0.2, random_state=42)

In [None]:
train_size = round(0.9 * X_train.shape[0])

X_valid, y_valid = X_train[train_size:], y_train[train_size:]
X_train, y_train = X_train[:train_size], y_train[:train_size]

In [None]:
print("train set: \t{} records.".format(X_train.shape[0]))
print("validation set: {} records.".format(X_valid.shape[0]))
print("test set: \t{} records.".format(X_test.shape[0]))

train set: 	454432 records.
validation set: 50492 records.
test set: 	126232 records.


### Preprocess

O pré-processamento textual é dividido em duas etapas:

1. Na primeira etapa, são realizadas operações avançadas de PLN que não podem ser realizadas com funções nativas do TensorFlow, tais como: 
- Remoção de acentos, acentuaçao e caracteres especiais;
- Remoção de stopwords;
- Lematização ou Stemming;
- Filtragem.

2. A segunda etapa consiste em converter os datasets de `numpy` para o formato padrão do TensorFlow `tf.data.Dataset`. 

Ainda antes de alimentar o modelo com este dataset, precisamos vetorizar as sequências de texto. Esta etapa é realizada apenas no notebook de treinamento do modelo. Ela consiste em passar os dados por uma camada `TextVectorization` nativa do TensorFlow. Essa camada realiza:
- Padding das sequências de texto;
- codificação/vetorização de palavras.

#### Advanced NLP Preprocessing

In [None]:
def get_stopwords():
    nltk_stopwords = nltk.corpus.stopwords.words("english")
    spacy_en = en_core_web_lg.load()
    spacy_stopwords = spacy_en.Defaults.stop_words
    
    stopwords = list(set(spacy_stopwords).union(set(nltk_stopwords)))
 
    return stopwords

In [None]:
def advanced_preprocess(X, y, truncation="lemma"):
    # Convert text to lowercase
    Z = [text.lower() for text in X] 

    # Remove special characters
    special_char_reg_ex="!@#$%^&*()[]{};:,./<>?\|`~-=_+123456789"
    Z = [text.translate({ord(char): " " for char in special_char_reg_ex}) for text in Z]

    # Remove numbers
    Z = [re.sub(r"^\d+\s|\s\d+\s|\s\d+$|\d+\)", ' ', text) for text in Z]

    # Remove double spaces
    Z = [re.sub(r"\s+[a-zA-Z]\s+", ' ', text) for text in Z]

    # Remove accents
    Z = ["".join([char for char in normalize("NFKD", text) if not combining(char)]) for text in Z]

    # Tokenize text
    Z = [word_tokenize(text) for text in Z]

    # Remove stopwords
    stopwords = get_stopwords()
    Z = [list((word for word in tokens if ((word not in stopwords) and (len(word) > 1)))) for tokens in Z]
    
    # Lemmatizing
    if truncation == "lemma":
        # Concatenate tokens
        Z = [" ".join(tokens) for tokens in Z]

        # Lemmatize sentences
        nlp = en_core_web_lg.load()
        lemmatize = lambda sentence: " ".join([token.lemma_ for token in nlp(sentence)])
        Z = [lemmatize(text) for text in tqdm(Z)]
    
    # Stemming
    if truncation == "stem":
        stemmer = SnowballStemmer("english")
        Z = [" ".join([stemmer.stem(token) for token in tokens]) for tokens in Z]
        
    if truncation is None:
        Z = [" ".join(tokens) for tokens in Z]

    # Convert back to np.array
    Z = np.array(Z)
    
    # Discard empty sentences 
    non_empty_sentences = Z != "" 
    y = y[non_empty_sentences]
    Z = Z[non_empty_sentences]
    
    return Z, y

In [None]:
X_train, y_train = advanced_preprocess(X_train, y_train)

100%|██████████| 454432/454432 [1:22:04<00:00, 92.28it/s]


In [None]:
X_valid, y_valid = advanced_preprocess(X_valid, y_valid)

100%|██████████| 50492/50492 [09:10<00:00, 91.76it/s]


In [None]:
X_test, y_test = advanced_preprocess(X_test, y_test)

100%|██████████| 126232/126232 [22:40<00:00, 92.79it/s]


#### Build TF Dataset

In [None]:
def create_dataset(X, y):
    return tf.data.Dataset.from_tensor_slices((tf.constant(X), tf.constant(y))) 

In [None]:
batch_size = 32

# build train set
train_set = create_dataset(X_train, y_train).\
    shuffle(X_train.shape[0], seed=42).batch(batch_size).prefetch(1)

# build validation set
valid_set = create_dataset(X_valid, y_valid).batch(batch_size).prefetch(1)

# build test set
test_set = create_dataset(X_test, y_test).batch(batch_size).prefetch(1)

In [None]:
!mkdir datasets_sdg_tensorflow

In [None]:
# stores tf datasets
tf.data.experimental.save(train_set, "./datasets_sdg_tensorflow/train_set")
tf.data.experimental.save(valid_set, "./datasets_sdg_tensorflow/valid_set")
tf.data.experimental.save(test_set, "./datasets_sdg_tensorflow/test_set")