In [1]:
import gzip
import mlflow
import pandas as pd
import tempfile
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from gensim import corpora
from gensim.parsing import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm, trange

# Prefer the GPU whenever PyTorch can see one; otherwise run on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

# EDA

In [2]:
# Load the Spanish training split for exploration. The trailing bare `dataset`
# expression makes the notebook render the frame as the cell output.
dataset = pd.read_csv("./spanish.train.csv.gz")
dataset

Unnamed: 0,language,label_quality,title,category
0,spanish,reliable,Bateria Completa 5 Cuerpos Excelente,DRUMS
1,spanish,reliable,Cuaderno Anotador Espiral Ben 10 3d Original ...,NOTEBOOKS_AND_WRITING_PADS
2,spanish,reliable,Fifa18 Ps4 Disco Fisico,VIDEO_GAMES
3,spanish,reliable,Botines Futbol adidas Messi 15.4 Cesped Hombre,FOOTBALL_SHOES
4,spanish,reliable,Chops Sublimados - Nagual,DRINKING_GLASSES
...,...,...,...,...
6119095,spanish,unreliable,Plantas Medicinales De Valeriana Roja!!!!,PLANTS
6119096,spanish,unreliable,Patas Altas Sommier 25 Cm. - X 4 Unidades,BOX_SPRING_AND_MATTRESS_SETS
6119097,spanish,unreliable,Combo Funda Asiento Univ Cuero Automotor Cub A...,CAR_SEAT_COVERS
6119098,spanish,unreliable,Griferia Baño Fv Margot Lever Lavatorio Pared ...,BATHROOM_FAUCET_SETS


In [3]:
# Print the index range, column names, dtypes and memory usage of the frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6119100 entries, 0 to 6119099
Data columns (total 4 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   language       object
 1   label_quality  object
 2   title          object
 3   category       object
dtypes: object(4)
memory usage: 186.7+ MB


In [4]:
# Per-column summary (count / unique / top / freq) of the frame's columns.
dataset.describe()

Unnamed: 0,language,label_quality,title,category
count,6119100,6119100,6119100,6119100
unique,1,2,6119100,632
top,spanish,unreliable,40 Dijes Corazones Colores Surtidos,BOOKS
freq,6119100,5635232,1,19010


# Dataset

In [5]:
class MeLiDataset(Dataset):
    """Map-style dataset over a DataFrame with ``title`` and ``category`` columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding at least the ``title`` and ``category`` columns.
    transform : callable, optional
        Applied to the ``{"data": ..., "target": ...}`` dict before it is
        returned (e.g. a ``RawDataProcessor`` instance).

    Notes
    -----
    Rows are looked up with ``.loc``, i.e. by index *label*. This matches the
    positions a ``DataLoader`` sampler produces only while the frame keeps a
    default ``0..n-1`` index (as ``pd.read_csv`` creates); reset the index
    after filtering or subsetting the frame.
    """

    def __init__(self, dataset, transform=None):
        self.dataset = dataset
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return self.dataset.shape[0]

    def __getitem__(self, item):
        """Return the (optionally transformed) title/category pair(s) for ``item``.

        ``item`` may be an int, a list of ints, or a tensor of indices.
        """
        if torch.is_tensor(item):
            # Tensors expose ``tolist()``; the previous ``to_list()`` call
            # raised AttributeError for every tensor index.
            item = item.tolist()

        item = {
            "data": self.dataset.loc[item, "title"],
            "target": self.dataset.loc[item, "category"]
        }

        if self.transform:
            item = self.transform(item)

        return item

# Preprocesamiento

In [6]:
class RawDataProcessor:
    """Encode raw titles and category names as integer ids.

    On construction it builds a gensim ``Dictionary`` from the preprocessed
    ``title`` column of ``dataset`` and a label index from the sorted unique
    values of its ``category`` column. Instances are callable on the
    ``{"data": ..., "target": ...}`` dicts used by the datasets in this
    notebook.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with ``title`` and ``category`` columns.
    ignore_header : bool
        Accepted but never read by this class.
    filters : list of callables, optional
        gensim preprocessing filters. When falsy, titles are lowercased and
        runs of whitespace are collapsed; no other gensim filter
        (punctuation, numbers, stopwords, short tokens, tags) is applied.
    vocab_size : int
        Passed as ``keep_n`` when pruning the dictionary.
    """

    def __init__(self, 
                 dataset, 
                 ignore_header=True, 
                 filters=None, 
                 vocab_size=50000):
        self.filters = filters or [
            lambda s: s.lower(),
            preprocessing.strip_multiple_whitespaces,
        ]

        # Vocabulary over every preprocessed title in the frame.
        tokenized_titles = dataset["title"].map(self._preprocess_string).tolist()
        self.dictionary = corpora.Dictionary(tokenized_titles)
        # Prune the vocabulary, then make the remaining ids contiguous.
        self.dictionary.filter_extremes(no_below=2, no_above=1, keep_n=vocab_size)
        self.dictionary.compactify()
        # Reserve id 0 for padding and id 1 for unknown tokens.
        self.dictionary.patch_with_special_tokens({"[PAD]": 0, "[UNK]": 1})

        # Label index: position in the sorted list of category names.
        self.idx_to_target = sorted(dataset["category"].unique())
        self.target_to_idx = {
            label: position for position, label in enumerate(self.idx_to_target)
        }

    def _preprocess_string(self, string):
        """Run ``string`` through the configured gensim filters."""
        return preprocessing.preprocess_string(string, filters=self.filters)

    def _sentence_to_indices(self, sentence):
        """Map tokens to dictionary ids, using id 1 for unknown tokens."""
        return self.dictionary.doc2idx(sentence, unknown_word_index=1)

    def encode_data(self, data):
        """Preprocess one raw title and return its list of token ids."""
        tokens = self._preprocess_string(data)
        return self._sentence_to_indices(tokens)

    def encode_target(self, target):
        """Return the integer id of a category name (KeyError if unseen)."""
        return self.target_to_idx[target]

    def __call__(self, item):
        """Encode a single item or a batch-like item.

        A ``str`` under ``"data"`` / ``"target"`` is encoded directly; any
        other value is iterated and each element is encoded.
        """
        raw_data = item["data"]
        if isinstance(raw_data, str):
            data = self.encode_data(raw_data)
        else:
            data = list(map(self.encode_data, raw_data))

        raw_target = item["target"]
        if isinstance(raw_target, str):
            target = self.encode_target(raw_target)
        else:
            target = list(map(self.encode_target, raw_target))

        return {"data": data, "target": target}

# Lectura de datos

In [7]:
# Read both splits into clearly named frames so the raw data stays reachable
# after the Dataset wrappers are created.
train_df = pd.read_csv("./spanish.train.csv.gz")
test_df = pd.read_csv("./spanish.test.csv.gz")

# Fit the vocabulary and label index on the training frame read in this cell
# (previously this relied on the `dataset` variable left over from the EDA
# cell, which made the cell depend on hidden kernel state).
preprocess = RawDataProcessor(train_df)

train_dataset = MeLiDataset(train_df, transform=preprocess)
test_dataset = MeLiDataset(test_df, transform=preprocess)

print(f"Datasets loaded with {len(train_dataset)} training elements and {len(test_dataset)} test elements")
print(f"Sample train element:\n{train_dataset[0]}")

Datasets loaded with 6119100 training elements and 63680 test elements
Sample train element:
{'data': [50001, 2, 50000, 3, 4], 'target': 196}


In [8]:
class PadSequences:
    """Collate function that pads variable-length id lists into one batch.

    Parameters
    ----------
    pad_value : int
        Value appended to sequences shorter than the batch width.
    max_length : int, optional
        When truthy, every sequence is truncated/padded to exactly this width.
    min_length : int
        Lower bound for the batch width when ``max_length`` is not set.
    """

    def __init__(self, pad_value=0, max_length=None, min_length=1):
        assert max_length is None or min_length <= max_length
        self.pad_value, self.max_length, self.min_length = pad_value, max_length, min_length

    def __call__(self, items):
        """Return ``{"data": LongTensor, "target": FloatTensor}`` for ``items``.

        ``items`` is a sequence of dicts with a list under ``"data"`` and a
        number under ``"target"``.
        """
        data, target = zip(*((item["data"], item["target"]) for item in items))
        lengths = list(map(len, data))

        if not self.max_length:
            # Width follows the longest sequence, but never drops below min_length.
            width = max(self.min_length, max(lengths))
        else:
            # Fixed width: longer sequences are cut down to it.
            width = self.max_length
            lengths = [min(width, n) for n in lengths]

        padded = []
        for seq, n in zip(data, lengths):
            padded.append(seq[:n] + [self.pad_value] * (width - n))

        return {
            "data": torch.LongTensor(padded),
            "target": torch.FloatTensor(target)
        }

# DataLoaders

In [9]:
# Both loaders share batch size and collate function; only the training
# loader shuffles.
pad_sequences = PadSequences()
_loader_kwargs = dict(batch_size=128, collate_fn=pad_sequences, drop_last=False)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)

# NN Classification Model

In [10]:
class MeLiFasttextClassifier(nn.Module):
    """fastText-style classifier: averaged frozen embeddings + two linear layers.

    Parameters
    ----------
    word_embeddings : torch.Tensor
        Pretrained embedding matrix; installed as a non-trainable parameter.
    config : object
        Must expose ``embed_size``, ``hidden_size`` and ``output_size``.
    vocab_size : int
        Number of rows used to create the embedding layer before its weight
        is replaced by ``word_embeddings``.
    """

    def __init__(self, 
                 word_embeddings, 
                 config,
                 vocab_size):
        super().__init__()
        self.config = config

        # Embedding layer, overwritten with the frozen pretrained vectors.
        self.embeddings = nn.Embedding(vocab_size, self.config.embed_size)
        self.embeddings.weight = nn.Parameter(word_embeddings, requires_grad=False)

        # Hidden layer
        self.fc1 = nn.Linear(self.config.embed_size, self.config.hidden_size)

        # Output layer
        self.fc2 = nn.Linear(self.config.hidden_size, self.config.output_size)

        # Normalise over the class axis. The dimension is given explicitly:
        # a bare nn.Softmax() relies on the deprecated implicit-dim choice
        # (which also resolves to 1 for the 2-D tensor produced in forward).
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class probabilities of shape ``(batch, output_size)``.

        NOTE(review): ``permute(1, 0, 2)`` followed by ``mean(1)`` averages
        over tokens only when ``x`` is laid out as ``(seq_len, batch)``. The
        ``PadSequences`` collate in this notebook builds ``(batch, seq_len)``
        tensors, so such batches would need transposing before this call.
        """
        embedded_sent = self.embeddings(x).permute(1, 0, 2)
        h = self.fc1(embedded_sent.mean(1))
        z = self.fc2(h)
        return self.softmax(z)

In [11]:
class MeLiMLPClassifier(nn.Module):
    """MLP over the average of (optionally frozen) pretrained word embeddings.

    Parameters
    ----------
    pretrained_embeddings_path : str
        Gzip-compressed text file with one ``word v1 v2 ...`` entry per line.
    dictionary : object
        Must support ``len()`` and expose a ``token2id`` mapping (e.g. a
        gensim ``Dictionary``). Id 0 is treated as the padding id.
    vector_size : int
        Dimensionality of the embedding vectors in the file.
    freeze_embedings : bool
        Whether the embedding weights are kept fixed during training.

    Rows for words missing from the file keep their random initialisation;
    the padding row (id 0) is all zeros.
    """

    def __init__(self, 
                 pretrained_embeddings_path, 
                 dictionary,
                 vector_size,
                 freeze_embedings):
        super().__init__()
        embeddings_matrix = torch.randn(len(dictionary), vector_size)
        embeddings_matrix[0] = torch.zeros(vector_size)
        # Explicit encoding so decoding does not depend on the platform locale.
        with gzip.open(pretrained_embeddings_path, "rt", encoding="utf-8") as fh:
            for line in fh:
                word, vector = line.strip().split(None, 1)
                if word in dictionary.token2id:
                    embeddings_matrix[dictionary.token2id[word]] =\
                        torch.FloatTensor([float(n) for n in vector.split()])
        self.embeddings = nn.Embedding.from_pretrained(embeddings_matrix,
                                                       freeze=freeze_embedings,
                                                       padding_idx=0)
        self.hidden1 = nn.Linear(vector_size, 128)
        self.hidden2 = nn.Linear(128, 128)
        self.output = nn.Linear(128, 1)
        self.vector_size = vector_size

    def forward(self, x):
        """Return sigmoid activations of the output layer.

        ``x`` is a ``(batch, seq_len)`` tensor of token ids padded with the
        embedding layer's ``padding_idx``.
        """
        # Average over real tokens only. A plain mean over the sequence axis
        # also divides by the padding positions, which makes the same title
        # produce different features depending on the longest title in its
        # batch.
        token_mask = x != self.embeddings.padding_idx
        n_tokens = token_mask.sum(dim=1, keepdim=True).clamp(min=1)
        embedded = self.embeddings(x) * token_mask.unsqueeze(-1)
        x = embedded.sum(dim=1) / n_tokens
        x = F.relu(self.hidden1(x))
        x = F.relu(self.hidden2(x))
        x = torch.sigmoid(self.output(x))
        return x

In [12]:
class MeLiCNNClassifier(nn.Module):
    """Two conv+pool stages followed by three fully connected layers.

    The flattening step assumes each sample leaves the second pooling stage
    as 16 feature maps of 5x5, which is what 3-channel 32x32 inputs produce
    with these kernel sizes. The network emits 10 raw scores per sample.

    NOTE(review): this layout operates on 4-D image-like tensors, not on the
    padded token-id batches built elsewhere in this notebook — confirm it is
    meant to be used here.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return unnormalised scores of shape ``(N, 10)``."""
        # Convolutional feature extractor: conv -> ReLU -> max-pool, twice.
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        # Flatten each sample's 16x5x5 feature maps into one vector.
        x = x.view(-1, 16 * 5 * 5)
        # Fully connected head; the last layer has no activation.
        for hidden in (self.fc1, self.fc2):
            x = F.relu(hidden(x))
        return self.fc3(x)

In [13]:
class MeLiRNNClassifier(nn.Module):
    """LSTM classifier with a single sigmoid output per sequence.

    Parameters
    ----------
    vocab_size : int
        Number of rows in the (300-dimensional) embedding table.
    dimensions : int
        Hidden size of the LSTM, per direction.
    num_layers : int
        Number of stacked LSTM layers.
    dropout : float
        Dropout probability applied to the pooled LSTM features.
    bias : bool
        Whether the LSTM uses bias weights.
    bidirectional : bool
        Whether the LSTM runs in both directions.
    """

    def __init__(self, vocab_size, dimensions=128,
                 num_layers=1, dropout=0.5, bias=True,
                 bidirectional=True):

        super(MeLiRNNClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, 300)
        # The constructor argument is `dimensions`; the previous body read an
        # undefined `dimension` name and could never be instantiated.
        self.dimension = dimensions
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(input_size=300,
                            hidden_size=dimensions,
                            num_layers=num_layers,
                            bias=bias,
                            batch_first=True,
                            bidirectional=bidirectional)
        self.drop = nn.Dropout(p=dropout)

        # One feature block per direction feeds the output unit.
        n_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(n_directions * dimensions, 1)

    def forward(self, text, text_len):
        """Return one probability per sequence, shape ``(batch,)``.

        ``text`` is a ``(batch, seq_len)`` tensor of token ids and
        ``text_len`` holds the unpadded length of every row.
        """
        # pack_padded_sequence needs int64 lengths on the CPU and rejects
        # zero-length rows, so empty rows are treated as length 1.
        lengths = torch.as_tensor(text_len, dtype=torch.int64).cpu().clamp(min=1)

        text_emb = self.embedding(text)

        packed_input = nn.utils.rnn.pack_padded_sequence(
            text_emb, lengths, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)

        # Forward direction: hidden state at each row's last real token.
        batch_idx = torch.arange(output.size(0), device=output.device)
        last_idx = (lengths - 1).to(output.device)
        out_reduced = output[batch_idx, last_idx, :self.dimension]
        if self.bidirectional:
            # Reverse direction: its final state sits at the first position.
            out_reverse = output[:, 0, self.dimension:]
            out_reduced = torch.cat((out_reduced, out_reverse), 1)
        text_fea = self.drop(out_reduced)

        text_fea = self.fc(text_fea)
        text_fea = torch.squeeze(text_fea, 1)
        text_out = torch.sigmoid(text_fea)

        return text_out

# Training & MLflow

In [14]:
mlflow.set_experiment("AprendizajeProfundo_MLP_experiment")

n_epochs = 5
embedding_size = 50
freeze_embedding = True
# Targets are category indices produced by `preprocess`, so the classifier
# needs one output unit per known category rather than a single unit.
n_classes = len(preprocess.idx_to_target)


def _evaluate(model, loader, loss_fn, n_classes, device):
    """Run `model` over `loader` without gradients.

    Returns a tuple ``(losses, targets, predictions)`` where ``losses`` has
    one value per batch and ``targets`` / ``predictions`` are lists of
    integer class ids (prediction = highest-scoring output unit).
    """
    model.eval()
    losses, targets, predictions = [], [], []
    with torch.no_grad():
        for batch in tqdm(loader):
            data = batch["data"].to(device)
            # The collate function yields float targets; recover class ids.
            target = batch["target"].long().to(device)
            output = model(data)
            losses.append(
                loss_fn(output, F.one_hot(target, n_classes).float()).item()
            )
            targets.extend(target.cpu().tolist())
            predictions.extend(output.argmax(dim=1).cpu().tolist())
    return losses, targets, predictions


with mlflow.start_run():
    mlflow.log_param("model_name", "MeLiMLPClassifier")
    mlflow.log_param("freeze_embedding", freeze_embedding)
    mlflow.log_params({
        "embedding_size": embedding_size,
        "hidden1_size": 128,
        "hidden2_size": 128
    })
    model = MeLiMLPClassifier("./glove.6B.50d.txt.gz", preprocess.dictionary, embedding_size, freeze_embedding)
    # Replace the single-unit head with one unit per category. The model's
    # forward applies a sigmoid to this layer, so it is trained one-vs-rest
    # with BCELoss against one-hot targets (previously BCELoss received the
    # raw category index as if it were a 0/1 label).
    model.output = nn.Linear(model.output.in_features, n_classes)
    model = model.to(device)
    loss = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    for epoch in trange(n_epochs):
        model.train()
        running_loss = []
        for batch in tqdm(train_loader):
            data = batch["data"].to(device)
            target = F.one_hot(batch["target"].long(), n_classes).float().to(device)
            optimizer.zero_grad()
            output = model(data)
            loss_value = loss(output, target)
            loss_value.backward()
            optimizer.step()
            running_loss.append(loss_value.item())
        mlflow.log_metric("train_loss", sum(running_loss) / len(running_loss), epoch)

        running_loss, targets, predictions = _evaluate(model, test_loader, loss, n_classes, device)
        mlflow.log_metric("test_loss", sum(running_loss) / len(running_loss), epoch)
        # balanced_accuracy_score needs discrete labels on both sides.
        mlflow.log_metric("test_bacc", balanced_accuracy_score(targets, predictions), epoch)

    with tempfile.TemporaryDirectory() as tmpdirname:
        # Separate pass so predictions are exported even when n_epochs == 0.
        _, targets, predictions = _evaluate(model, test_loader, loss, n_classes, device)
        pd.DataFrame({"prediction": predictions, "target": targets}).to_csv(
            f"{tmpdirname}/predictions.csv.gz", index=False
        )
        mlflow.log_artifact(f"{tmpdirname}/predictions.csv.gz")

INFO: 'AprendizajeProfundo_MLP_experiment' does not exist. Creating a new experiment


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/47806 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/47806 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/47806 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/47806 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/47806 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]