# Natural language processing: project - Model Demo

In [1]:
import os

In [2]:
# Name of the environment variable used as a flag by the cells below: it is
# set to "true" after the notebook has changed into the parent directory, so
# that re-running those cells does not execute `%cd ..` a second time.
CD_KEY = "--NLP_PROJECT_MODEL_DEMO_IN_ROOT"

In [3]:
# Seed the flag with "false" when it has not been set yet; a value that is
# already present is left untouched.
if os.environ.get(CD_KEY) is None:
    os.environ[CD_KEY] = "false"

In [4]:
if (
    CD_KEY not in os.environ
    or os.environ[CD_KEY] is None
    or len(os.environ[CD_KEY]) == 0
    or os.environ[CD_KEY] == "false"
):
    %cd ..
else:
    print(os.getcwd())
    
os.environ[CD_KEY] = "true"

/mnt/data/projekti/faks/OPJe/project


## Importing modules

In [5]:
import json
from typing import Callable, List


from prado import PradoCore
from prado.datasets import pad_projections
from prado.datasets import ProcessedDataset
from prado.datasets import BasicPradoTransform, BasicPradoAugmentation
from torch.utils.data import DataLoader

from src.modelling.datasets import (
    ImdbDataset
)

## Defining functions

In [6]:
def get_elementwise_transformation(transformation: Callable):
    """Lift a single-item transformation so that it works on a whole list.

    Args:
        transformation: Callable that takes one element and returns its
            transformed value.

    Returns:
        A function that accepts an iterable of elements, applies
        ``transformation`` to each of them in order, and returns the results
        as a new list.
    """
    def apply_to_each(elements: List) -> List:
        transformed = []
        for element in elements:
            transformed.append(transformation(element))
        return transformed

    return apply_to_each

## Initialization: paths, augmentation and model configuration

In [7]:

# Locations of input files, relative to the working directory.
paths = dict(
    training_dataset="data/processed/ready-to-use/imdb/train.tsv",
)

# Probabilities handed to BasicPradoAugmentation further down.
augmentation_config = dict(
    insertion_probability=0.01,
    deletion_probability=0.01,
    swap_probability=0.01,
)

# Constructor arguments handed to PradoCore further down.
model_config = dict(
    feature_length=32,
    embedding_length=32,
    dropout=0.2,
    out_channels=3,
    skipgram_patterns=["1", "11", "101", "111"],
    out_features=2,
)


## Setting up dataset

In [8]:
# Value passed as ImdbDataset's `max_entries`; kept deliberately tiny so the
# demo runs in well under a second.
# NOTE(review): presumably this caps the number of rows read from the file —
# confirm against ImdbDataset before relying on it.
DEMO_MAX_ENTRIES = 10

training_dataset = ImdbDataset(
    path=paths["training_dataset"],
    delimiter="\t",  # the input is a .tsv file
    max_entries=DEMO_MAX_ENTRIES,
)

### Dataset preprocessing

In [9]:
# Transform that the next cell registers under key 0 of the dataset's
# transformation_map.
basic_prado_transform = BasicPradoTransform()

# augmentation_config holds exactly the three keyword arguments this
# constructor is given (insertion/deletion/swap probability), so the mapping
# is unpacked instead of being spelled out key by key.
basic_prado_augmentation = BasicPradoAugmentation(**augmentation_config)

In [10]:
# Wrap the raw dataset so that basic_prado_transform is registered under key 0
# of the transformation map (presumably the text field of each example —
# TODO confirm against ProcessedDataset).
#
# The wrapper is bound to the same name as its input. Without the guard, a
# second run of this cell would wrap the already-processed dataset and push
# transformed data through the transform again; the isinstance check makes the
# cell safe to re-run. Re-running the cell that builds the raw dataset resets
# the name, after which this cell processes it again as usual.
if not isinstance(training_dataset, ProcessedDataset):
    training_dataset = ProcessedDataset(
        original_dataset=training_dataset,
        transformation_map={
            0: basic_prado_transform
        },
        verbosity=1,
    )

Transforming dataset: 100%|██████████| 10/10 [00:00<00:00, 212.37it/s]


## Setting up model

In [11]:
# model_config contains exactly the keyword arguments PradoCore is built with
# (feature_length, embedding_length, dropout, out_channels, skipgram_patterns,
# out_features), so it is unpacked directly into the constructor.
model = PradoCore(**model_config)

## Getting some results

In [12]:
# Second processing layer on top of the transformed dataset: the augmentation
# is lifted with get_elementwise_transformation so that it is applied to every
# element of entry 0 rather than to the entry as a whole.
# NOTE(review): the progress bar printed on construction suggests the
# augmentation is applied once here rather than per epoch — confirm against
# ProcessedDataset.
augmented_dataset = ProcessedDataset(
    original_dataset=training_dataset,
    transformation_map={
        0: get_elementwise_transformation(basic_prado_augmentation),
    },
    verbosity=1,
)

# Batches of two examples, collated with pad_projections.
dataloader = DataLoader(
    augmented_dataset,
    batch_size=2,
    collate_fn=pad_projections,
)

Transforming dataset: 100%|██████████| 10/10 [00:00<00:00, 226.76it/s]


In [13]:
# Run every batch through the untrained model and print the raw outputs.
# NOTE(review): none of the visible cells switch the model to evaluation mode
# or disable gradient tracking (the printed tensors carry a grad_fn). If
# PradoCore follows torch.nn.Module conventions, dropout is active here and
# the numbers will differ between runs — confirm whether that is intended.
for batch in dataloader:
    tokens, labels = batch  # labels are not used for the forward pass
    result = model(tokens)
    print(result)

tensor([[-0.2164,  0.0978],
        [-0.2240,  0.1144]], grad_fn=<AddmmBackward>)
tensor([[-0.1725,  0.1611],
        [-0.2170,  0.1134]], grad_fn=<AddmmBackward>)
tensor([[-0.1918,  0.1222],
        [-0.2045,  0.1279]], grad_fn=<AddmmBackward>)
tensor([[-0.1715,  0.1608],
        [-0.2083,  0.1099]], grad_fn=<AddmmBackward>)
tensor([[-0.2292,  0.0933],
        [-0.1928,  0.1329]], grad_fn=<AddmmBackward>)
