In [None]:
# Load IPython's autoreload extension and select mode 2, so that imported
# modules are reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

# NLP - Natural Language Processing

> Natural Language Processing

In [None]:
#| default_exp nlp

## Initial Checks

In [None]:
#|eval: false
# List the conda packages whose listing line mentions "pytorch".
!conda list | grep "pytorch"

ffmpeg                    4.3                  hf484d3e_0    pytorch
pytorch                   2.0.1           py3.11_cuda11.8_cudnn8.7.0_0    pytorch
pytorch-cuda              11.8                 h7e8668a_5    pytorch
pytorch-ignite            0.4.12                   pypi_0    pypi
pytorch-lightning         2.0.6                    pypi_0    pypi
pytorch-mutex             1.0                        cuda    pytorch
torchaudio                2.0.2               py311_cu118    pytorch
torchtriton               2.0.0                     py311    pytorch
torchvision               0.15.2              py311_cu118    pytorch


In [None]:
#|eval: false
!pip list | grep "fastai" 
!pip list | grep "fastbook"
!pip list | grep "ipywidgets"

fastai                        2.7.12
fastbook                      0.0.29
ipywidgets                    7.7.5


In [None]:
#|eval: false
import torch

In [None]:
#|eval: false
# Ask PyTorch whether a CUDA device is usable from this kernel.
torch.cuda.is_available()

True

In the book, NLP is done using RNNs (recurrent neural networks).

Here we use transformers instead, via the Hugging Face Transformers library.


1. Wikitext - Language Model
2. IMDb - Language Model - use Wikitext as pretraining
3. IMDb - Classifier - use IMDb as pretraining

```sh
pip install kaggle
```

## Kaggle setup

In [None]:
#|eval: false
import os

# for working with paths in Python, I recommend using `pathlib.Path`
from pathlib import Path

In [None]:
#|eval: false
import json

# Non-empty only when the KAGGLE_KERNEL_RUN_TYPE environment variable is set.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

# Never commit API keys to the notebook: read them from the environment.
# Set KAGGLE_USERNAME and KAGGLE_KEY before running this cell, or place a
# kaggle.json file at ~/.kaggle/kaggle.json yourself.
# Any key that was ever committed in this cell should be revoked on Kaggle.
kaggle_username = os.environ.get('KAGGLE_USERNAME', '')
kaggle_key = os.environ.get('KAGGLE_KEY', '')
# `creds` stays a JSON string (empty when no credentials are available).
creds = (
    json.dumps({"username": kaggle_username, "key": kaggle_key})
    if kaggle_username and kaggle_key
    else ''
)

cred_path = Path('~/.kaggle/kaggle.json').expanduser()
if creds and not cred_path.exists():
    cred_path.parent.mkdir(parents=True, exist_ok=True)
    # Create the file with owner-only permissions *before* the secret is written.
    cred_path.touch(mode=0o600)
    cred_path.write_text(creds)
    cred_path.chmod(0o600)

In [None]:
#|eval: false
# Competition name; also used as the name of the local data sub-folder.
path = Path('us-patent-phrase-to-phrase-matching')
# Competition files are kept under ./Data/<competition name>.
dataPath = Path('./Data') / path

In [None]:
#|eval: false
# Download and unpack the competition data once: skipped when running on
# Kaggle (iskaggle is non-empty) or when dataPath already exists.
if not iskaggle and not dataPath.exists():
    import zipfile,kaggle
    kaggle.api.competition_download_cli(str(path))
    # The context manager guarantees the archive handle is closed after extraction.
    with zipfile.ZipFile(f'{path}.zip') as archive:
        archive.extractall(dataPath)

In [None]:
#|eval: false
# Remove the competition zip archive from the working directory if it is present.
file_path = f'{path}.zip'

if not os.path.exists(file_path):
    print(f"File '{file_path}' does not exist.")
else:
    os.remove(file_path)
    print(f"File '{file_path}' has been deleted.")

File 'us-patent-phrase-to-phrase-matching.zip' does not exist.


## Libraries needed for data science

1. NumPy
2. Matplotlib
3. pandas
4. PyTorch

In [None]:
#|eval: false
import pandas as pd

In [None]:
#|eval: false
# Read the training CSV into a DataFrame and display it.
df = pd.read_csv(dataPath.joinpath('train.csv'))
df

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00
...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.00
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.50
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.50
36471,756ec035e694722b,wood article,wooden material,B44,0.75


In [None]:
#|eval: false
# Summarise only the object-dtype columns (count / unique / top / freq).
df.describe(include='object')

Unnamed: 0,id,anchor,target,context
count,36473,36473,36473,36473
unique,36473,733,29340,106
top,37d61fd2272659b1,component composite coating,composition,H01
freq,1,152,24,2186


In [None]:
#|eval: false
# Concatenate context, target and anchor into one labelled text field per row.
df['input'] = (
    'TEXT1: ' + df['context']
    + '; TEXT2: ' + df['target']
    + '; ANC1: ' + df['anchor']
)

In [None]:
#|eval: false
# Preview the first few combined input strings.
df['input'].head()

0    TEXT1: A47; TEXT2: abatement of pollution; ANC...
1    TEXT1: A47; TEXT2: act of abating; ANC1: abate...
2    TEXT1: A47; TEXT2: active catalyst; ANC1: abat...
3    TEXT1: A47; TEXT2: eliminating process; ANC1: ...
4    TEXT1: A47; TEXT2: forest region; ANC1: abatement
Name: input, dtype: object

## Tokenization

In [None]:
#|eval: false
from datasets import Dataset,DatasetDict
from transformers import AutoModelForSequenceClassification,AutoTokenizer



In [None]:
#|eval: false
# Convert the pandas DataFrame into a Hugging Face `Dataset` and display its summary.
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score'],
    num_rows: 36473
})

In [None]:
#|eval: false
# Name of the pretrained checkpoint whose tokenizer is loaded below.
model_nm = 'microsoft/deberta-v3-small'
# Load the tokenizer that belongs to that checkpoint.
tokz = AutoTokenizer.from_pretrained(model_nm)
# Sanity check: show how a sample sentence is split into tokens.
tokz.tokenize("G'day folks, I'm Jeremy from fast.ai!")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


['▁G',
 "'",
 'day',
 '▁folks',
 ',',
 '▁I',
 "'",
 'm',
 '▁Jeremy',
 '▁from',
 '▁fast',
 '.',
 'ai',
 '!']

In [None]:
#|eval: false
# Uncommon words are split into several sub-word pieces (see the output below).
tokz.tokenize("A platypus is an ornithorhynchus anatinus.")

['▁A',
 '▁platypus',
 '▁is',
 '▁an',
 '▁or',
 'ni',
 'tho',
 'rhynch',
 'us',
 '▁an',
 'at',
 'inus',
 '.']

In [None]:
#|eval: false
def tok_func(x):
    """Tokenize the ``input`` field of `x` using the notebook-level tokenizer `tokz`."""
    return tokz(x["input"])

In [None]:
#|eval: false
# Run tok_func over the whole dataset; batched=True hands it batches of rows per call.
tok_ds = ds.map(tok_func, batched=True)

Map:   0%|          | 0/36473 [00:00<?, ? examples/s]

In [None]:
#|eval: false
# Inspect the first row: its input text alongside the token ids produced for it.
row = tok_ds[0]
row['input'], row['input_ids']

('TEXT1: A47; TEXT2: abatement of pollution; ANC1: abatement',
 [1,
  54453,
  435,
  294,
  336,
  5753,
  346,
  54453,
  445,
  294,
  47284,
  265,
  6435,
  346,
  23702,
  435,
  294,
  47284,
  2])

In [None]:
#|eval: false
# Look up the vocabulary id of the token '▁of'.
tokz.vocab['▁of']

265

In [None]:
#|eval: false
# Rename the 'score' column to 'labels'. The guard keeps the cell idempotent:
# re-running it after the rename would otherwise fail because 'score' no
# longer exists in tok_ds.
if 'score' in tok_ds.column_names:
    tok_ds = tok_ds.rename_columns({'score':'labels'})

In [None]:
#|eval: false
# Read the test CSV into a DataFrame and show its summary statistics.
eval_df = pd.read_csv(dataPath.joinpath('test.csv'))
eval_df.describe()

Unnamed: 0,id,anchor,target,context
count,36,36,36,36
unique,36,34,36,29
top,4112d61851461f60,el display,inorganic photoconductor drum,G02
freq,1,2,1,3


## Overfitting

In [None]:
from fastAIcourse.neuralnet import *

In [None]:
??fastAIcourse.neuralnet

Object `fastAIcourse.neuralnet` not found.


In [None]:
def f(x):
    """Evaluate the quadratic -3*x**2 + 2*x + 20 at `x`."""
    quadratic_term = -3 * x**2
    linear_term = 2 * x
    return quadratic_term + linear_term + 20

In [None]:
# NOTE(review): the recorded output of this cell is "NameError: name 'torch' is
# not defined". plot_function is presumably provided by the star import of
# fastAIcourse.neuralnet; that module may be missing `import torch` — TODO confirm.
plot_function(f)

NameError: name 'torch' is not defined