# Deep Learning in Natural Language Processing

The task is to implement a neural network that classifies Amazon product reviews into product categories.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the training and test review tables from the working directory.
train_dataframe, test_dataframe = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

# Preview the first training rows.
train_dataframe.head()

Unnamed: 0,Title,Helpfulness,Score,Text,Category
0,Golden Valley Natural Buffalo Jerky,0/0,3.0,The description and photo on this product need...,grocery gourmet food
1,Westing Game,0/0,5.0,This was a great book!!!! It is well thought t...,toys games
2,Westing Game,0/0,5.0,"I am a first year teacher, teaching 5th grade....",toys games
3,Westing Game,0/0,5.0,I got the book at my bookfair at school lookin...,toys games
4,I SPY A is For Jigsaw Puzzle 63pc,2/4,5.0,Hi! I'm Martine Redman and I created this puzz...,toys games


In [3]:
def preprocess_score_inplace(df):
    """
    Rescale the 'Score' column of ``df`` in place and return ``df``.

    Scores are expected to lie between 1.0 and 5.0, so subtracting the
    lowest score and dividing by the width of the scale maps them
    onto the interval from 0 to 1.
    """
    lowest_score = 1.0
    score_span = 4.0
    df['Score'] = (df['Score'] - lowest_score) / score_span
    return df

def preprocess_helpfulness_inplace(df):
    """
    Convert the 'Helpfulness' column from "helpful/total" text to a ratio.

    The column is overwritten in place with ``helpful / total`` (a float
    from 0 to 1 for well-formed input) and ``df`` is returned.

    Edge cases:
    * A total of 0 is replaced with 1, so the ratio becomes 0.0 instead of
      raising a division-by-zero error.
    * Blank, missing or otherwise unparsable entries are treated as
      "0 helpful out of 0", i.e. they also yield 0.0 instead of crashing
      the integer conversion.
    """
    import pandas as pd  # local import keeps the function self-contained

    # At most one split: column 0 is the helpful count, column 1 the total.
    parts = df['Helpfulness'].astype(str).str.split('/', n=1, expand=True)
    # Guarantee both columns exist even when no entry contained a '/'.
    parts = parts.reindex(columns=[0, 1])

    helpful = pd.to_numeric(parts[0], errors='coerce').fillna(0)
    total = pd.to_numeric(parts[1], errors='coerce').fillna(0)
    # Substitute 1 for non-positive totals to avoid dividing by zero.
    total = total.where(total > 0, 1)

    df['Helpfulness'] = helpful / total
    return df

In [4]:
def concat_title_text_inplace(df):
    """
    Prefix every review text with its title and remove the title column.

    'Text' becomes "<Title> <Text>" and 'Title' is deleted from ``df``.
    The frame is modified in place and also returned.
    """
    title_and_text = df['Title'] + " " + df['Text']
    del df['Title']
    df['Text'] = title_and_text
    return df

In [5]:
# Category name -> class index; the position in the tuple is the index.
cat2idx = {
    name: idx
    for idx, name in enumerate((
        'toys games',
        'health personal care',
        'beauty',
        'baby products',
        'pet supplies',
        'grocery gourmet food',
    ))
}
# Class index -> category name (inverse of cat2idx).
idx2cat = {idx: name for name, idx in cat2idx.items()}

In [6]:
def encode_categories(df):
    """
    Replace category names in 'Category' with their integer class index.

    The lookup goes through ``cat2idx``; a name that is not a key of that
    mapping raises ``KeyError``. ``df`` is modified in place and returned.
    """
    def _to_index(category_name):
        return cat2idx[category_name]

    df['Category'] = df['Category'].apply(_to_index)
    return df

In [7]:
# Try the column-level preprocessing on a small copy of the training data.
train_copy = train_dataframe.head().copy()

# Each helper modifies and returns the same frame, so they can be chained.
(
    train_copy
    .pipe(concat_title_text_inplace)
    .pipe(preprocess_helpfulness_inplace)
    .pipe(preprocess_score_inplace)
    .pipe(encode_categories)
)

Unnamed: 0,Helpfulness,Score,Text,Category
0,0.0,0.5,Golden Valley Natural Buffalo Jerky The descri...,5
1,0.0,1.0,Westing Game This was a great book!!!! It is w...,0
2,0.0,1.0,"Westing Game I am a first year teacher, teachi...",0
3,0.0,1.0,Westing Game I got the book at my bookfair at ...,0
4,0.5,1.0,I SPY A is For Jigsaw Puzzle 63pc Hi! I'm Mart...,0


### Text cleaning

In [8]:
import re

def lower_text(text: str) -> str:
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

def remove_numbers(text: str):
    """
    Replace every run of digits in ``text`` with a single space.

    A space (rather than the empty string) is used so that words glued
    together by a number stay separate:
    "there is5dogs" -> "there is dogs", not "there isdogs".
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub(' ', text)

def remove_punctuation(text: str):
    """
    Substitute all punctiations with space in case of
    "hello!nice to meet you"
    
    If subs with '' -> "hellonice to meet you"
    With ' ' -> "hello nice to meet you"
    """
    text_nopunct = re.sub(r'[^a-z|\s]+', ' ', text)
    return text_nopunct

def remove_multiple_spaces(text: str):
    """
    Collapse every run of whitespace in ``text`` into one space and strip
    leading and trailing whitespace.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    text_no_doublespace = re.sub(r'\s+', ' ', text).strip()
    return text_no_doublespace

In [9]:
# Take the review with row label 4 from the small demo frame.
sample_text = train_copy['Text'][4]

# Apply the cleaning steps one at a time, keeping each intermediate result.
_lowered = lower_text(sample_text)
_without_numbers = remove_numbers(_lowered)
_without_punct = remove_punctuation(_without_numbers)
_single_spaced = remove_multiple_spaces(_without_punct)

# Show the text after every step, separated by a dashed rule.
print(
    sample_text,
    _lowered,
    _without_numbers,
    _without_punct,
    _single_spaced,
    sep='\n' + '-'*10 + '\n',
)

I SPY A is For Jigsaw Puzzle 63pc Hi! I'm Martine Redman and I created this puzzle for Briarpatch using a great photo from Jean Marzollo and Walter Wick's terrific book, I Spy School Days. Kids need lots of practice to master the ABC's, and this puzzle provides an enjoyable reinforcing tool. Its visual richness helps non-readers and readers alike to remember word associations, and the wealth of cleverly chosen objects surrounding each letter promote language development. The riddle included multiplies the fun of assembling this colorful puzzle. For another great Briarpatch puzzle, check out I Spy Blocks. END
----------
i spy a is for jigsaw puzzle 63pc hi! i'm martine redman and i created this puzzle for briarpatch using a great photo from jean marzollo and walter wick's terrific book, i spy school days. kids need lots of practice to master the abc's, and this puzzle provides an enjoyable reinforcing tool. its visual richness helps non-readers and readers alike to remember word associa

In [10]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus (skipped by NLTK when already up to date);
# this must run before the corpus is read on the next line.
nltk.download('stopwords')
# English stop words as a set for fast membership tests.
stopwords_set = set(stopwords.words('english'))

from nltk.stem import PorterStemmer
# Single shared stemmer instance used by stem_words below.
ps = PorterStemmer()

def tokenize_text(text: str):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def remove_stop_words(tokenized_text: list[str]):
    """Return the tokens that are not in ``stopwords_set``, in order."""
    kept_tokens = []
    for token in tokenized_text:
        if token in stopwords_set:
            continue
        kept_tokens.append(token)
    return kept_tokens

def stem_words(tokenized_text: list[str]):
    """Return a list with every token replaced by its Porter stem."""
    return list(map(ps.stem, tokenized_text))



[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Continue the demo from the cleaned sample text: tokenise, drop stop
# words, then stem, keeping each intermediate result.
_tokenized = tokenize_text(_single_spaced)
_without_sw = remove_stop_words(_tokenized)
_stemmed = stem_words(_without_sw)

# Show the result of every step, separated by a dashed rule.
print(
    _single_spaced,
    _tokenized,
    _without_sw,
    _stemmed,
    sep='\n' + '-'*10 + '\n',
)

i spy a is for jigsaw puzzle pc hi i m martine redman and i created this puzzle for briarpatch using a great photo from jean marzollo and walter wick s terrific book i spy school days kids need lots of practice to master the abc s and this puzzle provides an enjoyable reinforcing tool its visual richness helps non readers and readers alike to remember word associations and the wealth of cleverly chosen objects surrounding each letter promote language development the riddle included multiplies the fun of assembling this colorful puzzle for another great briarpatch puzzle check out i spy blocks end
----------
['i', 'spy', 'a', 'is', 'for', 'jigsaw', 'puzzle', 'pc', 'hi', 'i', 'm', 'martine', 'redman', 'and', 'i', 'created', 'this', 'puzzle', 'for', 'briarpatch', 'using', 'a', 'great', 'photo', 'from', 'jean', 'marzollo', 'and', 'walter', 'wick', 's', 'terrific', 'book', 'i', 'spy', 'school', 'days', 'kids', 'need', 'lots', 'of', 'practice', 'to', 'master', 'the', 'abc', 's', 'and', 'this

In [12]:
def preprocessing_stage(text):
    """
    Run the full text-cleaning pipeline on one raw string.

    Steps, in order: lowercase, replace digits, replace punctuation,
    collapse whitespace, tokenise, drop stop words, stem.
    Returns the list of stemmed tokens.
    """
    pipeline = (
        lower_text,
        remove_numbers,
        remove_punctuation,
        remove_multiple_spaces,
        tokenize_text,
        remove_stop_words,
        stem_words,
    )
    result = text
    for step in pipeline:
        result = step(result)
    return result

def clean_text_inplace(df):
    """
    Replace each 'Text' entry with the token list produced by
    ``preprocessing_stage``. ``df`` is modified in place and returned.
    """
    cleaned_tokens = df['Text'].apply(preprocessing_stage)
    df['Text'] = cleaned_tokens
    return df

def preprocess(df):
    """
    Run the complete preprocessing pipeline and return a new DataFrame.

    Steps: fill missing titles/texts with a blank, normalise 'Score' and
    'Helpfulness', merge 'Title' into 'Text', encode 'Category' when that
    column is present (frames without it skip the step), and turn
    'Text' into a list of cleaned, stemmed tokens.

    The input frame is left untouched: the helpers mutate their argument,
    so they are applied to a copy. This keeps the calling cell re-runnable
    (a second run on an already processed frame would otherwise fail
    because 'Title' has been dropped).
    """
    df = df.copy()

    # Only the text columns get the blank placeholder; filling numeric
    # columns with a string would break the arithmetic that follows.
    for column in ('Title', 'Text'):
        if column in df.columns:
            df[column] = df[column].fillna(" ")

    df = preprocess_score_inplace(df)
    df = preprocess_helpfulness_inplace(df)
    df = concat_title_text_inplace(df)

    if 'Category' in df.columns:
        df = encode_categories(df)

    return clean_text_inplace(df)
    

In [13]:
# Run the full preprocessing pipeline on both tables (train first).
train_preprocessed, test_preprocessed = map(
    preprocess, (train_dataframe, test_dataframe)
)

# Preview the processed training rows.
train_preprocessed.head()

Unnamed: 0,Helpfulness,Score,Text,Category
0,0.0,0.5,"[golden, valley, natur, buffalo, jerki, descri...",5
1,0.0,1.0,"[west, game, great, book, well, thought, easil...",0
2,0.0,1.0,"[west, game, first, year, teacher, teach, th, ...",0
3,0.0,1.0,"[west, game, got, book, bookfair, school, look...",0
4,0.5,1.0,"[spi, jigsaw, puzzl, pc, hi, martin, redman, c...",0


In [14]:
from sklearn.model_selection import train_test_split

# Fraction of the preprocessed training data held out for validation.
ratio = 0.2
# Stratify on the category so both parts keep the same class proportions;
# the fixed random_state makes the split repeatable.
train, val = train_test_split(
    train_preprocessed,
    test_size=ratio,
    stratify=train_preprocessed['Category'],
    random_state=420,
)

In [15]:
from torchtext.vocab import GloVe

def yield_tokens(df):
    """
    Yield the 'Text' entry (the token list) of every row of ``df``.

    The column is selected by name, so the function works for any frame
    that has a 'Text' column regardless of column order.
    """
    for tokens in df['Text']:
        yield tokens
        
# Pre-trained GloVe "6B" vectors with 50 dimensions; the archive is
# downloaded into .vector_cache on first use. The batch collation code
# fills tensors of width embed_len from these vectors, so embed_len has
# to equal this dim.
vocab = GloVe(name='6B', dim=50)

.vector_cache/glove.6B.zip: 862MB [02:39, 5.41MB/s]                               
100%|█████████▉| 399999/400000 [00:18<00:00, 21429.69it/s]


In [16]:
# Look up the token list of the training row labelled 2 (label-based, so
# that row has to be part of the training split).
sample = train.loc[2, 'Text']
print(sample)

# One embedding vector per token.
vectors = vocab.get_vecs_by_tokens(sample)
print(len(sample), vectors.shape)

['west', 'game', 'first', 'year', 'teacher', 'teach', 'th', 'grade', 'special', 'read', 'class', 'high', 'comprehens', 'level', 'read', 'book', 'one', 'best', 'thing', 'taught', 'year', 'expand', 'mind', 'allow', 'put', 'charact', 'place', 'easi', 'student', 'make', 'mind', 'movi', 'even', 'use', 'whole', 'read', 'class', 'time', 'order', 'finish', 'book', 'student', 'wait', 'hear', 'end', 'excel', 'book', 'read', 'everi', 'year', 'student']
51 torch.Size([51, 50])


In [51]:
import torch
from torch.utils.data import DataLoader

# Seed torch's global random number generator for reproducibility.
torch.manual_seed(420)

# Tokens kept per review and width of one token embedding.
max_words, embed_len = 50, 50
# Rows per training / validation batch.
batch_size = 1024

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def collate_batch(batch):
    """
    Turn a list of labelled rows into one training batch.

    Each row is unpacked into four fields; the third is the token list and
    the fourth the class label (the first two are ignored). Every token
    list is truncated or padded with empty-string tokens to exactly
    ``max_words`` entries and embedded with ``vocab``.

    Returns:
        (labels, features) on ``device``: ``labels`` is an int64 tensor
        with one entry per row, ``features`` has shape
        ``(len(batch), max_words * embed_len)``.
    """
    labels = []
    embeddings_tensor = torch.zeros(len(batch), max_words, embed_len)
    for i, (_, _, row_tokens, row_label) in enumerate(batch):
        # Slicing builds a new list, so the padding below never touches the
        # token list stored in the dataset (an in-place `+=` on the original
        # list would permanently pad the underlying data).
        tokens = list(row_tokens[:max_words])
        tokens += [""] * (max_words - len(tokens))

        labels.append(row_label)
        embeddings_tensor[i] = vocab.get_vecs_by_tokens(tokens)

    labels = torch.tensor(labels, dtype=torch.int64)
    # Flatten (max_words, embed_len) into one feature vector per row.
    return labels.to(device), embeddings_tensor.reshape(len(batch), -1).to(device)

# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(
    dataset=train.to_numpy(),
    collate_fn=collate_batch,
    shuffle=True,
    batch_size=batch_size,
)

# Validation batches keep their original order.
val_dataloader = DataLoader(
    dataset=val.to_numpy(),
    collate_fn=collate_batch,
    shuffle=False,
    batch_size=batch_size,
)

# Defining Network

In [78]:
import torch.nn as nn

class TextClassificationModel(nn.Module):
    """
    Fully connected classifier over flattened token embeddings.

    The input is a batch of vectors of length ``max_words * embed_len``;
    the output has one raw score (logit) per class.
    """

    def __init__(self, num_classes):
        super().__init__()

        flat_features = max_words * embed_len
        layers = [
            # input dropout + first hidden block
            nn.Dropout(0.3),
            nn.Linear(flat_features, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(),
            nn.Dropout(0.3),
            # second hidden block
            nn.Linear(256, 64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(),
            nn.Dropout(0.1),
            # output layer
            nn.Linear(64, num_classes),
        ]
        self.head = nn.Sequential(*layers)

    def forward(self, batch):
        logits = self.head(batch)
        return logits

In [79]:
from tqdm.autonotebook import tqdm

def train_one_epoch(
    model,
    loader,
    optimizer,
    loss_fn,
    epoch_num=-1
):
    """
    Train ``model`` for one pass over ``loader``.

    Args:
        model: network to optimise; it is switched to training mode.
        loader: iterable of ``(labels, texts)`` batches.
        optimizer: optimizer that updates the model parameters.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar
            loss; the running average shown in the progress bar assumes it
            is the mean over the batch (the default reduction of
            ``torch.nn.CrossEntropyLoss``).
        epoch_num: epoch index, used only for the progress-bar label.
    """
    loop = tqdm(
        loader,
        total=len(loader),
        # Use the argument, not a global loop variable of the caller.
        desc=f"Epoch {epoch_num}: train",
        leave=True,
    )
    model.train()
    running_loss = 0.0
    seen = 0
    for labels, texts in loop:
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward pass and loss calculation
        outputs = model(texts)
        loss = loss_fn(outputs, labels)

        # backward pass and parameter update
        loss.backward()
        optimizer.step()

        # Weight the batch mean by the batch size so the displayed value is
        # the average loss per sample over everything seen so far.
        n_samples = labels.size(0)
        running_loss += loss.item() * n_samples
        seen += n_samples
        loop.set_postfix({"loss": running_loss / seen})

def val_one_epoch(
    model,
    loader,
    loss_fn,
    epoch_num=-1,
    best_so_far=0.0,
    ckpt_path='best.pt'
):
    """
    Evaluate ``model`` on ``loader`` and checkpoint it when it improves.

    Args:
        model: network to evaluate; it is switched to evaluation mode.
        loader: iterable of ``(labels, texts)`` batches.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar
            loss; the running average shown in the progress bar assumes it
            is the mean over the batch.
        epoch_num: epoch index, used only for the progress-bar label.
        best_so_far: best validation accuracy reached in earlier epochs.
        ckpt_path: file the model ``state_dict`` is saved to when the
            accuracy of this epoch exceeds ``best_so_far``.

    Returns:
        The accuracy of this epoch if it beats ``best_so_far`` (the
        checkpoint is written in that case), otherwise ``best_so_far``.
        An empty loader leaves ``best_so_far`` unchanged.
    """
    loop = tqdm(
        loader,
        total=len(loader),
        # Use the argument, not a global loop variable of the caller.
        desc=f"Epoch {epoch_num}: val",
        leave=True,
    )
    running_loss = 0.0
    correct = 0
    seen = 0
    model.eval()  # evaluation mode
    with torch.no_grad():
        for labels, texts in loop:
            # forward pass and loss calculation
            outputs = model(texts)
            loss = loss_fn(outputs, labels)

            # index of the highest score per row is the predicted class
            _, predicted = torch.max(outputs, 1)
            n_samples = labels.size(0)
            seen += n_samples
            correct += (predicted == labels).sum().item()

            # Weight the batch mean by the batch size (per-sample average).
            running_loss += loss.item() * n_samples
            loop.set_postfix({"loss": running_loss / seen, "acc": correct / seen})

    # Nothing was evaluated: no accuracy to compare, avoid dividing by zero.
    if seen == 0:
        return best_so_far

    accuracy = correct / seen
    # Compare against the argument, not a global of the calling cell.
    if accuracy > best_so_far:
        torch.save(model.state_dict(), ckpt_path)
        return accuracy

    return best_so_far

In [80]:
# Number of passes over the training data.
epochs = 50

# One output score per product category; the model lives on `device`.
model = TextClassificationModel(num_classes=len(cat2idx))
model = model.to(device)

# Adam with its default hyper-parameters and cross-entropy on the logits.
optimizer = torch.optim.Adam(params=model.parameters())
loss_fn = torch.nn.CrossEntropyLoss()

In [81]:
# Start below any reachable accuracy so the first epoch always checkpoints.
best = float('-inf')
for epoch in range(epochs):
    train_one_epoch(
        model=model,
        loader=train_dataloader,
        optimizer=optimizer,
        loss_fn=loss_fn,
        epoch_num=epoch,
    )
    # Keep the best validation accuracy seen so far.
    best = val_one_epoch(
        model=model,
        loader=val_dataloader,
        loss_fn=loss_fn,
        epoch_num=epoch,
        best_so_far=best,
    )

Epoch 0: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 0: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 1: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 1: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 2: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 2: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 3: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 3: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 4: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 4: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 5: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 5: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 6: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 6: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 7: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 7: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 8: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 8: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 9: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 9: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 10: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 10: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 11: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 11: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 12: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 12: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 13: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 13: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 14: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 14: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 15: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 15: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 16: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 16: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 17: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 17: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 18: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 18: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 19: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 19: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 20: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 20: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 21: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 21: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 22: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 22: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 23: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 23: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 24: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 24: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 25: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 25: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 26: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 26: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 27: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 27: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 28: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 28: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 29: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 29: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 30: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 30: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 31: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 31: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 32: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 32: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 33: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 33: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 34: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 34: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 35: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 35: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 36: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 36: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 37: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 37: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 38: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 38: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 39: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 39: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 40: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 40: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 41: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 41: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 42: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 42: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 43: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 43: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 44: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 44: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 45: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 45: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 46: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 46: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 47: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 47: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 48: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 48: val:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 49: train:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 49: val:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
def collate_test_batch(batch):
    """
    Turn a list of unlabelled rows into one inference batch.

    Each row is unpacked into four fields; the fourth is the token list
    (the first three are ignored). Every token list is truncated or padded
    with empty-string tokens to exactly ``max_words`` entries and embedded
    with ``vocab``.

    This function has its own name so that it does not replace the
    ``collate_batch`` used by the training and validation loaders, which
    returns labels as well.

    Returns:
        Tensor of shape ``(len(batch), max_words * embed_len)`` on
        ``device``.
    """
    embeddings_tensor = torch.zeros(len(batch), max_words, embed_len)
    for i, (_, _, _, row_tokens) in enumerate(batch):
        # Slicing builds a new list, so the padding below never touches the
        # token list stored in the dataset.
        tokens = list(row_tokens[:max_words])
        tokens += [""] * (max_words - len(tokens))
        embeddings_tensor[i] = vocab.get_vecs_by_tokens(tokens)

    # Flatten (max_words, embed_len) into one feature vector per row.
    return embeddings_tensor.reshape(len(batch), -1).to(device)

# Keep the original row order so predictions line up with the test rows.
test_dataloader = DataLoader(
    test_preprocessed.to_numpy(), batch_size=128, shuffle=False, collate_fn=collate_test_batch
)

In [33]:
def predict(
    model,
    loader,
):
    """
    Run ``model`` over every batch of ``loader`` and collect the predictions.

    The model is switched to evaluation mode and gradients are disabled.
    For each row the index of the highest output score is taken as the
    predicted class.

    Returns:
        A flat Python list of predicted class indices, in loader order.
    """
    progress = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc="Predictions",
        leave=True,
    )
    collected = []
    with torch.no_grad():
        model.eval()  # evaluation mode
        for _, texts in progress:
            # forward pass only; no loss is computed here
            logits = model(texts)

            best_classes = torch.max(logits.data, 1)[1]
            collected.extend(best_classes.detach().cpu().tolist())

    return collected

In [34]:
# Load the best checkpoint written during validation. map_location moves
# the stored tensors onto the current device, so a checkpoint saved on a
# GPU can also be loaded on a CPU-only machine.
ckpt = torch.load("best.pt", map_location=device)
model.load_state_dict(ckpt)

<All keys matched successfully>

In [35]:
# Predicted class index for every row of the test set, in order.
predictions = predict(model=model, loader=test_dataloader)

Predictions:   0%|          | 0/79 [00:00<?, ?it/s]

In [36]:
# Translate the predicted class indices back into category names.
results = pd.Series([idx2cat[class_idx] for class_idx in predictions])
results