# Setup

In [None]:
# Load necessary libraries
import random
from textwrap import wrap

import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from nltk.corpus import stopwords as nltk_stopwords
from pandas.plotting import register_matplotlib_converters
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from torch import nn
from torch.nn import functional as F
from transformers import AutoModel
from wordcloud import WordCloud, STOPWORDS

In [None]:
%matplotlib inline

# change the font_scale all at once
sns.set(font_scale=1.5)

# use the ggplot theme for our figures
plt.style.use('ggplot')

# setting some visualization parameters
# rcParams must come after style
plt.rcParams['figure.figsize'] = (16, 10)
plt.rcParams['axes.labelsize'] = 15
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [None]:
torch.cuda.is_available()

In [None]:
# Call is_available(): the bare function object is always truthy, which would
# select 'cuda' even on a machine without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
nltk.download("stopwords")

In [None]:
PATH = "data"
df = pd.read_csv(f'{PATH}//IMDB Dataset.csv')
df.head()

In [None]:
# Encode the string labels as integers (positive -> 1, negative -> 0).
# Values that are already encoded pass through unchanged, so re-running this
# cell does not turn the column into NaN the way a plain dict `.map` would.
label_to_id = {'positive': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].map(lambda label: label_to_id.get(label, label))

In [None]:
df.loc[0, 'review']

In [None]:
df.sentiment.value_counts()

# Visualizations

In [None]:
sns.countplot(data=df, x='sentiment');

In [None]:
wc_stopwords = set(STOPWORDS)
wc_stopwords.update(["br"])

In [None]:
# Concatenate all reviews of each class into one space-separated string.
is_positive = df['sentiment'] == 1
is_negative = df['sentiment'] == 0

positive_text = ' '.join(df.loc[is_positive, 'review'].values)
negative_text = ' '.join(df.loc[is_negative, 'review'].values)

In [None]:
def generate_wordcloud(txt):
    """Render a word cloud for ``txt`` and show it with matplotlib.

    Words contained in the module-level ``wc_stopwords`` set are excluded.

    Args:
        txt: Text handed to ``WordCloud.generate``.
    """
    cloud = WordCloud(
        stopwords=wc_stopwords,
        width=1000,
        height=500,
        max_words=500,
        min_font_size=5,
    ).generate(txt)

    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')  # hide ticks and frame around the image
    plt.show()

In [None]:
generate_wordcloud(positive_text)

In [None]:
generate_wordcloud(negative_text)

# Logistic Regression

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['review'],
                                                    df['sentiment'],
                                                    test_size=0.2,
                                                    random_state=42)

In [None]:
X_train.head(), y_train.head()

In [None]:
X_train.shape, y_train.shape, y_test.shape

In [None]:
stopwords_set = set(nltk_stopwords.words('english'))

In [None]:
vectorizer = TfidfVectorizer(stop_words=stopwords_set)
X_train_tf = vectorizer.fit_transform(X_train)
X_test_tf = vectorizer.transform(X_test)

In [None]:
# Persist the fitted vectorizer. Passing the path lets joblib open and close the
# file itself instead of leaving an unclosed handle from a bare open() call.
joblib.dump(vectorizer, 'data/vectorizer.joblib');

In [None]:
len(vectorizer.vocabulary_)

In [None]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=42)
clf.fit(X_train_tf, y_train)

In [None]:
clf.score(X_train_tf, y_train)

In [None]:
clf.score(X_test_tf, y_test)

In [None]:
preds = clf.predict(X_test_tf)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, preds))

In [None]:
# Persist the fitted classifier. Passing the path lets joblib open and close the
# file itself instead of leaving an unclosed handle from a bare open() call.
joblib.dump(clf, 'data/logreg.joblib');

In [None]:
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab_names = vectorizer.get_feature_names_out().tolist()
else:
    vocab_names = vectorizer.get_feature_names()
len(vocab_names)

In [None]:
from sklearn.model_selection import cross_val_score, KFold
cv = KFold(n_splits=10, shuffle=True, random_state=42)
X, y = X_train_tf, y_train
cv_score = cross_val_score(clf, X, y, cv=cv)
print(np.mean(cv_score))

In [None]:
cv_score

# Checking wrong predictions

In [None]:
wrong_preds = y_test[y_test != preds]

In [None]:
wrong_preds.head()

In [None]:
wrong_preds.value_counts()

In [None]:
number_of_reviews = 3
# Label names indexed by the encoded sentiment (0 = negative, 1 = positive).
class_names = ['negative', 'positive']

# Draw distinct misclassified rows, never more than are available.
wrong_idx = list(wrong_preds.index)
indices = random.sample(wrong_idx, min(number_of_reviews, len(wrong_idx)))

for idx in indices:
    # Select the columns by name rather than relying on their order in `df`.
    txt, target = df.loc[idx, ['review', 'sentiment']]
    txt_tf = vectorizer.transform([txt])
    pred_one = clf.predict(txt_tf)[0]
    wrapped_txt = "\n".join(wrap(txt))

    print(f'Review text:\n{wrapped_txt}\n')
    print(f'Actual    : {class_names[target]}')
    print(f'Predicted : {class_names[pred_one]}\n')

## Model Interpretation with ELI5

In [None]:
import eli5

eli5.show_weights(estimator=clf, 
                  feature_names=vocab_names,
                  top=(20,20))

## Tuning

In [None]:
clf.get_params()

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

solvers = ['newton-cg', 'lbfgs']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]


param_grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, 
                           n_jobs=10, cv=cv, scoring='accuracy',
                           error_score=0, refit=True, verbose=3
                          )
grid_result = grid_search.fit(X_train_tf, y_train)


print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print(f'Test score -> {grid_result.score(X_test_tf, y_test)}')

## Other models

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_jobs=10, random_state=42)
clf.fit(X_train_tf, y_train)

In [None]:
depths = [tree.tree_.max_depth for tree in clf.estimators_]
print(f"Mean tree depth in the Random Forest: {np.round(np.mean(depths))}")

In [None]:
clf.score(X_test_tf, y_test)

In [None]:
pred = clf.predict(X_test_tf)

In [None]:
print(classification_report(y_test, pred))

In [None]:
from sklearn.svm import SVC

clf = SVC(random_state=42)
clf.fit(X_train_tf, y_train)

In [None]:
clf.score(X_test_tf, y_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(X_train_tf, y_train)

In [None]:
clf.score(X_test_tf, y_test)

# Deep learning model

## Preprocessing

In [None]:
PRETRAINED_MODEL_NAME = 'distilbert-base-uncased'

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

In [None]:
sample_txt = 'Is this the real life? Is this just fantasy?'

In [None]:
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(f' Sentence: {sample_txt}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

In [None]:
tokenizer.encode(sample_txt)

In [None]:
tokenizer(sample_txt, padding=True, truncation=True, return_tensors="pt")

In [None]:
# Encode the sample sentence to a fixed length of 32 tokens.
encoding = tokenizer.encode_plus(
    sample_txt,
    max_length=32,
    add_special_tokens=True, # Add '[CLS]' and '[SEP]'
    return_token_type_ids=False,
    padding='max_length',  # replaces the deprecated pad_to_max_length=True
    return_attention_mask=True,
    truncation=True,
    return_tensors='pt',  # Return PyTorch tensors
)

In [None]:
print(len(encoding['input_ids'][0]))
encoding['input_ids'][0]

In [None]:
print(len(encoding['attention_mask'][0]))
encoding['attention_mask']

In [None]:
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'][0]))

In [None]:
# Token count of every review, capped at 1024 tokens. Truncation is requested
# explicitly: `max_length` on its own only takes effect through a fallback
# that triggers a warning.
token_lens = [
    len(tokenizer.encode(txt, max_length=1024, truncation=True))
    for txt in df.review
]

In [None]:
# Distribution of the per-review token counts; the x-axis is limited to 0-1024.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases;
# `sns.histplot(token_lens, kde=True)` is the suggested replacement — confirm
# the installed seaborn version before switching.
sns.distplot(token_lens)
plt.xlim([0, 1024])
plt.xlabel('Token count');

In [None]:
# max length of words in one review
df.review.str.split().apply(len).max()

In [None]:
# arbitrary number based on the max token count,
# avoiding long reviews...
MAX_LEN = 512

In [None]:
encoding = tokenizer(sample_txt,
                     padding='max_length',
                     truncation=True,
                     max_length=MAX_LEN,
                     return_tensors="pt")

In [None]:
encoding.keys()

## Dataset and DataLoader

In [None]:
from torch import nn
from torch.utils.data import Dataset, DataLoader

class ReviewDataset(Dataset):
    """Map-style dataset pairing review texts with their sentiment labels.

    Each item is tokenized on access with the tokenizer given to the
    constructor, padded/truncated to ``max_len`` tokens.

    Args:
        reviews: Indexable collection of review texts.
        sentiments: Indexable collection of labels, aligned with ``reviews``.
        tokenizer: Callable accepting ``(text, padding=..., truncation=...,
            max_length=..., return_tensors=...)`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Sequence length every encoding is padded/truncated to.
    """

    def __init__(self, reviews, sentiments, tokenizer, max_len):
        self.reviews = reviews
        self.sentiments = sentiments
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return the raw text, flattened encoding tensors and label of one review."""
        review = str(self.reviews[item])
        sentiment = self.sentiments[item]

        # Use the tokenizer stored on the instance; the previous code silently
        # picked up whatever global `tokenizer` happened to exist.
        encoding = self.tokenizer(review,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len,
                                  return_tensors="pt")

        return {
            'review': review,
            # flatten() drops the leading batch dimension added by return_tensors="pt"
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(sentiment, dtype=torch.long)
        }

In [None]:
ds = ReviewDataset(
    reviews=df.review.to_numpy(),
    sentiments=df.sentiment.to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)
ds[0].keys()

In [None]:
df_train, df_test = train_test_split(df,
                                     test_size=0.2,
                                     stratify=df.sentiment.values,
                                     random_state=42
                                    )
df_val, df_test = train_test_split(df_test,
                                   test_size=0.5,
                                   stratify=df_test.sentiment.values,
                                   random_state=42
                                  )
df_train.shape, df_val.shape, df_test.shape

In [None]:
df_train.sentiment.value_counts(), df_val.sentiment.value_counts(), df_test.sentiment.value_counts()

In [None]:
def create_dl(df, tokenizer, max_len, batch_size, shuffle=False):
    """Build a ``DataLoader`` over the reviews and sentiments of ``df``.

    Args:
        df: DataFrame with ``review`` and ``sentiment`` columns.
        tokenizer: Tokenizer handed to ``ReviewDataset``.
        max_len: Sequence length every encoding is padded/truncated to.
        batch_size: Number of samples per batch.
        shuffle: Whether batches are drawn in a new random order on every
            pass. Defaults to ``False``, which keeps the order of ``df``.

    Returns:
        A ``DataLoader`` wrapping a ``ReviewDataset`` built from ``df``.
    """
    ds = ReviewDataset(
        reviews=df.review.to_numpy(),
        sentiments=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle
    )

In [None]:
BATCH_SIZE = 32

train_data_loader = create_dl(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_dl(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_dl(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
data = next(iter(train_data_loader))
data.keys()

In [None]:
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

In [None]:
encoding = tokenizer(data['review'],
                     padding='max_length',
                     truncation=True,
                     max_length=MAX_LEN,
                     return_tensors="pt")

In [None]:
vocab = tokenizer.get_vocab()
len(vocab)

In [None]:
inv_voc = {v: k for k, v in vocab.items()}

In [None]:
list(inv_voc.items())[:5]

In [None]:
data['review'][0]

In [None]:
" ".join(inv_voc[i] for i in encoding["input_ids"][0].numpy())

# DistilBert Model building

In [None]:
from transformers import AutoModel

model = AutoModel.from_pretrained(PRETRAINED_MODEL_NAME)

In [None]:
output = model(**encoding)

In [None]:
print(dir(output))

In [None]:
output['last_hidden_state'].shape

In [None]:
output[0].shape

In [None]:
output[0][:, 0].shape

In [None]:
model.config.hidden_size

In [None]:
# based on https://www.kaggle.com/alexalex02/sentiment-analysis-distilbert-amazon-reviews/notebook

from transformers import AutoConfig, AutoModel

class DistilBert(nn.Module):
    """Pretrained encoder with a two-layer classification head on top.

    The hidden state of the first token is passed through
    linear -> ReLU -> dropout -> linear to produce ``num_classes`` logits.

    Args:
        pretrained_model_name: Checkpoint name given to ``AutoConfig`` and
            ``AutoModel``.
        num_classes: Size of the output layer.
    """

    def __init__(self, pretrained_model_name=PRETRAINED_MODEL_NAME, num_classes=2):
        super().__init__()
        cfg = AutoConfig.from_pretrained(pretrained_model_name)
        hidden_dim = cfg.dim

        # Attribute names double as state-dict keys; keep them stable.
        self.distilbert = AutoModel.from_pretrained(pretrained_model_name, config=cfg)
        self.pre_classifier = nn.Linear(hidden_dim, hidden_dim)
        self.dropout = nn.Dropout(cfg.seq_classif_dropout)
        self.out = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask=None, head_mask=None):
        """Return the classification logits for a batch of token ids."""
        assert attention_mask is not None, "attention mask is none"

        encoder_out = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
        )
        # Element 0 is the hidden state; position 0 along the sequence axis is
        # the first token, used here as the pooled representation.
        first_token_state = encoder_out[0][:, 0]
        features = self.dropout(F.relu(self.pre_classifier(first_token_state)))
        return self.out(features)

In [None]:
# Fall back to the CPU when no GPU is present instead of hard-coding 'cuda'.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
model = DistilBert(num_classes=2)
model = model.to(device)

In [None]:
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
targets = data['targets'].to(device)

print(input_ids.shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length

In [None]:
with torch.no_grad():
    output = model(input_ids, attention_mask)
    print(torch.max(F.softmax(output.detach(), dim=1), dim=1))

In [None]:
output.shape

In [None]:
output[:5]

In [None]:
a, b = torch.max(output, dim=1)

In [None]:
a, b

In [None]:
targets

In [None]:
torch.sum(b == targets)

In [None]:
!nvidia-smi

## Training

In [None]:
from transformers import get_linear_schedule_with_warmup

EPOCHS = 2

# torch.optim.AdamW replaces transformers.AdamW, which is deprecated upstream in
# favour of the PyTorch implementation. weight_decay=0.0 keeps the default of
# the class it replaces. The PyTorch optimizer always applies Adam bias
# correction, so the former correct_bias=False setting has no equivalent here.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)
# The learning rate decays linearly to zero over every batch of every epoch.
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
    ):
    """Train ``model`` for one pass over ``data_loader``.

    For each batch: forward pass, loss, backward pass, gradient-norm clipping
    to 1.0, optimizer step, scheduler step, gradient reset.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keywords.
        data_loader: Iterable of dicts with ``input_ids``, ``attention_mask``
            and ``targets`` tensors.
        loss_fn: Callable ``(outputs, targets) -> loss tensor``.
        optimizer: Optimizer updating the parameters of ``model``.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Divisor used to turn the correct count into an accuracy.

    Returns:
        ``(accuracy, mean_loss)``: a float64 tensor and the mean batch loss.
    """
    model = model.train()
    batch_losses = []
    n_correct = 0

    for batch in data_loader:
        labels = batch["targets"].to(device)
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        )

        batch_loss = loss_fn(logits, labels)
        n_correct = n_correct + (logits.argmax(dim=1) == labels).sum()
        batch_losses.append(batch_loss.item())

        batch_loss.backward()
        # Clip before stepping so one large gradient cannot blow up the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [None]:
def eval_model(
    model,
    data_loader,
    loss_fn,
    device,
    n_examples
    ):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keywords.
        data_loader: Iterable of dicts with ``input_ids``, ``attention_mask``
            and ``targets`` tensors.
        loss_fn: Callable ``(outputs, targets) -> loss tensor``.
        device: Device the batch tensors are moved to.
        n_examples: Divisor used to turn the correct count into an accuracy.

    Returns:
        ``(accuracy, mean_loss)``: a float64 tensor and the mean batch loss.
    """
    model = model.eval()
    batch_losses, n_correct = [], 0

    with torch.no_grad():
        for batch in data_loader:
            labels = batch["targets"].to(device)
            logits = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )

            n_correct = n_correct + (logits.argmax(dim=1) == labels).sum()
            batch_losses.append(loss_fn(logits, labels).item())

    return n_correct.double() / n_examples, np.mean(batch_losses)

In [None]:
%%time

from collections import defaultdict

# Per-epoch metrics, and the best validation accuracy seen so far.
history = defaultdict(list)
best_acc = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
        )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
        )
    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

    # Store plain Python floats: the accuracies come back as tensors on
    # `device`, which matplotlib cannot plot while they live on the GPU.
    history['train_acc'].append(float(train_acc))
    history['train_loss'].append(float(train_loss))
    history['val_acc'].append(float(val_acc))
    history['val_loss'].append(float(val_loss))

    # Checkpoint only when the validation accuracy improves.
    if val_acc > best_acc:
        torch.save(model.state_dict(), f'{PATH}//best_model_state.bin')
        best_acc = float(val_acc)

In [None]:
# Convert to plain floats first: the stored accuracies may be tensors living on
# the GPU, which matplotlib cannot turn into NumPy arrays.
train_acc_curve = [float(acc) for acc in history['train_acc']]
val_acc_curve = [float(acc) for acc in history['val_acc']]

plt.plot(train_acc_curve, label='Training accuracy')
plt.plot(val_acc_curve, label='Validation accuracy')

plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend();

# Evaluation

In [None]:
state_dict_path = 'data//best_model_state.bin'
class_names = ['negative', 'positive']
PRETRAINED_MODEL_NAME = 'distilbert-base-uncased'

In [None]:
model = DistilBert(PRETRAINED_MODEL_NAME, len(class_names))
model.load_state_dict(
    torch.load(state_dict_path, map_location=device)
)
model = model.to(device)
model = model.eval()

In [None]:
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
test_acc, _ = eval_model(
    model,
    test_data_loader,
    loss_fn,
    device,
    len(df_test)
    )

test_acc.item()

In [None]:
test_acc

In [None]:
next(iter(test_data_loader)).keys()

In [None]:
def get_predictions(model, data_loader, device=None):
    """Run ``model`` over ``data_loader`` and collect its predictions.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keywords.
        data_loader: Iterable of dicts with ``review``, ``input_ids``,
            ``attention_mask`` and ``targets`` entries.
        device: Device the inputs are moved to. When omitted, the device of
            the model's first parameter is used (CPU if it has none), so the
            function no longer depends on a global ``device`` variable.

    Returns:
        ``(review_texts, predictions, prediction_probs, real_preds)``: the
        list of review texts, followed by CPU tensors holding the predicted
        class per review, the softmax probabilities per review, and the
        target labels.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model = model.eval()

    review_texts = []
    pred_batches = []
    prob_batches = []
    target_batches = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            review_texts.extend(d["review"])
            # Keep one CPU tensor per batch rather than one device tensor per
            # row; the batches are concatenated once at the end.
            pred_batches.append(outputs.argmax(dim=1).cpu())
            prob_batches.append(F.softmax(outputs, dim=1).cpu())
            target_batches.append(d["targets"].cpu())

    predictions = torch.cat(pred_batches)
    prediction_probs = torch.cat(prob_batches)
    real_preds = torch.cat(target_batches)
    return review_texts, predictions, prediction_probs, real_preds

In [None]:
y_review, y_pred, y_pred_probs, y_test = get_predictions(
  model,
  test_data_loader
)

In [None]:
from sklearn.metrics import classification_report

class_names = ['negative', 'positive']
print(classification_report(y_test, y_pred, target_names=class_names))

In [None]:
from sklearn.metrics import confusion_matrix

def show_confusion_matrix(confusion_matrix):
    """Draw ``confusion_matrix`` as an annotated heatmap on a new 8x6 figure.

    Args:
        confusion_matrix: Table of integer counts passed to ``sns.heatmap``.
    """
    _, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set_ylabel('True sentiment')
    ax.set_xlabel('Predicted sentiment')

cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

In [None]:
print(cm); display(df_cm)

# Predicting Random Reviews

In [None]:
import random

# randrange excludes the upper bound; randint(0, len(y_test)) could return
# len(y_test) itself and index one past the last prediction.
idx = random.randrange(len(y_test))

review = y_review[idx]
true_sentiment = y_test[idx]
pred_df = pd.DataFrame({
    'class_names': class_names,
    # plain NumPy floats so pandas/seaborn get numeric values, not tensors
    'values': y_pred_probs[idx].numpy()
    })

In [None]:
from textwrap import wrap

print("\n".join(wrap(review)))
print()
print(f'True sentiment: {class_names[true_sentiment]}')

In [None]:
plt.figure(figsize=(8, 5))
sns.barplot(x='values', y='class_names', data=pred_df)
plt.ylabel('sentiment')
plt.xlabel('probability');

In [None]:
# randrange excludes the upper bound, so idx is always a valid row position
# (randint(0, len(df)) could return len(df) itself).
idx = random.randrange(len(df))
review = df.review.iloc[idx]

encoded_review = tokenizer(
    review,
    padding='max_length',
    truncation=True,
    max_length=MAX_LEN,
    return_tensors="pt")

In [None]:
input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

with torch.no_grad():
    output = model(input_ids, attention_mask)
    _, prediction = torch.max(output, dim=1)

wrapped_review = "\n".join(wrap(review))
print(f'Review text: {wrapped_review}\n')
print(f'Sentiment  : {class_names[prediction]}')

# Check incorrect predictions

In [None]:
wrong_pred_idx = np.where(y_test != y_pred)[0]
len(wrong_pred_idx)

In [None]:
sorted(y_pred_probs[wrong_pred_idx].numpy(), key=lambda x: x.max(), reverse=True)[:5]

In [None]:
prob_idx_dict = {k: v.numpy() for k, v in zip(wrong_pred_idx, y_pred_probs[wrong_pred_idx])}

In [None]:
sorted_wrong_pred = sorted(prob_idx_dict.items(), 
                           key=lambda x: x[1].max(),
                           reverse=True)

In [None]:
sorted_wrong_pred[:5]

In [None]:
top_k = 5

for idx, _ in sorted_wrong_pred[:top_k]:
    wrapped_review = "\n".join(wrap(y_review[idx]))

    print(f'Review text:\n{wrapped_review}\n')
    print(f'Actual    : {class_names[y_test[idx]]}')
    print(f'Predicted : {class_names[y_pred[idx]]}\n')
    # break

In [None]:
review_str = 'This sleek, sexy movie is a must-see'
df[df.review.str.contains(review_str)]

**The original labels for reviews like these appear to be wrong.**

If these labels were corrected, the measured accuracy of the model would likely be higher,
as the predictions shown above suggest.