# Performing Sentiment Analysis on Text Data

In [1]:
import numpy as np
import pandas as pd

df = pd.read_json("data/reviews_5_balanced.json.gz", lines=True)
df.sample(5)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewText,summary,unixReviewTime
86841,1,True,"05 23, 2015",A2W1XFPQPJFMRI,B0009JMW1C,Didn't do anything for me,One Star,1432339200
89784,2,True,"06 6, 2017",A3Q0RNCFOMFA1A,B001ASC022,The taste didn't do anything for me,Two Stars,1496707200
115834,2,True,"09 22, 2014",A120QLSB8F9LRW,B00CGSF5A4,RETURNED,Two Stars,1411344000
72339,2,True,"01 28, 2017",A1LMMPPG9NCNWH,B013VTS40E,I lose the cellphone signal,Two Stars,1485561600
271537,5,True,"03 23, 2016",ALTD6YE7J2AKU,B007BRGT6Y,"Worked great, no complaints.",Good product,1458691200


### Blueprint: Performing Sentiment Analysis using Lexicon-Based approaches

In [4]:
import nltk
from nltk.corpus import opinion_lexicon
from nltk.tokenize import word_tokenize

nltk.download("opinion_lexicon")

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /Users/alextanhongpin/nltk_data...
[nltk_data]   Unzipping corpora/opinion_lexicon.zip.


True

In [5]:
print("Total number of words in opinion lexicon", len(opinion_lexicon.words()))
print("Examples of positive words in opinion lexicon", opinion_lexicon.positive()[:5])
print("Examples of negative words in opinion lexicon", opinion_lexicon.negative()[:5])

Total number of words in opinion lexicon 6789
Examples of positive words in opinion lexicon ['a+', 'abound', 'abounds', 'abundance', 'abundant']
Examples of negative words in opinion lexicon ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable']


In [10]:
df.rename(columns={"reviewText": "text"}, inplace=True)
pos_score = 1
neg_score = -1
word_dict = {}

# Adding the positive words to the dictionary.
for word in opinion_lexicon.positive():
    word_dict[word] = pos_score

# Adding the negative words to the dictionary.
for word in opinion_lexicon.negative():
    word_dict[word] = neg_score


def bing_liu_score(text):
    sentiment_score = 0
    bag_of_words = word_tokenize(text.lower())

    for word in bag_of_words:
        if word in word_dict:
            sentiment_score += word_dict[word]
    return sentiment_score / len(bag_of_words)

In [11]:
df["bing_liu_score"] = df["text"].apply(bing_liu_score)
df[["asin", "text", "bing_liu_score"]].sample(2)

Unnamed: 0,asin,text,bing_liu_score
192836,B005UQSV3W,ok,0.0
70036,B00YG8MLOA,It broke the first time I used it.,-0.111111


In [18]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(-1, 1))

df[["bing_liu_score"]] = scaler.fit_transform(df[["bing_liu_score"]])
df.groupby("overall").agg({"bing_liu_score": "mean"})

Unnamed: 0_level_0,bing_liu_score
overall,Unnamed: 1_level_1
1,-0.049764
2,-0.000157
4,0.238446
5,0.295418


## Supervised Learning Approaches

### Preparing Data for a Supervised Learning Approach

In [21]:
df = pd.read_json("data/reviews_5_balanced.json.gz", lines=True)

# Assigning a new [1, 0] target class label based on the product rating.
df["sentiment"] = 0
df.loc[df["overall"] > 3, "sentiment"] = 1
df.loc[df["overall"] < 3, "sentiment"] = 0

# Removing unnecessary columns to keep a simple DataFrame.
df.drop(
    columns=["reviewTime", "unixReviewTime", "overall", "reviewerID", "summary"],
    inplace=True,
)
df.sample(3)

Unnamed: 0,verified,asin,reviewText,sentiment
160475,True,B00OL1UWCK,Another TH product that I love.,1
185989,True,B0002Q80GS,Easy to install on my Honda RL,1
116814,True,B00JJAI5GS,Battery died in the remote.,0


### Blueprint: Vectorizing Text Data and Applying a Supervised Machine Learning Algorithm

**Step 1: Data Preparation**

In [24]:
%run preprocess.py
%run spacy_preprocess.py

df.rename(columns={"reviewText": "text"}, inplace=True)
df["text_orig"] = df["text"].copy()

In [25]:
from tqdm.notebook import tqdm

df["text"] = df["text"].apply(clean)

clean = []
for doc in tqdm(nlp.pipe(df["text"].values), total=len(df["text"])):
    # From Blueprint function
    lemmas = extract_lemmas(
        doc,
        exclude_pos=["PART", "PUNCT", "DET", "PRON", "SYM", "SPACE", "NUM"],
        filter_stops=True,
        filter_nums=True,
        filter_punct=True,
    )
    clean.append(lemmas)

df["text"] = clean

  0%|          | 0/294240 [00:00<?, ?it/s]

In [35]:
# Remove observations that are empty after the cleaning step.
df = df[df["text"].str.len() != 0]
df["text"] = df["text"].map(lambda x: " ".join(x))

**Step 2: Train-Test Split**

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["sentiment"],
    test_size=0.2,
    random_state=42,
    stratify=df["sentiment"],
)

print("Size of training data", X_train.shape[0])
print("Size of test data", X_test.shape[0])

print("Distribution of classes in training data:")
print("Positive sentiment", str(sum(y_train == 1) / len(y_train) * 100.0))
print("Negative sentiment", str(sum(y_train == 0) / len(y_train) * 100.0))

print("Distribution of classes in testing data:")
print("Positive sentiment", str(sum(y_test == 1) / len(y_test) * 100.0))
print("Negative sentiment", str(sum(y_test == 0) / len(y_test) * 100.0))

Size of training data 233966
Size of test data 58492
Distribution of classes in training data:
Positive sentiment 51.00612909568056
Negative sentiment 48.99387090431943
Distribution of classes in testing data:
Positive sentiment 51.00526567735759
Negative sentiment 48.994734322642415


**Step 3: Text Vectorization**

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
tfidf = TfidfVectorizer(min_df=10, ngram_range=(1, 1))
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)

**Step 4: Training the Machine Learning Model**

In [39]:
from sklearn.svm import LinearSVC

model1 = LinearSVC(random_state=42, tol=1e-5)
model1.fit(X_train_tf, y_train)

LinearSVC(random_state=42, tol=1e-05)

In [40]:
from sklearn.metrics import accuracy_score, roc_auc_score

y_pred = model1.predict(X_test_tf)
print("Accuracy score -", accuracy_score(y_test, y_pred))
print("ROC-AUC score -", roc_auc_score(y_test, y_pred))

Accuracy score - 0.8553477398618614
ROC-AUC score - 0.8557214846429678


In [44]:
sample_reviews = df.sample(5)
sample_reviews_tf = tfidf.transform(sample_reviews["text"])
sentiment_predictions = model1.predict(sample_reviews_tf)
sentiment_predictions = pd.DataFrame(
    data=sentiment_predictions,
    index=sample_reviews.index,
    columns=["sentiment_prediction"],
)
sample_reviews = pd.concat([sample_reviews, sentiment_predictions], axis=1)
print("Some sample reviews with their sentiment - ")
sample_reviews[["text_orig", "sentiment_prediction"]]

Some sample reviews with their sentiment - 


Unnamed: 0,text_orig,sentiment_prediction
22683,"I don't know about the LINT FREE thingie, this...",0
122239,squirrels too nclever,0
273909,looks good on my DRZ,1
159790,Just as I expected.,1
110018,Um... go with a MXR instead,0


In [46]:
def baseline_score(text):
    score = bing_liu_score(text)
    if score > 0:
        return 1
    return 0


y_pred_baseline = X_test.apply(baseline_score)
acc_score = accuracy_score(y_pred_baseline, y_test)
acc_score

0.7571462764138686

## Pretrained Language Models using Deep Learning

**Step 1: Loading Models and Tokenization**

In [48]:
# !pip install transformers

In [49]:
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer

config = BertConfig.from_pretrained("bert-base-uncased", finetuning_task="binary")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [52]:
def get_tokens(text, tokenizer, max_seq_length, add_special_tokens=True):
    input_ids = tokenizer.encode(
        text,
        add_special_tokens=add_special_tokens,
        max_length=max_seq_length,
        padding="max_length",
        truncation=True,
    )
    attention_mask = [int(id > 0) for id in input_ids]
    assert len(input_ids) == max_seq_length
    assert len(attention_mask) == max_seq_length
    return (input_ids, attention_mask)

In [53]:
text = "Here is the sentence I want embeddings for."
input_ids, attention_mask = get_tokens(
    text, tokenizer, max_seq_length=30, add_special_tokens=True
)
input_tokens = tokenizer.convert_ids_to_tokens(input_ids)

print(text)
print(input_tokens)
print(input_ids)
print(attention_mask)

Here is the sentence I want embeddings for.
['[CLS]', 'here', 'is', 'the', 'sentence', 'i', 'want', 'em', '##bed', '##ding', '##s', 'for', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
[101, 2182, 2003, 1996, 6251, 1045, 2215, 7861, 8270, 4667, 2015, 2005, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [54]:
X_train, X_test, y_train, y_test = train_test_split(
    df["text_orig"],
    df["sentiment"],
    test_size=0.2,
    random_state=42,
    stratify=df["sentiment"],
)
X_train_tokens = X_train.apply(get_tokens, args=(tokenizer, 50))
X_test_tokens = X_test.apply(get_tokens, args=(tokenizer, 50))



In [56]:
#!pip install torch

In [57]:
import torch
from torch.utils.data import TensorDataset

input_ids_train = torch.tensor(
    [features[0] for features in X_train_tokens.values], dtype=torch.long
)
input_mask_train = torch.tensor(
    [features[1] for features in X_train_tokens.values], dtype=torch.long
)
label_ids_train = torch.tensor(y_train.values, dtype=torch.long)

print(input_ids_train.shape)
print(input_mask_train.shape)
print(label_ids_train.shape)

torch.Size([233966, 50])
torch.Size([233966, 50])
torch.Size([233966])


In [58]:
input_ids_train[1]

tensor([  101,  1045,  2001,  2409,  2011, 29237, 27082,  8013,  2326,  2008,
         1045,  2071,  3202,  2404,  2091,  5568,  6534,  1999,  1996,  6436,
         7516,  2073,  1996, 15289, 27082,  2001, 25401,  1010,  2021,  6854,
         2025,  2028,  6534,  1997,  5568,  2412, 16216, 27512,  4383,  1012,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0])

In [59]:
train_dataset = TensorDataset(input_ids_train, input_mask_train, label_ids_train)

**Step 2: Model Training**

In [60]:
from torch.utils.data import DataLoader, RandomSampler

train_batch_size = 64
num_train_epochs = 2

train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset, sampler=train_sampler, batch_size=train_batch_size
)

t_total = len(train_dataloader) // num_train_epochs

print("Number examples =", len(train_dataset))
print("Num epochs =", num_train_epochs)
print("Total train batch size =", train_batch_size)
print("Total optimization steps =", t_total)

Number examples = 233966
Num epochs = 2
Total train batch size = 64
Total optimization steps = 1828


In [62]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

learning_rate = 1e-4
adam_epsilon = 1e-8
warmup_steps = 0

optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
)

In [63]:
from tqdm import notebook, trange

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iterator = trange(num_train_epochs, desc="Epoch")

# Put model in 'train' mode.
model.train()

for epoch in train_iterator:
    epoch_iterator = notebook.tqdm(train_dataloader, desc="Iteration")
    for step, batch in enumerate(epoch_iterator):
        # Reset all gradients at start of every iteration.
        model.zero_grad()

        # Put the model and the input observations to GPU.
        model.to(device)

        batch = tuple(t.to(device) for t in batch)

        # Identify the inputs to the model.
        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[2]}

        # Forward pass through the model. Input -> Model -> Output.
        outputs = model(**inputs)

        # Determine the deviation (loss).
        loss = outputs[0]

        print(f"\r{loss}", end="")

        # Back-propagate the loss (automatically calculates gradients).
        loss.backward()

        # Prevent exploding gradients by limiting gradients to 1.0
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters and learning rate.
        optimizer.step()
        scheduler.step()

Epoch:   0%|                                                                                                                     | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3656 [00:00<?, ?it/s]

0.6643181443214417

Epoch:   0%|                                                                                                                     | 0/2 [02:00<?, ?it/s]


KeyboardInterrupt: 

In [None]:
model.save_pretrained("outputs")

**Step 3: Model Evaluation**

In [None]:
import numpy as np
from torch.utils.data import SequentialSampler

test_batch_size = 64
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(
    test_dataset, sampler=test_sampler, batch_size=test_batch_size
)

# Load the pretrained model that was saved earlier.
model = model.from_pretrained("/outputs")


# Initialize the prediction and actual labels.
preds = None
out_label_ids = None

# Put model in "eval" mode.
model.eval()

for batch in notebook.tqdm(test_dataloader, desc="Evaluating"):
    # Put the model and the input observations to GPU.
    model.to(device)
    batch = tuple(t.to(device) for t in batch)

    # Do not track any gradients since in 'eval' mode.
    with torch.no_grad():
        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[2]}

        # Forward pass through the model.
        outputs = model(**inputs)

        # We get loss since we provided the labels.
        tmp_eval_loss, logits = outputs[:2]

        # There maybe more than one batch of items in the test dataset.
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs["labels"].detach().cpu().numpy()

        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(
                out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0
            )
            
# Get final loss, predictions and accuracy.
preds= np.argmax(preds, axis=1)
acc_score = accuracy_score(preds, out_label_ids)
print('Accuracy Score on test data', acc_score)