# Homework and bake-off: Sentiment analysis

In [1]:
__author__ = "Christopher Potts"
__version__ = "CS224u, Stanford, Spring 2021"

## Set-up

See [the first notebook in this unit](sst_01_overview.ipynb#Set-up) for set-up instructions.

In [2]:
from collections import Counter
import random
import numpy as np
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
import torch
import torch.nn as nn
from torch_rnn_classifier import TorchRNNClassifier
from torch_tree_nn import TorchTreeNN
import sst
import sst_mod
from sklearn.metrics import classification_report
from iit import get_IIT_sentiment_dataset, get_IIT_sentiment_devset
from torch_bert_classifier_IIT import TorchBertClassifierIIT
from torch_deep_neural_classifier_iit import TorchDeepNeuralClassifierIIT
from torch_rnn_classifier import TorchRNNClassifier
import utils

In [3]:
SST_HOME = os.path.join('data', 'sentiment')
LEFT = 0
RIGHT = 1
BOTH = 2

## A Softmax Baseline

Sets up two softmax models: one baseline for sentiment analysis, and the other used as the causal backbone for the IIT training, combining the sentiments of two subtrees extracted from SST-3.

In [4]:
def unigrams_phi(text):
    return Counter(text.split())

Thin wrapper around `LogisticRegression` for the sake of `sst.experiment`:

In [5]:
def fit_softmax_classifier(X, y):
    mod = LogisticRegression(
        fit_intercept=True,
        solver='liblinear',
        multi_class='ovr')
    mod.fit(X, y)
    return mod

The experimental run with some notes:

In [6]:
softmax_experiment = sst.experiment(
    sst.train_reader(SST_HOME),   # Train on any data you like except SST-3 test!
    unigrams_phi,                 # Free to write your own!
    fit_softmax_classifier,       # Free to write your own!
    assess_dataframes=[sst.dev_reader(SST_HOME), sst.bakeoff_dev_reader(SST_HOME)]) # Free to change this during development!

Assessment dataset 1
              precision    recall  f1-score   support

    negative      0.628     0.689     0.657       428
     neutral      0.343     0.153     0.211       229
    positive      0.629     0.750     0.684       444

    accuracy                          0.602      1101
   macro avg      0.533     0.531     0.518      1101
weighted avg      0.569     0.602     0.575      1101

Assessment dataset 2
              precision    recall  f1-score   support

    negative      0.272     0.692     0.391       565
     neutral      0.429     0.113     0.179      1019
    positive      0.409     0.346     0.375       777

    accuracy                          0.328      2361
   macro avg      0.370     0.384     0.315      2361
weighted avg      0.385     0.328     0.294      2361

Mean of macro-F1 scores: 0.416


In [7]:
def one_hot(label):
    sents = ['positive', 'neutral', 'negative']
    return np.eye(len(sents))[sents.index(label)]

def build_dataset_subtrees(dataframes, phi, vectorizer=None, vectorize=True):
    if isinstance(dataframes, (list, tuple)):
        df = pd.concat(dataframes)
    else:
        df = dataframes

    raw_examples = list(df.sentence.values)

    # feat_dicts = list(df.left_label.apply(phi).values)
    left_labels = df.left_label.values
    right_labels = df.right_label.values

    feat_dicts = [np.concatenate((one_hot(left_labels[i]), one_hot(right_labels[i]))) for i in range(len(left_labels))]

    if 'sentence_label' in df.columns:
        labels = list(df.sentence_label.values)
    else:
        labels = None

    feat_matrix = None
    if vectorize:
        # In training, we want a new vectorizer:
        if vectorizer is None:
            vectorizer = DictVectorizer(sparse=False)
            feat_matrix = vectorizer.fit_transform(feat_dicts)
        # In assessment, we featurize using the existing vectorizer:
        else:
            feat_matrix = vectorizer.transform(feat_dicts)
    else:
        feat_matrix = feat_dicts

    return {'X': feat_matrix,
            'y': labels,
            'vectorizer': vectorizer,
            'raw_examples': raw_examples}

In [8]:
sentiment_iit_train_df = pd.read_csv('sst_tree_train.csv')
print('Train size:', sentiment_iit_train_df.shape[0])
sentiment_iit_dev_df = pd.read_csv('sst_tree_dev.csv')
print('Dev size:', sentiment_iit_dev_df.shape[0])

# can potentially train softmax on both train and dev datasets, but we'll only focus on train?
softmax_tree_experiment = sst_mod.experiment(
    sentiment_iit_train_df,
    unigrams_phi,
    fit_softmax_classifier,
    assess_dataframes=[sentiment_iit_dev_df],
    vectorize=False,
    build_dataset_fn=build_dataset_subtrees)

Train size: 8544
Dev size: 1101
              precision    recall  f1-score   support

    negative      1.000     1.000     1.000       428
     neutral      1.000     1.000     1.000       229
    positive      1.000     1.000     1.000       444

    accuracy                          1.000      1101
   macro avg      1.000     1.000     1.000      1101
weighted avg      1.000     1.000     1.000      1101



## Deep Neural Classifier IIT Training

Runs an example of IIT training for sentiment analysis using the `DeepNeuralClassifier` IIT model.

In [9]:
softmax_root_model = softmax_tree_experiment['model']

train_size = 80
train_dataset = sentiment_iit_train_df.sample(train_size, replace=False)
# X_base_train, X_sources_train, y_base_train, y_IIT_train, interventions_train, vectorizer = \
#     get_IIT_sentiment_dataset(sentiment_iit_train_df.sample(train_size, replace=False), softmax_root_model, LEFT, unigrams_phi)

# train on both left and right subtree nodes
left_train = get_IIT_sentiment_dataset(train_dataset, softmax_root_model, LEFT, unigrams_phi)
vectorizer = left_train[5]
right_train = get_IIT_sentiment_dataset(train_dataset, softmax_root_model, RIGHT, unigrams_phi, vectorizer=vectorizer)

X_base_train = torch.cat([left_train[0], right_train[0]], dim=0)
X_sources_train = [torch.cat([left_train[1][i], right_train[1][i]], dim=0) for i in range(len(left_train[1]))] 
y_base_train = torch.cat([left_train[2], right_train[2]])
y_IIT_train = torch.cat([left_train[3], right_train[3]])
interventions_train = torch.cat([left_train[4], right_train[4]])

print(X_base_train.shape)

{('positive', 'positive'): 2361, ('positive', 'neutral'): 861, ('positive', 'negative'): 378, ('neutral', 'positive'): 105, ('neutral', 'neutral'): 205, ('neutral', 'negative'): 330, ('negative', 'positive'): 399, ('negative', 'neutral'): 779, ('negative', 'negative'): 982}
{('positive', 'positive'): 2025, ('positive', 'negative'): 630, ('positive', 'neutral'): 945, ('neutral', 'neutral'): 180, ('neutral', 'positive'): 192, ('neutral', 'negative'): 268, ('negative', 'neutral'): 720, ('negative', 'positive'): 648, ('negative', 'negative'): 792}
torch.Size([12800, 723])


In [10]:
embedding_dim = X_base_train.shape[1]
# similar to our alignment in the IIT accuracy section?
# aligning V1 to left side of layer 1, and V2 to the right side
# we are defining both as a list with two values -- why not encode it as a single range from 0  to dim * 2?
id_to_coords = {LEFT:{1: [{"layer":1, "start":0, "end":embedding_dim}]}, \
    RIGHT: {1: [{"layer":1, "start":embedding_dim, "end":embedding_dim*2}]}, \
    BOTH: {1: [{"layer":1, "start":0, "end":embedding_dim},{"layer":1, "start":embedding_dim, "end":embedding_dim*2}]}}

# gives back an IIT dataset based off of the Premack dataset, coming up with 
# all possible permutations of same/different shape pairs and same/different base-source pairs?
# X_base_train, X_sources_train, y_base_train, y_IIT_train, interventions = get_IIT_equality_dataset("V1", embedding_dim ,data_size)

# this is a different model from the one we defined in the previous cell, but with a similar idea?
model = TorchDeepNeuralClassifierIIT(hidden_dim=embedding_dim*4, hidden_activation=torch.nn.ReLU(), num_layers=3, id_to_coords=id_to_coords)
# model.fit() function internally calls on model.create_dataset(), which creates dataset in a way that pairs off
# source and base inputs?
_ = model.fit(X_base_train, X_sources_train, y_base_train, y_IIT_train, interventions_train)

# this is a runtime error I've also encountered in antra (with no change to the original code)
# could this be due to mismatching pytorch versions??

Stopping after epoch 198. Training loss did not improve more than tol=1e-05. Final error is 1.092532455921173.

In [11]:
# save trained IIT model
# model.to_pickle('deep_neural_classifier_iit.pickle') # pickle seems to throw error for IIT
torch.save(model.model.state_dict(), 'deep_neural_classifier_iit.pt')

In [12]:
# # Load IIT model from saved state dictionary
# # unpickled = TorchBertClassifierIIT.from_pickle('deep_neural_classifier_iit.pickle')
# # not the prettiest, but must construct model by first calling on fit()
# model = TorchDeepNeuralClassifierIIT(hidden_dim=embedding_dim*4, hidden_activation=torch.nn.ReLU(), num_layers=3, id_to_coords=id_to_coords)
# _ = model.fit(X_base_train, X_sources_train, y_base_train, y_IIT_train, interventions_train)

# model.model.load_state_dict(torch.load('deep_neural_classifier_iit.pt'))

In [13]:
# tests the IIT model on interchange intervention accuracy using the SST-3 dev set
dev_iit_size = 40

dev_set_iit = sentiment_iit_dev_df.sample(dev_iit_size, replace=False)

# test for intervention on left subtree
X_base_dev, X_sources_dev, y_base_dev, y_IIT_dev, interventions_dev, v = \
    get_IIT_sentiment_dataset(dev_set_iit, softmax_root_model, LEFT, unigrams_phi, vectorizer)
IIT_preds, base_preds = model.model(model.prep_input(X_base_dev, X_sources_dev, interventions_dev))
IIT_preds = np.array(IIT_preds.argmax(axis=1).cpu())
base_preds = np.array(base_preds.argmax(axis=1).cpu())
print("Accuracy of base model")
print(classification_report(y_base_dev, base_preds))

{('negative', 'negative'): 491, ('negative', 'positive'): 13, ('negative', 'neutral'): 16, ('positive', 'positive'): 533, ('neutral', 'negative'): 143, ('neutral', 'positive'): 169, ('neutral', 'neutral'): 208, ('positive', 'negative'): 11, ('positive', 'neutral'): 16}
Accuracy of base model
              precision    recall  f1-score   support

           0       0.42      0.36      0.38       560
           1       0.43      0.46      0.44       520
           2       0.43      0.46      0.44       520

    accuracy                           0.42      1600
   macro avg       0.42      0.43      0.42      1600
weighted avg       0.42      0.42      0.42      1600



In [14]:
# split across cells for readability of output
print("Interchange intervention accuracy on left")
print(classification_report(y_IIT_dev, IIT_preds))

# test for intervention on right subtree
X_base_dev, X_sources_dev, y_base_dev, y_IIT_dev, interventions_dev, v = \
    get_IIT_sentiment_dataset(dev_set_iit, softmax_root_model, RIGHT, unigrams_phi, vectorizer)
IIT_preds, base_preds = model.model(model.prep_input(X_base_dev, X_sources_dev, interventions_dev))
IIT_preds = np.array(IIT_preds.argmax(axis=1).cpu())
print("--------------------------------------------------------")
print("Interchange intervention accuracy on right")
print(classification_report(y_IIT_dev, IIT_preds))

Interchange intervention accuracy on left
              precision    recall  f1-score   support

           0       0.39      0.23      0.29       715
           1       0.17      0.40      0.24       240
           2       0.39      0.38      0.39       645

    accuracy                           0.32      1600
   macro avg       0.32      0.34      0.31      1600
weighted avg       0.36      0.32      0.32      1600

{('negative', 'negative'): 321, ('negative', 'positive'): 169, ('positive', 'negative'): 168, ('positive', 'positive'): 377, ('positive', 'neutral'): 15, ('negative', 'neutral'): 30, ('neutral', 'negative'): 156, ('neutral', 'neutral'): 195, ('neutral', 'positive'): 169}
--------------------------------------------------------
Interchange intervention accuracy on right
              precision    recall  f1-score   support

           0       0.36      0.20      0.26       715
           1       0.15      0.37      0.22       240
           2       0.37      0.37      0.3

In [15]:
# tests the IIT model on the regular sentiment classification task
dev_size = 100
dev_dataset = sst.bakeoff_dev_reader(SST_HOME) # .sample(dev_size, replace=False)

X_base_dev, X_sources_dev, y_base_dev, y_IIT_dev, interventions_dev = get_IIT_sentiment_devset(
    dev_dataset, LEFT, unigrams_phi, vectorizer)

IIT_preds, base_preds = model.model(model.prep_input(X_base_dev, X_sources_dev, interventions_dev))
base_preds = np.array(base_preds.argmax(axis=1).cpu())
print(classification_report(y_base_dev, base_preds))

              precision    recall  f1-score   support

           0       0.38      0.56      0.45       777
           1       0.49      0.39      0.44      1019
           2       0.24      0.16      0.20       565

    accuracy                           0.40      2361
   macro avg       0.37      0.37      0.36      2361
weighted avg       0.39      0.40      0.38      2361



## BERT IIT Training

Builds off of the Deep Neural Classifier IIT Training example, and trains an IIT model based on finetuning BERT for sentiment analysis.

In [16]:
def bert_fine_tune_phi(text):
    return text

In [28]:
dim = 768 # taken from finetuning.ipynb file
half_dim = dim // 2

layer = 3 # tryyy to go one from the top?
id_to_coords = {LEFT:{layer: [{"layer":layer, "start":0, "end":half_dim}]}, \
    RIGHT: {layer: [{"layer":layer, "start":half_dim, "end":dim}]}, \
    BOTH: {layer: [{"layer":layer, "start":layer, "end":half_dim}, {"layer":layer, "start":half_dim, "end":dim}]}}
    
bert_model = TorchBertClassifierIIT(id_to_coords,
                                    n_iter_no_change=5,
                                    # max_iter=2,
                                    batch_size=8,
                                    eta=0.0001)
# bert_model = TorchBertClassifierIIT(id_to_coords)

train_size = 12 # this is very small, but maybe not toooo small?
X_base, X_sources, y_base, y_IIT, interventions, vectorizer = \
    get_IIT_sentiment_dataset(sentiment_iit_train_df.sample(train_size), softmax_root_model, LEFT, 
                              bert_fine_tune_phi, vectorize=False)

_ = bert_model.fit(X_base, X_sources, y_base, y_IIT, interventions)

{('neutral', 'neutral'): 27, ('neutral', 'negative'): 6, ('neutral', 'positive'): 3, ('negative', 'negative'): 52, ('negative', 'neutral'): 18, ('negative', 'positive'): 2, ('positive', 'positive'): 36}


Stopping after epoch 196. Training loss did not improve more than tol=1e-05. Final error is 0.0006829852700320771.

In [29]:
# save trained IIT model
# model.to_pickle('deep_neural_classifier_iit.pickle') # pickle seems to throw error for IIT
torch.save(bert_model.model.state_dict(), 'bert_classifier_iit.pt')

In [30]:
# tests the IIT model on interchange intervention accuracy using the SST-3 dev set
dev_iit_size = 10
dev_set_iit = sentiment_iit_dev_df.sample(dev_iit_size, replace=False)

# test for intervention on left subtree
X_base_dev, X_sources_dev, y_base_dev, y_IIT_dev, interventions_dev, v = \
    get_IIT_sentiment_dataset(dev_set_iit, softmax_root_model, LEFT, bert_fine_tune_phi, vectorize=False)
IIT_preds, base_preds = bert_model.model(bert_model.prep_input(X_base_dev, X_sources_dev, interventions_dev))
IIT_preds = np.array(IIT_preds.argmax(axis=1).cpu())
base_preds = np.array(base_preds.argmax(axis=1).cpu())
print("Accuracy of base model")
print(classification_report(y_base_dev, base_preds))

{('neutral', 'neutral'): 16, ('neutral', 'negative'): 12, ('neutral', 'positive'): 12, ('negative', 'negative'): 30, ('positive', 'positive'): 30}
Accuracy of base model
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        30
           1       0.42      0.25      0.31        40
           2       0.25      0.60      0.35        30

    accuracy                           0.28       100
   macro avg       0.22      0.28      0.22       100
weighted avg       0.24      0.28      0.23       100



In [31]:
# split across cells for readability of output
print("Interchange intervention accuracy on left")
print(classification_report(y_IIT_dev, IIT_preds))

# test for intervention on right subtree
X_base_dev, X_sources_dev, y_base_dev, y_IIT_dev, interventions_dev, v = \
    get_IIT_sentiment_dataset(dev_set_iit, softmax_root_model, RIGHT, bert_fine_tune_phi, vectorize=False)
IIT_preds, base_preds = bert_model.model(bert_model.prep_input(X_base_dev, X_sources_dev, interventions_dev))
IIT_preds = np.array(IIT_preds.argmax(axis=1).cpu())
print("--------------------------------------------------------")
print("Interchange intervention accuracy on right")
print(classification_report(y_IIT_dev, IIT_preds))

Interchange intervention accuracy on left
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        42
           1       0.12      0.25      0.17        16
           2       0.41      0.67      0.51        42

    accuracy                           0.32       100
   macro avg       0.18      0.31      0.23       100
weighted avg       0.19      0.32      0.24       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{('neutral', 'neutral'): 16, ('neutral', 'negative'): 12, ('neutral', 'positive'): 12, ('negative', 'negative'): 21, ('negative', 'positive'): 9, ('positive', 'positive'): 21, ('positive', 'negative'): 9}
--------------------------------------------------------
Interchange intervention accuracy on right
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        42
           1       0.08      0.12      0.10        16
           2       0.38      0.67      0.49        42

    accuracy                           0.30       100
   macro avg       0.15      0.26      0.19       100
weighted avg       0.17      0.30      0.22       100



In [32]:
# evaluate IIT model on bakeoff dev set, measuring regular accuracy
dev_size = 100

X_base_dev, X_sources_dev, y_base_dev, y_IIT_dev, interventions_dev  = \
    get_IIT_sentiment_devset(sst.bakeoff_dev_reader(SST_HOME).sample(dev_size, replace=False), LEFT, bert_fine_tune_phi, None, False)

y_predict, y_IIT_predict = bert_model.model(bert_model.prep_input(X_base_dev, X_sources_dev, interventions_dev))
y_predict = np.array(y_predict.argmax(axis=1).cpu())
print(classification_report(y_base_dev, y_predict))

              precision    recall  f1-score   support

           0       0.75      0.07      0.13        42
           1       0.40      0.69      0.51        36
           2       0.27      0.41      0.33        22

    accuracy                           0.37       100
   macro avg       0.47      0.39      0.32       100
weighted avg       0.52      0.37      0.31       100



## Sandbox

Just a scratch workspace for BERT IIT training and paramater seach (one day)

In [33]:
# # test BERT model on small example

# base = ['This is just a single test']
# sources = [base]
# coord_ids = [0] * len(base)

# LABELS = ['positive', 'neutral', 'negative']
# _, y_ = bert_model.model(bert_model.prep_input(base, sources, coord_ids))
# y_ = np.array(y_.argmax(axis=1).cpu())
# y_

In [34]:
# def fit_iit_bert_classifier_with_hyperparameter_search(X, y):
#     basemod = TorchBertClassifierIIT(
#         weights_name='bert-base-cased',
#         batch_size=8,  # Small batches to avoid memory overload.
#         max_iter=1,  # We'll search based on 1 iteration for efficiency.
#         n_iter_no_change=5,   # Early-stopping params are for the
#         early_stopping=True)  # final evaluation.

#     param_grid = {
#         'gradient_accumulation_steps': [1, 4, 8],
#         'eta': [0.00005, 0.0001, 0.001]}

#     bestmod = utils.fit_classifier_with_hyperparameter_search(
#         X, y, basemod, cv=3, param_grid=param_grid)

#     return bestmod

In [35]:
# %%time
# bert_classifier_xval = sst.experiment(
#     sst.train_reader(SST_HOME),
#     bert_fine_tune_phi,
#     fit_iit_bert_classifier_with_hyperparameter_search,
#     assess_dataframes=sst.dev_reader(SST_HOME),
#     vectorize=False)  # Pass in the BERT hidden state directly!

In [36]:
# optimized_bert_classifier = bert_classifier_xval['model']
# del bert_classifier_xval

In [37]:
# def fit_optimized_hf_bert_classifier(X, y):
#     optimized_bert_classifier.max_iter = 1000
#     optimized_bert_classifier.fit(X, y)
#     return optimized_bert_classifier

In [38]:
# %%time
# _ = sst.experiment(
#     sst.train_reader(SST_HOME),
#     bert_fine_tune_phi,
#     fit_optimized_hf_bert_classifier,
#     assess_dataframes=test_df,
#     vectorize=False)  # Pass in the BERT hidden state directly!