# Prepare data and libraries

In [1]:
import os
os.environ['TRANSFORMERS_CACHE'] = 'transformers_cache'

import numpy as np
import pandas as pd
from tqdm import tqdm
import json
import matplotlib.pyplot as plt
import re
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
from transformers import BertTokenizer, BertModel, BertConfig
import string
import time
import torch
from textstat import sentence_count

from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity



In [2]:
# Sentence-transformer checkpoint names; load_model(i) picks one by list position.
model_names = ['all-distilroberta-v1', 'all-MiniLM-L12-v2', 'all-mpnet-base-v2']

# Path templates: `{number}` is filled in with str.format(number=...).
# Input CSV read by load_ds.
filepath = 'data/sta/e4_content/int{number}_bloom.csv'
# Output CSV for the frame returned by information_metric.
filepath_inf = 'data/sta/e4_content/int{number}_bloom_inf.csv'
# Output CSV for the frame returned by dsi.
filepath_dsi = 'data/sta/e4_content/int{number}_bloom_dsi.csv'
# Output CSV for the frame returned by mad_dataset.
filepath_mad = 'data/sta/e4_content/int{number}_bloom_mad.csv'

In [3]:
def load_ds(number):
    """Read the comma-separated input CSV for dataset `number`.

    The path is built from the module-level `filepath` template and the
    first CSV column is used as the index.
    """
    csv_path = filepath.format(number=number)
    return pd.read_csv(csv_path, sep=',', index_col=0)


def load_model(number):
    """Instantiate the SentenceTransformer named at `model_names[number]`."""
    checkpoint = model_names[number]
    return SentenceTransformer(checkpoint)

In [4]:
def d_metric(string):
    """Return the repetition-based diversity statistic of a text.

    The text is split on whitespace into n tokens. With c_i the number of
    occurrences of each distinct token, the result is

        sum(c_i * (c_i - 1)) / (n * (n - 1))

    i.e. the probability that two tokens drawn without replacement are the
    same word (0 when every token is unique, 1 when all tokens are equal).

    Parameters
    ----------
    string : str
        Text to analyse.

    Returns
    -------
    float
        The statistic, or NaN when the text has fewer than two tokens
        (the ratio is undefined there; returning NaN explicitly avoids a
        division by zero and the NumPy RuntimeWarning it triggers).
    """
    tokens = string.split()
    n = len(tokens)
    if n < 2:
        # n * (n - 1) == 0: no pair of tokens exists.
        return np.nan
    counts = np.unique(tokens, return_counts=True)[1]
    return np.sum(counts * (counts - 1)) / (n * (n - 1))

# Word list taken from the first column of '1-1000.txt'.
# NOTE(review): sep='~' presumably keeps each line in a single column - confirm.
topwords = list(pd.read_csv('1-1000.txt', sep='~', header=None)[0])
# Hashed copy of the list so each membership test is a constant-time lookup
# instead of a scan over the whole list. Rebuild it if `topwords` is reassigned.
_topwords_lookup = frozenset(topwords)


def common_words(string):
    """Count the whitespace-separated tokens of `string` found in `topwords`.

    Matching is exact string equality, so case and attached punctuation
    matter. Repeated tokens are counted every time they occur.

    Parameters
    ----------
    string : str
        Text to analyse.

    Returns
    -------
    int
        Number of tokens that are present in the word list.
    """
    return sum(word in _topwords_lookup for word in string.split())

def sentence_analysis(ds):
    """Add length, lexical-diversity and readability columns for `ds.Question`.

    The frame is modified in place and the same object is returned.

    Columns written
    ---------------
    CharacterNumber    : length of the question string.
    WordNumber         : number of space characters + 1.
    CommonWordNumber   : common_words(question) / WordNumber (a fraction).
    UniqueWordNumber   : number of distinct whitespace-separated tokens.
    TTR, CTTR          : UniqueWordNumber / WordNumber and
                         UniqueWordNumber / sqrt(2 * WordNumber).
    DMetric            : d_metric(question).
    SyllableNumber     : vowel-group count from two regular expressions.
    SentenceNumber     : textstat's sentence_count(question).
    MeanSentenceLength : CharacterNumber / SentenceNumber (characters).
    FRES, FKGL, ARI    : Flesch Reading Ease, Flesch-Kincaid Grade Level and
                         Automated Readability Index.

    Parameters
    ----------
    ds : pandas.DataFrame
        Must contain a string column `Question`.

    Returns
    -------
    pandas.DataFrame
        `ds` itself, with the columns above added.
    """
    ds['CharacterNumber'] = ds.Question.str.len()
    ds['WordNumber'] = ds.Question.str.count(' ') + 1
    ds['CommonWordNumber'] = ds.Question.apply(common_words) / ds.WordNumber
    ds['UniqueWordNumber'] = ds.Question.str.split().apply(set).apply(len)
    ds['TTR'] = ds.UniqueWordNumber / ds.WordNumber
    ds['CTTR'] = ds.UniqueWordNumber / (ds.WordNumber*2)**0.5
    ds['DMetric'] = ds.Question.apply(d_metric)
    # NOTE(review): no MULTILINE flag is passed, so `^` / `$` anchor to the whole
    # question rather than to each word, and only lowercase vowels are matched.
    ds['SyllableNumber'] = ds.Question.str.count('(?!e$)[aeiouy]+') + ds.Question.str.count('^[^aeiouy]*e$')
    ds['SentenceNumber'] = ds.Question.apply(sentence_count)
    ds['MeanSentenceLength'] = ds.CharacterNumber/ds.SentenceNumber

    # The readability formulas are defined on words per sentence, syllables per
    # word and characters per word. The character-based MeanSentenceLength column
    # is kept as it was but is not what these formulas call for.
    words_per_sentence = ds.WordNumber / ds.SentenceNumber
    syllables_per_word = ds.SyllableNumber / ds.WordNumber
    characters_per_word = ds.CharacterNumber / ds.WordNumber

    ds['FRES'] = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word
    ds['FKGL'] = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59
    # ARI = 4.71 * chars/word + 0.5 * words/sentence - 21.43
    # (the previous 47.1 and 21.34 were digit transpositions of these constants).
    ds['ARI'] = 4.71 * characters_per_word + 0.5 * words_per_sentence - 21.43
    return ds

In [5]:
# One SentenceTransformer per entry of model_names, loaded once at module level.
# information_metric reads these three globals directly.
droberta = load_model(0)
minilm = load_model(1)
mpnet = load_model(2)

No sentence-transformers model found with name sentence-transformers/all-distilroberta-v1. Creating a new one with MEAN pooling.
No sentence-transformers model found with name sentence-transformers/all-MiniLM-L12-v2. Creating a new one with MEAN pooling.
No sentence-transformers model found with name sentence-transformers/all-mpnet-base-v2. Creating a new one with MEAN pooling.


# Prepare metrics from Answer-Based Assessment

## Information

In [6]:
def information_metric(dataset):
    """Add text statistics and question-vs-question+answer similarities.

    Steps:
      1. run sentence_analysis on `dataset`;
      2. build a `QA` column as "<Question> <Answer>";
      3. for each of the three module-level sentence encoders, embed
         `Question` and `QA` and store the row-wise cosine similarity in
         `distances_0`, `distances_1`, `distances_2`.

    Returns the frame produced by sentence_analysis with the extra columns.
    """
    dataset = sentence_analysis(dataset)
    dataset['QA'] = dataset.Question + ' ' + dataset.Answer

    encoders = [droberta, minilm, mpnet]
    for position, encoder in enumerate(encoders):
        q_vectors = encoder.encode(dataset.Question)
        qa_vectors = encoder.encode(dataset.QA)
        # Row i of the diagonal pairs question i with its own question+answer text.
        similarity_matrix = cosine_similarity(q_vectors, qa_vectors)
        dataset['distances_' + str(position)] = np.diag(similarity_matrix)

    return dataset

## DSI

In [7]:
# BERT-large (uncased) with output_hidden_states=True so every layer's activations are returned.
model = BertModel.from_pretrained("bert-large-uncased", output_hidden_states = True) # initialize BERT model instance
# Put the module in evaluation mode.
model.eval()
segmenter = PunktSentenceTokenizer() # initialize segmenter: does sentence segmentation, returns list
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased') # initialize BERT tokenizer
# Cosine similarity reduced over dim 0, i.e. between two 1-D vectors.
cos = torch.nn.CosineSimilarity(dim = 0)

# Decoded tokens that dsi excludes when collecting word vectors:
# BERT special tokens plus a few punctuation marks.
filter_list = np.array(['[CLS]', '[PAD]', '[SEP]', '.', ',', '!', '?'])

In [8]:
def _mean_pairwise_cosine_distance(vectors):
    """Return the mean of (1 - cosine similarity) over all unordered vector pairs.

    `vectors` is a list of equally sized 1-D tensors. Raises ValueError when
    fewer than two vectors are given, because no pair exists.
    """
    count = len(vectors)
    if count < 2:
        raise ValueError("need at least two vectors to form a pair")
    stacked = torch.stack(vectors)
    # Normalise each row once; the clamp mirrors the default epsilon of
    # torch.nn.CosineSimilarity and protects against zero-length vectors.
    unit = stacked / stacked.norm(dim=1, keepdim=True).clamp(min=1e-8)
    similarity = unit @ unit.T
    # Strict lower triangle = every unordered pair exactly once, no self-pairs.
    rows, cols = torch.tril_indices(count, count, offset=-1)
    return (1 - similarity[rows, cols]).mean().item()


def dsi(dataset, verbose=False):
    """Compute a DSI score for the `Answer` text of every row.

    For each row the answer is split into sentences, punctuation is removed,
    and every sentence is run through the module-level BERT `model`
    (truncated / padded to 50 tokens). For each token not listed in
    `filter_list`, the vectors at hidden_states[6] and hidden_states[7] are
    collected. The row's DSI is the mean cosine distance over all pairs of
    collected vectors. This equals the previous pair-by-pair loop up to
    floating-point rounding, but is evaluated as one matrix product.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain an `Answer` column.
    verbose : bool, default False
        If True, print the elapsed wall-clock time in seconds.

    Returns
    -------
    pandas.DataFrame
        Indexed by the index labels of `dataset`, with a `DSI` column.
        Rows whose computation failed carry no value; a warning is issued
        for each of them instead of silently skipping.
    """
    import warnings  # local import: keeps this block self-contained

    s = {}
    start_time = time.time()
    strip_punctuation = str.maketrans('', '', string.punctuation)  # loop-invariant

    for ID, row in tqdm(dataset.iterrows(), total=len(dataset)):
        try:
            text = row["Answer"]
            s[ID] = {}  # entry exists even if the computation below fails

            # Train the shared Punkt segmenter on this text, then split it.
            segmenter.train(text)
            sentences_from_segmenter = segmenter.tokenize(text)

            # Layer-6 and layer-7 vectors for every kept token of every sentence.
            features = []
            for raw_sentence in sentences_from_segmenter:
                sentence = raw_sentence.translate(strip_punctuation)
                sent_tokens = tokenizer(sentence, max_length = 50, truncation = True, padding = 'max_length', return_tensors="pt")
                sent_words = [tokenizer.decode([k]) for k in sent_tokens['input_ids'][0]]
                # Positions of tokens that are neither special tokens nor punctuation.
                sent_indices = np.where(np.isin(sent_words, filter_list, invert = True))[0]
                with torch.no_grad():
                    hids = model(**sent_tokens).hidden_states
                layer6 = hids[6]
                layer7 = hids[7]

                for j in sent_indices:
                    features.append(layer6[0, j, :])
                    features.append(layer7[0, j, :])

            s[ID]["DSI"] = _mean_pairwise_cosine_distance(features)
        except Exception as err:
            # Best effort: keep processing the remaining rows, but say which row
            # failed and why. KeyboardInterrupt is deliberately not caught, so a
            # long run can still be interrupted.
            warnings.warn("DSI could not be computed for row {!r}: {!r}".format(ID, err))
            continue

    # One row per processed ID; missing "DSI" entries become NaN.
    dsi_df = pd.DataFrame.from_dict(s, orient = "index")
    if verbose:
        elapsed_time = time.time()-start_time
        print('elapsed time: ' + str(elapsed_time))
    return dsi_df

# Compute metrics

In [22]:
# For each of the eight input files (numbers 1-8): compute the information
# metrics and the DSI scores and write each result to its own CSV.
for number in range(1, 9):
    print(number)
    ds = load_ds(number).reset_index()
    # information_metric adds its columns to `ds` itself, so the `ds` passed to
    # dsi() below already carries them; dsi only reads the Answer column.
    ds_inf = information_metric(ds)
    ds_inf.to_csv(filepath_inf.format(number=number), sep=',')
    ds_dsi = dsi(ds)
    # NOTE(review): this file is ';'-separated while the one above uses ','.
    ds_dsi.to_csv(filepath_dsi.format(number=number), sep=';')

1


14it [02:33, 10.95s/it]


2


11it [00:39,  3.62s/it]


3


8it [18:09, 136.15s/it]


4


11it [03:54, 21.29s/it]


5


5it [01:14, 14.93s/it]


6


16it [03:05, 11.62s/it]


7


16it [14:24, 54.06s/it]


8


4it [00:22,  5.72s/it]


In [25]:
# For each of the eight input files: compute the MAD frame and write it as a
# ';'-separated CSV.
# NOTE(review): `mad_dataset` is not defined in any visible cell, and the
# execution counts jump from 8 to 22 to 25. On a fresh kernel this cell would
# presumably raise NameError - restore the defining cell before re-running.
for number in range(1, 9):
    print(number)
    ds = load_ds(number).reset_index()
    ds_mad = mad_dataset(ds)
    ds_mad.to_csv(filepath_mad.format(number=number), sep=';')

1
2
3
4
5
6
7
8
