In [76]:
import pandas as pd
from transformers import BertTokenizer
import spacy
import numpy as np
from collections import Counter
from tqdm import tqdm
import pickle

In [2]:
# Read final_data.csv into a DataFrame; later cells read its 'response_text' column.
response_df = pd.read_csv('final_data.csv')

In [4]:
# Add a 'stylometry_vector' column set to None for every row.
# NOTE(review): no visible cell writes to this column afterwards (the vectors are
# collected in a separate list) -- confirm whether it is still needed.
response_df['stylometry_vector'] = None

In [7]:
# Load the spaCy 'en_core_web_lg' pipeline once; get_features_vector() calls it for
# POS tags, fine-grained tags, dependency labels, stop-word flags and sentence boundaries.
nlp = spacy.load('en_core_web_lg')

In [16]:
def get_hapax_legomena(df):
    """Return the set of words that appear exactly once in the whole corpus.

    Words are the whitespace-separated pieces of every text found in
    ``df['response_text']``; matching is case- and punctuation-sensitive
    because the pieces are used exactly as ``str.split()`` returns them.
    """
    # Tally every word of every response in a single pass.
    corpus_counts = Counter(
        piece
        for text in df['response_text']
        for piece in text.split()
    )

    # A hapax legomenon is a word whose corpus-wide frequency is one.
    return {piece for piece, occurrences in corpus_counts.items() if occurrences == 1}
# Zero-initialised counter template: one key per corpus-wide hapax legomenon.
hapax_legomena_dict = dict.fromkeys(get_hapax_legomena(response_df), 0)

In [73]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when ``denominator`` is zero."""
    return numerator / denominator if denominator else 0.0


def _moving_average_ttr(types, window_size):
    """Average the type-token ratio over every full sliding window.

    ``types`` is a list of hashable word forms (strings). A text shorter than
    one window falls back to the TTR of the whole text; an empty text gives 0.0.
    """
    if not types:
        return 0.0
    if len(types) < window_size:
        return len(set(types)) / len(types)
    last_start = len(types) - window_size  # inclusive, so the final window is kept
    ratios = [
        len(set(types[start:start + window_size])) / window_size
        for start in range(last_start + 1)
    ]
    return float(np.mean(ratios))


def get_features_vector(response, mttr_window=4):
    """Build the L2-normalised stylometric feature vector for one response.

    Layout of the returned 29-element float array (before normalisation):
      [0]     average length of the whitespace-separated words
      [1]     share of spaCy tokens flagged as stop words
      [2]     moving-average type-token ratio over ``mttr_window``-token windows
      [3:22]  raw token count per POS tag, in ``pos_list`` order
      [22]    words (non-punctuation tokens) per sentence
      [23]    1 if no token carries the 'nsubjpass' dependency, else 0
      [24]    1 if no VERB token is tagged VBD/VBN, else 0
      [25]    sentences per paragraph (paragraphs are split on '\\n\\n')
      [26]    words per paragraph
      [27]    punctuation tokens per word
      [28]    tokens starting with an upper-case letter, per word

    Args:
        response: text of a single response (``str``).
        mttr_window: number of tokens per sliding window for the type-token
            ratio. Defaults to 4, the window the notebook has always used.

    Returns:
        ``numpy.ndarray`` of 29 floats scaled to unit L2 norm. Every ratio whose
        denominator is zero (empty or punctuation-only text) is reported as 0.0
        instead of raising ``ZeroDivisionError`` or producing NaN.

    Raises:
        ValueError: if ``mttr_window`` is smaller than 1.

    Relies on the module-level spaCy pipeline ``nlp``.
    """
    if mttr_window < 1:
        raise ValueError("mttr_window must be a positive integer")

    # ---- Lexical features ----

    # Average word length over whitespace-separated words.
    whitespace_words = response.split()
    average_word_len = _safe_ratio(
        sum(len(word) for word in whitespace_words), len(whitespace_words)
    )

    # POS usage, stop-word usage, active/passive and present/past in one pass.
    pos_list = ["ADJ", "ADP", "ADV", "AUX", "CONJ", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART",
                "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X", "SPACE"]
    pos_dict = dict.fromkeys(pos_list, 0)
    doc = nlp(response)
    num_tokens = len(doc)
    num_stopwords = 0
    num_puncts = 0
    num_capitals = 0
    active = 1
    present = 1
    for token in doc:
        type_ = token.pos_
        # Tags outside pos_list are skipped so the vector keeps a fixed length
        # and an unexpected tag cannot raise KeyError.
        if type_ in pos_dict:
            pos_dict[type_] += 1
        if token.is_stop:
            num_stopwords += 1
        if token.is_punct:
            num_puncts += 1
        if token.text[:1].isupper():
            num_capitals += 1
        if token.dep_ == 'nsubjpass':
            active = 0
        if type_ == "VERB" and token.tag_ in ("VBD", "VBN"):
            present = 0
    stopword_ratio = _safe_ratio(num_stopwords, num_tokens)

    # Moving-average type-token ratio. spaCy Token objects hash by their
    # position in the Doc, so a set of Tokens never collapses repeated words;
    # the lower-cased token text is compared instead.
    MTTR = _moving_average_ttr([token.lower_ for token in doc], mttr_window)

    # Hapax-legomena counts (hapax_legomena_dict) are deliberately not part of
    # the vector; that feature is disabled.
    pos_vals = np.array([pos_dict[pos] for pos in pos_list])
    lexical_vector = np.concatenate(([average_word_len, stopword_ratio, MTTR], pos_vals))

    # ---- Syntactic features ----

    # Average sentence length, counting every non-punctuation token as a word.
    num_sents = sum(1 for _ in doc.sents)
    num_words = num_tokens - num_puncts
    avg_word_per_sent = _safe_ratio(num_words, num_sents)

    syntactic_vector = np.array([avg_word_per_sent, active, present])

    # ---- Structural features ----

    # str.split always returns at least one piece, so num_paragraphs >= 1.
    num_paragraphs = len(response.split('\n\n'))
    sentences_per_paragraph = num_sents / num_paragraphs
    words_per_paragraph = num_words / num_paragraphs

    puncts_per_word = _safe_ratio(num_puncts, num_words)
    capitals_per_word = _safe_ratio(num_capitals, num_words)

    structural_vector = np.array(
        (sentences_per_paragraph, words_per_paragraph, puncts_per_word, capitals_per_word)
    )

    final_vector = np.concatenate([lexical_vector, syntactic_vector, structural_vector])
    norm = np.linalg.norm(final_vector)
    if norm == 0:
        # Defensive: an all-zero vector cannot be scaled to unit length.
        return final_vector
    return final_vector / norm


In [74]:
# Build one stylometric feature vector per response, in row order.
# Iterating the column directly (rather than DataFrame.iterrows()) gives tqdm a
# length, so the bar shows a total and an ETA, and avoids building a Series for
# every row just to read a single field.
feature_vectors = [
    get_features_vector(response)
    for response in tqdm(response_df['response_text'], desc="Extracting features")
]

14637it [08:23, 29.09it/s] 


In [78]:
# Persist the list of feature vectors so later work can reload it without
# re-running the spaCy pipeline.
with open("feature_vectors.pkl", mode="wb") as pickle_out:
    pickle.dump(feature_vectors, pickle_out)

In [79]:
# Sanity check on the vector size: 3 lexical scalars + 19 POS counts
# + 3 syntactic + 4 structural features = 29.
print(len(feature_vectors[0]))

29
