In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

2023-11-19 19:18:00.248937: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-19 19:18:00.279313: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-19 19:18:00.279334: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-19 19:18:00.280254: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-19 19:18:00.285025: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-19 19:18:00.285338: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [2]:
# Location of the training event-log CSV that is passed to extract().
# Relative path: resolved against the notebook's working directory.
PATH_TRAIN_LOGS = "./data/external/train_logs.csv"

In [3]:
def extract(path):
    """
    Load the event log and order it by author, then by event.

    path: anything pd.read_csv accepts (file path or file-like object).

    Returns a DataFrame sorted ascending on "id" and then "event_id".
    The index labels assigned by read_csv are kept (no reset).
    """
    sort_keys = ["id", "event_id"]
    return pd.read_csv(path).sort_values(by=sort_keys, ascending=True)

def scrub_activity(X):
    """
    Collapse every detailed 'Move From ...' activity label into plain 'Move'.

    The untouched label is preserved in a new 'activity_detailed' column,
    because the detailed label carries low-level cursor location details.
    X is modified in place and the same object is returned.
    """
    # QUESTION: what's the difference between Move From, and a cut+paste?
    is_move = X['activity'].str.contains('Move From')

    # keep the detailed label before overwriting it with the big-picture one
    X['activity_detailed'] = X['activity']
    X.loc[is_move, 'activity'] = 'Move'

    return X

def scrub_text_change(X):
    """
    Clean the 'text_change' column; the raw value is kept in 'text_change_original'.

    Known problems with the raw text:

    - Some hex expressions (\\xHH) arrive written literally instead of decoded,
      e.g. the emdash and slanted quotations & ticks.

    - Some foreign characters (accent a, overring a) are not anonymized with the
      generic q. Confirmed via the Kaggle data viewer for id-event_id cases like
      0916cdad-39 or 9f328eb3-19.

    Rules applied, relying on an Input event holding a single character:
        - Input event whose decoded text has an emdash: overwrite as strictly emdash.
        - Input event with non-ASCII text and no emdash: overwrite with a single q.
        - Move / Remove / Paste / Replace events: swap each non-ASCII character for q.

    X is modified in place and the same object is returned.
    """

    X['text_change_original'] = X['text_change']

    # Re-decoding is expected to transform all \xHH literals.
    # utf-8 encode + windows-1252 decode was settled on after several iterations:
    # latin-1 did not catch every \xHH instance, and utf-16 only raised errors.
    def redecode(raw):
        return raw.encode(encoding='utf-8').decode("windows-1252")

    X['text_change'] = X['text_change_original'].apply(redecode)

    def is_plain_ascii(text):
        return text.isascii()

    non_ascii = ~ X['text_change'].apply(is_plain_ascii)
    foreign_input = non_ascii & (X['activity'] == "Input")
    # evaluated once, before either overwrite; rows rewritten to the emdash
    # still satisfy it, so the two overwrites below never overlap
    has_emdash = X['text_change'].str.contains("—")

    X.loc[foreign_input & has_emdash, 'text_change'] = "—"
    X.loc[foreign_input & ~ has_emdash, 'text_change'] = 'q'

    # block text changes are walked one character at a time,
    # replacing the foreign ones
    def anonymize_non_ascii(text):
        return "".join(ch if ch.isascii() else "q" for ch in text)

    is_block_edit = X['activity'].str.contains(
        'Move|Remove|Paste|Replace', regex=True
    )
    X['text_change'] = np.where(
        is_block_edit,
        X['text_change'].apply(anonymize_non_ascii),
        X['text_change']
    )

    return X

def concatenate_essay_from_logs(df):
    """
    Rebuild an essay by replaying one author's logged edit events in order.

    Expect df to be *one* author's log, with columns 'id', 'activity',
    'activity_detailed', 'cursor_position' and 'text_change'.
    Returns a one-row DataFrame with columns 'id' and 'essay'.

    Adapted from sources:
        https://www.kaggle.com/code/hiarsl/feature-engineering-sentence-paragraph-features,
        https://www.kaggle.com/code/kawaiicoderuwu/essay-contructor.
    """

    # Rows are filtered on the simplified label, but the detailed label is the
    # one replayed because it still carries the Move cursor intervals.
    replayed_events = df.loc[
        df.activity != 'Nonproduction',
        ['activity_detailed', 'cursor_position', 'text_change']
    ]

    def insert_ending_at(text, chunk, cursor_after):
        # The logged cursor sits *after* the inserted chunk, so step back by
        # its length to find where the chunk begins.
        begin = cursor_after - len(chunk)
        return text[:begin] + chunk + text[begin:]

    essay_text = ""
    for activity, cursor_after, change in replayed_events.values:

        if activity == 'Replace':
            # logged as "<removed text> => <added text>"
            pieces = change.split(' => ')
            added = pieces[1]
            removed = pieces[0]
            begin = cursor_after - len(added)
            # "the blue cat", replace "blue" with "red":
            # keep "the ", add "red", then resume after "blue" -> " cat"
            essay_text = (
                essay_text[:begin]
                + added
                + essay_text[begin + len(removed):]
            )

        elif activity == 'Paste':
            # "the cat", paste "blue " in between -> "the blue cat"
            essay_text = insert_ending_at(essay_text, change, cursor_after)

        elif activity == 'Remove/Cut':
            # similar to Replace with nothing added: the cursor is where the
            # removed text began
            essay_text = (
                essay_text[:cursor_after]
                + essay_text[cursor_after + len(change):]
            )

        elif "Move" in activity:
            # "Move From [a, b] To [c, d]" -> "a, b To c, d"
            intervals = (
                activity[10:]
                .replace("[", "")
                .replace("]", "")
                .split(' To ')
            )
            src = [int(bound) for bound in intervals[0].split(', ')]
            dst = [int(bound) for bound in intervals[1].split(', ')]

            # note: a move never changes the total text length
            if src[0] < dst[0]:
                # "the blue cat ran", move "blue" -> "the cat blue ran"
                essay_text = (
                    essay_text[:src[0]]            # before the impacted window
                    + essay_text[src[1]:dst[1]]    # text the block jumps over
                    + essay_text[src[0]:src[1]]    # the moved block
                    + essay_text[dst[1]:]          # after the impacted window
                )
            elif src[0] > dst[0]:
                # "the cat ran fast", move "ran" -> "ran the cat fast"
                essay_text = (
                    essay_text[:dst[0]]            # before the impacted window
                    + essay_text[src[0]:src[1]]    # the moved block
                    + essay_text[dst[0]:src[0]]    # text the block jumps over
                    + essay_text[src[1]:]          # after the impacted window
                )
            # equal start positions: nothing moves

        else:
            # any other activity is treated as plain typed input
            essay_text = insert_ending_at(essay_text, change, cursor_after)

    return pd.DataFrame({'id': df['id'].unique(), 'essay': [essay_text]})

In [4]:
# Load the raw event log, then clean the activity labels and text_change values.
X_train_logs = extract(PATH_TRAIN_LOGS)
X_train_logs = scrub_activity(X_train_logs)
X_train_logs = scrub_text_change(X_train_logs)

# Replay each author's events separately: one single-row (id, essay) frame per author.
X_train_logs = [x for _, x in X_train_logs.groupby('id')]
essays_text = pd.concat(
    [concatenate_essay_from_logs(x) for x in X_train_logs],
    axis=0,
    # every per-author frame has index 0; renumber so index labels are unique
    ignore_index=True
)
# keras TextVectorization does not recognize emdash as punctuation
essays_text['essay'] = essays_text['essay'].str.replace("—", " ", regex=False)

# Scores file: the target column is renamed to 'y' without mutating in place.
y = (
    pd.read_csv("./data/external/train_scores.csv")
    .rename(columns={'score': 'y'})
)
# Join on the author id explicitly instead of relying on the columns the two
# frames happen to share.
XY = pd.merge(essays_text, y, how='left', on='id')
# A left join leaves NaN targets for essays without a score; fail here rather
# than let NaN flow silently into the training loss.
assert XY['y'].notna().all(), "some essays have no matching row in train_scores.csv"

X, y = XY['essay'].to_numpy(), XY['y'].to_numpy()
# Hold out a third of the essays; fixed seed keeps the split reproducible.
X, X_test, y, y_test = train_test_split(X, y, test_size=0.33, random_state=777)

In [5]:
BATCH_SIZE = 32

# in tf Dataset structure, one element is one X-y pair;
# after .batch() each element holds up to BATCH_SIZE essays and their scores
XY_train = tf.data.Dataset.from_tensor_slices((X, y)).batch(BATCH_SIZE)
XY_test = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(BATCH_SIZE)

# Feature-only view of the training batches (consumed by TextVectorization.adapt).
X_train = XY_train.map(lambda essays, scores: essays)

# Feature-only view of the test batches. It gets its own name on purpose:
# rebinding X_test to a Dataset would make this cell fail when re-run, because
# from_tensor_slices above needs X_test to still be the raw array.
X_test_ds = XY_test.map(lambda essays, scores: essays)

In [6]:
# TF-IDF encoder over lower-cased, punctuation-stripped, whitespace-split essays.
text_vectorization = tf.keras.layers.TextVectorization(
    output_mode='tf_idf',
    ngrams=2,
    split='whitespace',
    standardize='lower_and_strip_punctuation',
    # with anonymized text, downscale recommended vocabulary size by magnitude
    max_tokens=20000,
)

# Fit the layer's vocabulary on the training essays only.
text_vectorization.adapt(X_train)


def _vectorize_essays(essays, scores):
    """Encode a batch of essays with the adapted layer; scores pass through unchanged."""
    return text_vectorization(essays), scores


tfidf_XY_train = XY_train.map(_vectorize_essays, num_parallel_calls=4)
tfidf_XY_test = XY_test.map(_vectorize_essays, num_parallel_calls=4)

In [7]:
# Size of the vocabulary held by the adapted layer; used later as the width
# of the model's input.
n_tokens = text_vectorization.vocabulary_size()
# bare expression: display the value as the cell output
n_tokens

397

In [30]:
# Vocabulary terms held by the adapted layer, kept for manual inspection;
# not referenced by any other cell shown in this notebook.
values = text_vectorization.get_vocabulary()

In [31]:
# Small feed-forward regressor on the TF-IDF vector: n_tokens inputs ->
# 32 ReLU units -> dropout -> 1 output.
inputs = keras.Input(shape=(n_tokens,))
x = keras.layers.Dense(32, activation="relu")(inputs)
# dropout rate of 0.5 on the hidden activations
x = keras.layers.Dropout(0.5)(x)
# single unit with no activation: the predicted score
outputs = keras.layers.Dense(1)(x)
model = keras.Model(inputs, outputs)

# Smaller Sequential variant without dropout, currently disabled:
# model = keras.Sequential([
#     keras.layers.Dense(16, activation='relu'),
#     keras.layers.Dense(1)
#     ])

# Regression objective: mean squared error, optimized with RMSprop defaults.
model.compile(
    optimizer="rmsprop",
    loss="mean_squared_error"
)
# model.summary()

In [32]:
# Stop once validation loss has gone 10 epochs without improving.
# restore_best_weights=True rolls the model back to its best epoch; without it
# the model keeps the weights from the last (worse) epoch that was run.
# NOTE(review): validation_data is the held-out test split, so early stopping
# is tuned on the same data used to report the final error - confirm intended.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

# Keep the History object so per-epoch losses can be inspected afterwards
# (this also stops the bare History repr from being the cell output).
history = model.fit(
    # cache the vectorized batches so TF-IDF is not recomputed every epoch
    tfidf_XY_train.cache(),
    validation_data=tfidf_XY_test.cache(),
    epochs=100,
    callbacks=[callback]
    )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100


<keras.src.callbacks.History at 0x7f1f5b8e3ca0>

In [33]:
# Experiment notes (validation MSE):

# ngrams=4:
    # keeping punctuation: minimum validation MSE is 0.6
    # stripping punctuation: same result

# ngrams=2:
    # stripping punctuation: again, MSE ~0.6
    # keeping punctuation: MSE worsens
