In [1]:
import pandas as pd
import numpy as np
import re
from scipy.stats import lognorm, skew, kurtosis, entropy

from src.data.extract_scrub_essay_text import extract, scrub_activity, scrub_text_change, concatenate_essay_from_logs

In [2]:
# Relative path to the training-logs CSV that is handed to `extract` below.
PATH_TRAIN_LOGS = "./data/external/train_logs.csv"

In [3]:
# Read the logs from disk, then run the two scrub passes in the same order as
# before: activity first, text_change second.
X = scrub_text_change(scrub_activity(extract(PATH_TRAIN_LOGS)))

In [4]:
# Each `id` group of log rows is passed to concatenate_essay_from_logs; the
# per-id results are stacked row-wise and given a fresh 0..n-1 index.
essays_by_id = [
    concatenate_essay_from_logs(logs_one_id)
    for _, logs_one_id in X.groupby('id')
]
essays_text = pd.concat(essays_by_id, axis=0).reset_index(drop=True)

# Paragraph features.
# A run of one or more newlines is treated as a single paragraph break, so the
# pieces obtained by splitting on such runs are the paragraphs; an essay with
# no break is one paragraph. Counting the pieces (rather than the breaks) keeps
# n_paragraphs equal to the length of n_sentences_by_paragraph, which the
# "_frac" columns below rely on. A leading/trailing newline yields an empty
# piece, which is counted as a paragraph with zero sentences.
paragraphs = essays_text['essay'].str.split(r"[\n]+", regex=True)
essays_text['n_paragraphs'] = paragraphs.str.len()
essays_text['paragraphs'] = paragraphs
# Sentences per paragraph = number of runs of '.', '?' or '!' in it. Stored as
# a numpy array (not a list) so the bin loop can compare it elementwise.
essays_text['n_sentences_by_paragraph'] = (
    essays_text['paragraphs']
    .apply(
        lambda paragraphs: np.array(
            [len(re.findall(r"[\.]+|[?]+|[!]+", p)) for p in paragraphs]
        )
    )
    )
# for bounds guidance, see overall distribution
# Bins are half-open [geq_low, lt_high); a paragraph with 50 or more sentences
# falls in no bin.
varnames_n_paragraphs_by_n_sentences_bin = []
for geq_low, lt_high in [
    (0, 2),
    (2, 3),
    (3, 4),
    (4, 5),
    (5, 6),
    (6, 7),
    (7, 10),
    (10, 20),
    (20, 50)
    ]:

    bin_var = f'n_paragraphs_with_n_sentences_geq{geq_low}_lt{lt_high}'
    varnames_n_paragraphs_by_n_sentences_bin += [bin_var, bin_var + "_frac"]

    # Number of paragraphs in the essay whose sentence count lands in the bin.
    essays_text[bin_var] = (
        essays_text['n_sentences_by_paragraph']
        .apply(lambda x: ( (x >= geq_low) & (x < lt_high) ).sum() )
        )

    # Same count as a share of the essay's paragraphs.
    essays_text[bin_var + "_frac"] = (
        essays_text[bin_var] / essays_text['n_paragraphs']
        )


# Sentence features.
# Sentences are the pieces between runs of '.', '?' or '!'. The pattern is a
# raw string so that `\.` reaches the regex engine without relying on Python's
# handling of an invalid string escape.
sentence_pieces = essays_text['essay'].str.split(r"[\.]+|[?]+|[!]+", regex=True)
# The split can leave a hanging whitespace-only last piece; keeping only the
# pieces that contain 'q' drops it.
# NOTE(review): presumably every letter of the scrubbed essay text is 'q' --
# confirm against src.data.extract_scrub_essay_text.
essays_text['sentences'] = sentence_pieces.apply(
    lambda pieces: [s for s in pieces if 'q' in s]
)
essays_text['n_sentences'] = essays_text['sentences'].apply(len)

# Whitespace-separated tokens of each sentence ...
essays_text['words_by_sentence'] = (
    essays_text['sentences']
    .apply(lambda sentences: [s.split() for s in sentences])
)
# ... and the token count of each sentence, as an array for elementwise binning.
essays_text['i_words_by_sentence'] = (
    essays_text['words_by_sentence']
    .apply(lambda sentences: np.array([len(s) for s in sentences]))
)

# Bin edges chosen from the pooled words-per-sentence distribution.
# Each bin is half-open: low <= word count < high.
sentence_word_count_bins = [
    (0, 5),
    (5, 10),
    (10, 15),
    (15, 20),
    (20, 25),
    (25, 30),
    (30, 50),
    (50, 5000),
]
varnames_n_sentences_by_word_count_bin = []
for geq_low, lt_high in sentence_word_count_bins:
    bin_var = f'n_sentences_words_geq{geq_low}_lt{lt_high}'
    frac_var = bin_var + "_frac"
    varnames_n_sentences_by_word_count_bin.extend([bin_var, frac_var])

    # How many of the essay's sentences have a word count inside the bin.
    in_bin_count = essays_text['i_words_by_sentence'].apply(
        lambda counts: ((counts >= geq_low) & (counts < lt_high)).sum()
    )
    essays_text[bin_var] = in_bin_count
    # The same count relative to the essay's number of sentences.
    essays_text[frac_var] = essays_text[bin_var] / essays_text['n_sentences']


# Tokenise the whole essay on runs of spaces. Only the space character is a
# separator here, so newlines stay inside tokens and a leading or trailing
# space produces an empty token (length 0).
essay_tokens = essays_text['essay'].str.split(" +", regex=True)
essays_text['words'] = essay_tokens
# Number of tokens per essay.
essays_text["word_count_reconstructed"] = essays_text["words"].apply(len)
# Character length of every token, as an array for elementwise binning.
essays_text["words_length"] = essays_text["words"].apply(
    lambda tokens: np.array([len(token) for token in tokens])
)

# Bin edges chosen from the pooled distribution of word lengths.
# Each bin is half-open: low <= length < high.
word_length_bins = [
    (0, 2),
    (2, 3),
    (3, 4),
    (4, 5),
    (5, 6),
    (6, 7),
    (7, 8),
    # "incomprehensible" is a reasonable, long (21-char) word
    (8, 25),
    (25, 500),
]
varnames_i_words_by_length_bin = []
for geq_low, lt_high in word_length_bins:
    bin_var = f'words_length_geq{geq_low}_lt{lt_high}'
    frac_var = bin_var + "_frac"
    varnames_i_words_by_length_bin.extend([bin_var, frac_var])

    # How many of the essay's tokens have a length inside the bin.
    in_bin_count = essays_text['words_length'].apply(
        lambda lengths: ((lengths >= geq_low) & (lengths < lt_high)).sum()
    )
    essays_text[bin_var] = in_bin_count
    # The same count relative to the essay's reconstructed word count.
    essays_text[frac_var] = (
        essays_text[bin_var] / essays_text['word_count_reconstructed']
    )


# Count runs of punctuation that delimit a "thought": . ? ! , - ; : and the
# em dash. The em dash is written as the escape \u2014 (resolved by the regex
# engine) instead of a literal character, because a literal is fragile under
# encoding round-trips: a mis-decoded dash turns the character class into a
# set of unrelated characters. The pattern is a raw string so `\.` and
# `\u2014` reach the regex engine unchanged.
essays_text['n_thought_delimiting_punctuation'] = (
    essays_text
    ['essay']
    .str
    .count(r"[\.]+|[?]+|[!]+|[,]+|[-]+|[;]+|[:]+|[\u2014]+")
    )
# Average number of words between such punctuation marks.
# NOTE(review): an essay with none of these marks divides by zero here (inf,
# or NaN when the word count is also zero) -- confirm downstream handling.
essays_text["words_per_thought_delimiting_punctuation_avg"] = (
    essays_text["word_count_reconstructed"] /
    essays_text['n_thought_delimiting_punctuation']
)

# Per-essay counts of single punctuation characters, grouped by role.
# Every pattern is a regex passed to Series.str.count; raw strings are used so
# that escapes such as `\(` and `\$` are regex escapes rather than invalid
# Python string escapes. The patterns match the same characters as before.
punctuation_patterns = {
    # ( ) [ ] * { }
    'n_parenthetical_punctuation': r"\(|\)|\[|\]|\*|{|}",
    # = > < $ % +
    'n_quant_punctuation': r"=|>|<|\$|\%|\+",
    'n_apostrophe': r"'",
    'n_quotes': r'"',
    # & @
    'n_shortening_punctuation': r"&|@",
}
# dict preserves insertion order, so the columns are created in the order above.
for count_var, pattern in punctuation_patterns.items():
    essays_text[count_var] = essays_text['essay'].str.count(pattern)

# Within-essay spread (population standard deviation, numpy's default ddof=0)
# of words per sentence and of word length.
for array_col in ('i_words_by_sentence', 'words_length'):
    essays_text[f"{array_col}_stddev"] = essays_text[array_col].apply(
        lambda values: values.std()
    )


In [None]:
# Peek at the per-paragraph sentence counts for the first few essays only,
# rather than rendering the whole column.
essays_text['n_sentences_by_paragraph'].head()

In [6]:
# Pool the word-length arrays of all essays (in row order) into one flat array
# and show how many words that is.
words_length = np.concatenate(essays_text['words_length'].tolist())
words_length.shape

(958229,)

In [7]:
# Deciles (0.0, 0.1, ..., 1.0) of the pooled word lengths.
deciles = [k / 10 for k in range(10 + 1)]
pd.Series(words_length).quantile(deciles)
# pd.Series(words_length).quantile([0.9, 0.95, 0.98, 0.99])
# Candidate word-length groupings noted from the deciles:
# <= 2
# 3
# 4
# 5
# 6
# 7
# >= 8

0.0      0.0
0.1      2.0
0.2      2.0
0.3      3.0
0.4      4.0
0.5      4.0
0.6      5.0
0.7      6.0
0.8      7.0
0.9      9.0
1.0    373.0
dtype: float64

In [8]:
# Pool the per-sentence word counts of all essays into one flat array and show
# how many sentences that is. The column holding those counts is created as
# 'i_words_by_sentence' in the feature cell, so that is the name read here.
n_words_by_sentence = np.concatenate(
    essays_text['i_words_by_sentence'].tolist()
)
n_words_by_sentence.shape

(52743,)

In [9]:
# Deciles (0.0, 0.1, ..., 1.0) of words per sentence across all essays.
deciles = [k / 10 for k in range(10 + 1)]
pd.Series(n_words_by_sentence).quantile(deciles)

0.0      1.0
0.1      7.0
0.2     10.0
0.3     12.0
0.4     14.0
0.5     16.0
0.6     19.0
0.7     22.0
0.8     25.0
0.9     31.0
1.0    303.0
dtype: float64

In [10]:
# Sanity check: per essay, the sentence word-count bin fractions should add up
# to 1 (the bins partition the observed range).
sentence_frac_cols = [
    name for name in varnames_n_sentences_by_word_count_bin if '_frac' in name
]
essays_text[sentence_frac_cols].sum(axis=1).describe()

count    2.471000e+03
mean     1.000000e+00
std      6.638074e-17
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
dtype: float64

In [11]:
# Sanity check: per essay, the word-length bin fractions should add up to 1.
# The list of bin column names is defined as `varnames_i_words_by_length_bin`
# in the feature cell, so that is the name used here.
word_length_frac_cols = [
    name for name in varnames_i_words_by_length_bin if '_frac' in name
]
essays_text[word_length_frac_cols].sum(axis=1).describe()

count    2.471000e+03
mean     1.000000e+00
std      7.670411e-17
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
dtype: float64

In [12]:
# Pool the per-paragraph sentence counts of all essays (in row order) into one
# flat array and show how many paragraphs that is.
n_sentences_by_paragraph = np.concatenate(
    essays_text['n_sentences_by_paragraph'].tolist()
)
n_sentences_by_paragraph.shape

(12090,)

In [13]:
# Deciles (0.0, 0.1, ..., 1.0) of sentences per paragraph across all essays.
deciles = [k / 10 for k in range(10 + 1)]
pd.Series(n_sentences_by_paragraph).quantile(deciles)

0.0     0.0
0.1     1.0
0.2     2.0
0.3     3.0
0.4     3.0
0.5     4.0
0.6     5.0
0.7     5.0
0.8     6.0
0.9     8.0
1.0    34.0
dtype: float64