## This file creates a normalized version of the 780-dim feature vector

#### (the normalization steps are further below, after the paraphrase preprocessing part)

In [1]:
import joblib
import numpy as np

### Preprocess paraphrase vector to be 780-dim

In [2]:
# Load the precomputed paraphrase feature blocks from joblib files in the
# working directory.
x_ease = joblib.load('essay_ease_spelling_unique_416_paraphrase')
bow_similarity = joblib.load('bow_similarity_asap5_paraphrase')
sbert_similarity = joblib.load('sbert_similarity_paraphrase')

# Wrap the stored S-BERT data in an ndarray.
x_sbert = np.array(joblib.load('s-bert_asap5_paraphrased_mean_paragraph'))

# Keep the language-error values as a single column of shape (n_samples, 1).
lang_error = joblib.load('errors_asap5_paraphrase')
lang_error = lang_error.reshape(-1, 1)

In [3]:
# First 12 columns of the EASE matrix; the remaining columns go to x_bag.
x_12 = x_ease[:, :12]
x_12.shape

(1805, 12)

In [4]:
# Every column of the EASE matrix from index 12 onward.
# Note: this slice also contains the last two columns that are extracted
# separately as x_spell_error / x_unique_words.
x_bag = x_ease[:, 12:]
x_bag.shape

(1805, 404)

In [5]:
# The last two columns of the EASE matrix, each kept as an (n_samples, 1)
# column: second-to-last -> spelling errors, last -> unique words.
x_spell_error = x_ease[:, -2:-1]
x_unique_words = x_ease[:, -1:]

In [6]:
# Columns 2, 3 and 4 hold the comma, apostrophe and other-punctuation counts.
# Collapse them into one total-punctuation column of shape (n_samples, 1).
x_punc_counts = x_12[:, 2:5].sum(axis=1, keepdims=True)

In [7]:
# X_ease_nobag: keep columns 0, 1, 5, 6, 8 and 10 of x_12, i.e. drop the three
# punctuation columns (2, 3, 4) together with columns 7, 9 and 11.
x_6 = x_12[:, [0, 1, 5, 6, 8, 10]]
print(x_6.shape)
x_6[:3]

(1805, 6)


array([[ 830.        ,  168.        ,    4.12056738,  244.5       ,
         108.        ,   43.        ],
       [1062.        ,  209.        ,    4.17368421,  306.5       ,
         118.        ,   54.        ],
       [ 722.        ,  131.        ,    4.71929825,  189.5       ,
          78.        ,   37.        ]])

In [8]:
# Assemble the 10 hand-crafted columns. Resulting column layout:
#   0-5  x_6
#   6    total punctuation count
#   7    spelling errors
#   8    unique words
#   9    S-BERT similarity
x_10 = np.concatenate(
    (
        x_6,
        x_punc_counts,
        x_spell_error,
        x_unique_words,
        sbert_similarity,
    ),
    axis=1,
)
x_10.shape

(1805, 10)

In [9]:
# Build the full feature matrix. Column layout, given the 780-column result
# and lang_error being read back from index 11 further below:
#   0-9    x_10
#   10     bag-of-words similarity
#   11     language errors
#   12-    S-BERT columns
x = np.concatenate(
    (
        x_10,
        bow_similarity,
        lang_error,
        x_sbert,
    ),
    axis=1,
)
x.shape

(1805, 780)

### save

In [10]:
# Persist the unnormalized 780-column matrix; joblib.dump returns the list of
# written file names, which is what the cell displays.
joblib.dump(x, 'essay_ease10_sbert768_simbow_langerr_780_paraphrase')

['essay_ease10_sbert768_simbow_langerr_780_paraphrase']

### load file to normalize

In [11]:
# Reload the unnormalized matrix from disk. The normalization cells below
# overwrite columns of x in place, so re-run this cell first to start from
# raw values again.
x = joblib.load('essay_ease10_sbert768_simbow_langerr_780_paraphrase')
x.shape

(1805, 780)

## Normalize some features

### 1. Spelling error / word count

In [12]:
# Normalize the spelling-error count (column 7) by the word count (column 1).
# NOTE: the following cell writes the result back into x[:, 7] in place, so
# re-running this pair of cells without reloading x divides a second time.
spell_error = x[:,7]
word_count = x[:,1]
# Guarded division: rows whose word count is 0 get 0.0 instead of the inf/NaN
# that plain `/` would produce and that would end up in the saved features.
# `out` is a fresh array, so the view into x is not touched here.
spell_error_normalized = np.divide(
    spell_error,
    word_count,
    out=np.zeros_like(spell_error, dtype=float),
    where=word_count != 0,
)
spell_error_normalized

array([0.04761905, 0.03349282, 0.06870229, ..., 0.06074766, 0.05095541,
       0.04716981])

In [13]:
# Overwrite the raw spelling-error column in place with its normalized values.
x[:,7] = spell_error_normalized

### 2. Language error / answer length

In [14]:
# Normalize the language-error count (column 11) by the answer length
# (column 0). Note that this rebinds `lang_error` to a 1-D view of x.
lang_error = x[:,11]
answer_length = x[:,0]
print(lang_error)
# Guarded division: rows whose answer length is 0 get 0.0 instead of the
# inf/NaN that plain `/` would produce and that would be saved to disk.
lang_error_normalized = np.divide(
    lang_error,
    answer_length,
    out=np.zeros_like(lang_error, dtype=float),
    where=answer_length != 0,
)
print(lang_error_normalized)

[ 7.  8. 13. ... 20. 12.  6.]
[0.00843373 0.00753296 0.01800554 ... 0.01784121 0.01515152 0.01054482]


In [15]:
# Overwrite the raw language-error column in place with its normalized values.
x[:,11] = lang_error_normalized

### 3. Punctuation count / answer length

In [None]:
# Normalize the total punctuation count (column 6) by the answer length
# (column 0).
# NOTE(review): this cell has no execution count, and the saved arrays printed
# at the end of the notebook still hold raw values in column 6 (e.g. 30, 23).
# Running it changes the saved features -- confirm this is intended first.
punc_count = x[:,6]
answer_length = x[:,0]
print(punc_count)
# Guarded division: rows whose answer length is 0 get 0.0 instead of inf/NaN.
punc_count_normalized = np.divide(
    punc_count,
    answer_length,
    out=np.zeros_like(punc_count, dtype=float),
    where=answer_length != 0,
)
print(punc_count_normalized)

In [None]:
# Overwrite the raw punctuation-count column in place.
# NOTE(review): this cell has no execution count and the saved arrays shown at
# the end of the notebook keep raw values in column 6 -- confirm before running.
x[:,6] = punc_count_normalized

### 4. Good n-gram / answer length

In [None]:
# Normalize the good n-gram value (column 3) by the answer length (column 0).
# NOTE(review): this cell has no execution count, and the saved arrays printed
# at the end of the notebook still hold raw values in column 3 (e.g. 244.5).
# Running it changes the saved features -- confirm this is intended first.
good_ngram = x[:,3]
answer_length = x[:,0]
print(good_ngram)
# Guarded division: rows whose answer length is 0 get 0.0 instead of inf/NaN.
good_ngram_normalized = np.divide(
    good_ngram,
    answer_length,
    out=np.zeros_like(good_ngram, dtype=float),
    where=answer_length != 0,
)
print(good_ngram_normalized)

In [None]:
# Overwrite the raw good n-gram column in place.
# NOTE(review): this cell has no execution count and the saved arrays shown at
# the end of the notebook keep raw values in column 3 -- confirm before running.
x[:,3] = good_ngram_normalized

### save

In [16]:
# Persist the matrix with whichever normalization cells above were executed;
# joblib.dump returns the list of written file names.
joblib.dump(x, 'essay_ease10_sbert768_simbow_langerr_780_paraphrase_normalized_asap5')

['essay_ease10_sbert768_simbow_langerr_780_paraphrase_normalized_asap5']

### check

In [17]:
# `a` is another name for the in-memory matrix x (no copy is made).
a = x
a.shape

(1805, 780)

In [18]:
# Read the file that was just written, to compare it with the in-memory array.
b = joblib.load('essay_ease10_sbert768_simbow_langerr_780_paraphrase_normalized_asap5')

In [19]:
# Element-wise comparison of the in-memory array with the reloaded one.
a == b

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [20]:
# Confirm the container type of the feature matrix.
type(a)

numpy.ndarray

In [21]:
# True only if every element survived the save/load round trip unchanged.
# Note: NaN != NaN, so any NaN in the matrix would make this report False.
(a==b).all()

True

In [22]:
### CHECK FIRST 12 FEATURES (INTERPRETABLE)
# aa: normalized file without "paraphrase" in its name
# bb: paraphrase features before normalization
# cc: paraphrase features after normalization
aa, bb, cc = (
    joblib.load(path)
    for path in (
        'essay_ease10_sbert768_simbow_langerr_780_normalized_asap5',
        'essay_ease10_sbert768_simbow_langerr_780_paraphrase',
        'essay_ease10_sbert768_simbow_langerr_780_paraphrase_normalized_asap5',
    )
)

In [27]:
# First two rows, first 12 columns of the normalized non-paraphrase file.
aa[:2,:12]

array([[7.61000000e+02, 1.60000000e+02, 3.94029851e+00, 2.32500000e+02,
        1.14000000e+02, 4.90000000e+01, 3.00000000e+01, 4.37500000e-02,
        7.50000000e+01, 8.72380018e-01, 8.17062524e-01, 9.19842313e-03],
       [9.33000000e+02, 1.89000000e+02, 3.96470588e+00, 2.77000000e+02,
        1.21000000e+02, 5.10000000e+01, 2.30000000e+01, 3.70370370e-02,
        9.90000000e+01, 8.72457147e-01, 6.84573461e-01, 6.43086817e-03]])

In [28]:
# First two rows, first 12 columns of the unnormalized paraphrase file.
bb[:2, :12]

array([[8.30000000e+02, 1.68000000e+02, 4.12056738e+00, 2.44500000e+02,
        1.08000000e+02, 4.30000000e+01, 3.00000000e+01, 8.00000000e+00,
        8.40000000e+01, 8.52537870e-01, 9.15484122e-01, 7.00000000e+00],
       [1.06200000e+03, 2.09000000e+02, 4.17368421e+00, 3.06500000e+02,
        1.18000000e+02, 5.40000000e+01, 2.30000000e+01, 7.00000000e+00,
        1.11000000e+02, 8.66617024e-01, 8.96781334e-01, 8.00000000e+00]])

In [29]:
# First two rows, first 12 columns of the normalized paraphrase file; only
# columns 7 and 11 differ from the unnormalized array shown above.
cc[:2, :12]

array([[8.30000000e+02, 1.68000000e+02, 4.12056738e+00, 2.44500000e+02,
        1.08000000e+02, 4.30000000e+01, 3.00000000e+01, 4.76190476e-02,
        8.40000000e+01, 8.52537870e-01, 9.15484122e-01, 8.43373494e-03],
       [1.06200000e+03, 2.09000000e+02, 4.17368421e+00, 3.06500000e+02,
        1.18000000e+02, 5.40000000e+01, 2.30000000e+01, 3.34928230e-02,
        1.11000000e+02, 8.66617024e-01, 8.96781334e-01, 7.53295669e-03]])