## This file creates a normalized version of the 780-dim feature vector

#### (the normalization steps are below, after the paraphrase preprocessing part)

In [1]:
import joblib
import numpy as np

### Preprocess the paraphrase feature vector to be 780-dim

In [2]:
# Load the precomputed feature files for the paraphrased essays (names per the
# file paths below).
# NOTE(review): joblib.load unpickles arbitrary objects — only load trusted files.
x_ease = joblib.load('essay_ease_spelling_unique_416_paraphrase')
# The S-BERT file is wrapped in np.array so it can be concatenated with the
# other feature blocks later on.
x_sbert = np.array(joblib.load('s-bert_asap7_paraphrased_mean_paragraph'))
bow_similarity = joblib.load('bow_similarity_asap7_paraphrase')
sbert_similarity = joblib.load('sbert_similarity_paraphrase')
# Language-error values are reshaped into a single (n, 1) column.
lang_error = joblib.load('errors_asap7_paraphrase')
lang_error = lang_error.reshape(-1, 1)

In [3]:
# Take the first 12 columns of x_ease; the cells below pick individual
# features out of this slice by column index.
x_12 = x_ease[:, 0:12]
x_12.shape

(1569, 12)

In [4]:
# Everything in x_ease after the first 12 columns.
# NOTE(review): this slice also contains the last two columns, which the next
# cell reads separately as spelling errors and unique words; x_bag itself is
# not used again in the cells shown here.
x_bag = x_ease[:, 12:x_ease.shape[1]]
x_bag.shape

(1569, 404)

In [5]:
# The last two columns of x_ease, each kept as an (n, 1) column so they can be
# concatenated column-wise later. Width-1 slices keep the second axis.
x_spell_error = x_ease[:, -2:-1]
x_unique_words = x_ease[:, -1:]

In [6]:
# Columns 2, 3 and 4 of x_12 are the comma, apostrophe and other-punctuation
# counts. Collapse them into one total-punctuation column; keepdims=True
# leaves the result as an (n, 1) column.
x_punc_counts = x_12[:, [2, 3, 4]].sum(axis=1, keepdims=True)

In [7]:
# X_ease_nobag: the six columns of x_12 that survive after dropping columns
# 2, 3, 4 (already summed into x_punc_counts) and columns 7, 9, 11.
dropped_columns = [2, 3, 4, 7, 9, 11]
x_6 = np.delete(x_12, dropped_columns, axis=1)
print(x_6.shape)
# Show the first three rows as a quick visual check.
x_6[:3]

(1569, 6)


array([[599.        , 116.        ,   4.16161616, 168.        ,
         24.        ,  15.        ],
       [525.        , 111.        ,   3.75257732, 160.5       ,
         24.        ,  21.        ],
       [847.        , 177.        ,   3.9068323 , 257.5       ,
         39.        ,  35.        ]])

In [8]:
# Assemble the 10 leading features column-wise, in this order:
#   0-5 : x_6
#   6   : summed punctuation count
#   7   : spelling errors
#   8   : unique words
#   9   : S-BERT similarity
leading_feature_blocks = (
    x_6,
    x_punc_counts,
    x_spell_error,
    x_unique_words,
    sbert_similarity,
)
x_10 = np.concatenate(leading_feature_blocks, axis=1)
x_10.shape

(1569, 10)

In [9]:
# Final feature matrix, concatenated column-wise:
#   0-9   : x_10
#   10    : bag-of-words similarity
#   11    : language errors
#   12-.. : S-BERT embedding
all_feature_blocks = (x_10, bow_similarity, lang_error, x_sbert)
x = np.concatenate(all_feature_blocks, axis=1)
x.shape

(1569, 780)

### save

In [10]:
# Persist the assembled (not yet normalized) feature matrix; joblib.dump
# returns the list of file names it wrote, which the notebook displays.
joblib.dump(x, 'essay_ease10_sbert768_simbow_langerr_780_paraphrase')

['essay_ease10_sbert768_simbow_langerr_780_paraphrase']

### load file to normalize

In [11]:
# Reload the matrix saved above so the normalization section can be run on its
# own, without re-running the preprocessing cells.
x = joblib.load('essay_ease10_sbert768_simbow_langerr_780_paraphrase')
x.shape

(1569, 780)

## Normalize some features

### 1. Spelling error / word count

In [12]:
# Spelling errors per word.
# Column 7 is the spelling-error column (position fixed by the concatenation
# order used to build x); column 1 is treated as the word count.
spell_error = x[:, 7]
word_count = x[:, 1]
# A plain `/` would silently put inf/nan into the feature matrix for any row
# whose word count is 0. Divide only where the denominator is non-zero and
# leave 0.0 elsewhere.
spell_error_normalized = np.divide(
    spell_error,
    word_count,
    out=np.zeros_like(spell_error, dtype=float),
    where=word_count != 0,
)
spell_error_normalized

array([0.05172414, 0.09009009, 0.05084746, ..., 0.02714932, 0.02762431,
       0.01015228])

In [13]:
# Overwrite the raw spelling-error column in place with the normalized values.
# Re-running the previous cell after this one would normalize a second time.
x[:,7] = spell_error_normalized

### 2. Language error / answer length

In [14]:
# Language errors per unit of answer length.
# Column 11 is the language-error column (position fixed by the concatenation
# order used to build x); column 0 is treated as the answer length.
lang_error = x[:, 11]
answer_length = x[:, 0]
print(lang_error)
# A plain `/` would silently put inf/nan into the feature matrix for any row
# whose answer length is 0. Divide only where the denominator is non-zero and
# leave 0.0 elsewhere.
lang_error_normalized = np.divide(
    lang_error,
    answer_length,
    out=np.zeros_like(lang_error, dtype=float),
    where=answer_length != 0,
)
print(lang_error_normalized)

[25.  9. 17. ...  8. 29.  2.]
[0.04173623 0.01714286 0.02007084 ... 0.00746269 0.0172619  0.00225225]


In [15]:
# Overwrite the raw language-error column in place with the normalized values.
# Re-running the previous cell after this one would normalize a second time.
x[:,11] = lang_error_normalized

### 3. Punctuation count / answer length (not executed in the recorded run — the saved file keeps the raw punctuation count)

In [None]:
# Punctuation count per unit of answer length.
# NOTE(review): this cell has no execution count in this export, and the
# values displayed for the saved file still show raw counts in column 6.
# Confirm that this normalization is wanted before running it.
punc_count = x[:, 6]
answer_length = x[:, 0]
print(punc_count)
# A plain `/` would silently put inf/nan into the feature matrix for any row
# whose answer length is 0. Divide only where the denominator is non-zero and
# leave 0.0 elsewhere.
punc_count_normalized = np.divide(
    punc_count,
    answer_length,
    out=np.zeros_like(punc_count, dtype=float),
    where=answer_length != 0,
)
print(punc_count_normalized)

In [None]:
# Overwrite the raw punctuation column in place with the normalized values.
# NOTE(review): this cell has no execution count in this export.
x[:,6] = punc_count_normalized

### 4. Good n-gram / answer length (not executed in the recorded run — the saved file keeps the raw good n-gram value)

In [None]:
# Good n-gram value per unit of answer length.
# NOTE(review): this cell has no execution count in this export, and the
# values displayed for the saved file still show raw values in column 3.
# Confirm that this normalization is wanted before running it.
good_ngram = x[:, 3]
answer_length = x[:, 0]
print(good_ngram)
# A plain `/` would silently put inf/nan into the feature matrix for any row
# whose answer length is 0. Divide only where the denominator is non-zero and
# leave 0.0 elsewhere.
good_ngram_normalized = np.divide(
    good_ngram,
    answer_length,
    out=np.zeros_like(good_ngram, dtype=float),
    where=answer_length != 0,
)
print(good_ngram_normalized)

In [None]:
# Overwrite the raw good n-gram column in place with the normalized values.
# NOTE(review): this cell has no execution count in this export.
x[:,3] = good_ngram_normalized

### save

In [16]:
# Persist the feature matrix with whichever normalization cells above were
# actually run; joblib.dump returns the list of file names it wrote.
joblib.dump(x, 'essay_ease10_sbert768_simbow_langerr_780_paraphrase_normalized_asap7')

['essay_ease10_sbert768_simbow_langerr_780_paraphrase_normalized_asap7']

### check

In [17]:
# `a` is another name for the in-memory matrix (an alias, not a copy); it is
# compared below against the copy read back from disk.
a = x
a.shape

(1569, 780)

In [18]:
# Read the just-saved normalized matrix back from disk for a round-trip check.
b = joblib.load('essay_ease10_sbert768_simbow_langerr_780_paraphrase_normalized_asap7')

In [19]:
# Element-wise comparison of the in-memory and reloaded matrices; the notebook
# displays the resulting boolean array.
a == b

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [20]:
# Display the container type of the feature matrix.
type(a)

numpy.ndarray

In [21]:
# Round-trip check: True only when the in-memory and reloaded matrices have
# the same shape and every element matches. np.array_equal also copes with a
# shape mismatch, where `(a == b).all()` does not give a clean True/False.
np.array_equal(a, b)

True

In [22]:
### CHECK FIRST 12 FEATURES (INTERPRETABLE)
# Load three saved matrices, in this order:
#   aa - normalized file whose name has no "paraphrase" part
#   bb - paraphrase file before normalization
#   cc - paraphrase file after normalization
aa, bb, cc = (
    joblib.load(saved_name)
    for saved_name in (
        'essay_ease10_sbert768_simbow_langerr_780_normalized_asap7',
        'essay_ease10_sbert768_simbow_langerr_780_paraphrase',
        'essay_ease10_sbert768_simbow_langerr_780_paraphrase_normalized_asap7',
    )
)

In [23]:
# First two rows, first 12 columns of `aa` (the normalized file whose name has
# no "paraphrase" part).
aa[:2,:12]

array([[4.97000000e+02, 9.90000000e+01, 4.00000000e+00, 1.42500000e+02,
        2.40000000e+01, 1.90000000e+01, 1.60000000e+01, 6.06060606e-02,
        5.30000000e+01, 5.42968094e-01, 2.51557647e-01, 4.82897384e-02],
       [4.68000000e+02, 1.05000000e+02, 3.46153846e+00, 1.51500000e+02,
        2.70000000e+01, 2.50000000e+01, 9.00000000e+00, 9.52380952e-02,
        5.20000000e+01, 2.97653049e-01, 3.05887645e-01, 2.35042735e-02]])

In [24]:
# First two rows, first 12 columns of `bb` (paraphrase file before
# normalization): columns 7 and 11 still hold raw values here.
bb[:2, :12]

array([[5.99000000e+02, 1.16000000e+02, 4.16161616e+00, 1.68000000e+02,
        2.40000000e+01, 1.50000000e+01, 1.60000000e+01, 6.00000000e+00,
        6.40000000e+01, 5.56997418e-01, 3.16058054e-01, 2.50000000e+01],
       [5.25000000e+02, 1.11000000e+02, 3.75257732e+00, 1.60500000e+02,
        2.40000000e+01, 2.10000000e+01, 1.00000000e+01, 1.00000000e+01,
        5.70000000e+01, 2.77561724e-01, 2.61345270e-01, 9.00000000e+00]])

In [25]:
# First two rows, first 12 columns of `cc` (paraphrase file after
# normalization): compare columns 7 and 11 with the `bb` display.
cc[:2, :12]

array([[5.99000000e+02, 1.16000000e+02, 4.16161616e+00, 1.68000000e+02,
        2.40000000e+01, 1.50000000e+01, 1.60000000e+01, 5.17241379e-02,
        6.40000000e+01, 5.56997418e-01, 3.16058054e-01, 4.17362270e-02],
       [5.25000000e+02, 1.11000000e+02, 3.75257732e+00, 1.60500000e+02,
        2.40000000e+01, 2.10000000e+01, 1.00000000e+01, 9.00900901e-02,
        5.70000000e+01, 2.77561724e-01, 2.61345270e-01, 1.71428571e-02]])