## This notebook builds the 780-dim paraphrase feature vector and then creates a normalized version of it

#### (see below after the paraphrase part)

In [2]:
import joblib
import numpy as np

### Preprocess paraphrase vector to be 780-dim

In [2]:
# Load the precomputed paraphrase feature files that the cells below slice
# and concatenate into a single feature matrix.
sbert_similarity = joblib.load('sbert_similarity_paraphrase')
bow_similarity = joblib.load('bow_similarity_asap4_paraphrase')

# The language-error values are turned into a column vector so they can be
# concatenated along axis 1 later.
lang_error = joblib.load('errors_asap4_paraphrase')
lang_error = lang_error.reshape(-1, 1)

# The stored S-BERT object is converted to an ndarray right after loading.
x_sbert = np.array(joblib.load('s-bert_asap4_paraphrased_mean_paragraph'))

x_ease = joblib.load('essay_ease_spelling_unique_416_paraphrase')

In [3]:
# Keep the first 12 columns of the loaded matrix; the cells below pick
# individual columns out of this slice by position.
x_12 = x_ease[:, :12]
x_12.shape

(1772, 12)

In [4]:
# Everything after the first 12 columns.
# NOTE(review): this slice also contains the last two columns, which the next
# cell reads separately as the spelling-error and unique-word columns.
# x_bag itself is not used again in this notebook.
x_bag = x_ease[:, 12:]
x_bag.shape

(1772, 404)

In [5]:
# The last two columns of x_ease are pulled out as (n, 1) column vectors so
# they can be concatenated along axis 1 with the other features.
x_spell_error = x_ease[:, -2][:, np.newaxis]
x_unique_words = x_ease[:, -1][:, np.newaxis]

In [6]:
# Columns 2, 3 and 4 hold the comma, apostrophe and other-punctuation counts.
# Collapse them into one total punctuation count per row, kept as an (n, 1)
# column vector.
x_punc_counts = x_12[:, [2, 3, 4]].sum(axis=1, keepdims=True)

In [7]:
# X_ease_nobag: keep columns 0, 1, 5, 6, 8 and 10 of the 12-column slice,
# i.e. drop columns 2, 3, 4 (summed separately as the punctuation count)
# and columns 7, 9 and 11.
x_6 = x_12[:, [0, 1, 5, 6, 8, 10]]
print(x_6.shape)
x_6[:3]

(1772, 6)


array([[337.        ,  67.        ,   4.06557377,  94.        ,
         38.        ,  27.        ],
       [225.        ,  45.        ,   3.57142857,  62.5       ,
         27.        ,  13.        ],
       [894.        , 166.        ,   4.58940397, 244.        ,
         98.        ,  51.        ]])

In [8]:
# Resulting column order: 0-5 = x_6, 6 = punctuation total,
# 7 = spelling errors, 8 = unique words, 9 = S-BERT similarity.
x_10 = np.hstack(
    [x_6, x_punc_counts, x_spell_error, x_unique_words, sbert_similarity]
)
x_10.shape

(1772, 10)

In [9]:
# Final column layout of the feature matrix:
#   0-9    x_10
#   10     bag-of-words similarity
#   11     language errors
#   12-779 S-BERT vector
feature_parts = (
    x_10,
    bow_similarity,
    lang_error,
    x_sbert,
)
x = np.concatenate(feature_parts, axis=1)
x.shape

(1772, 780)

### save

In [10]:
# Persist the assembled (not yet normalized) feature matrix; the
# normalization part below reloads this file.
joblib.dump(x, 'essay_ease10_sbert768_simbow_langerr_780_paraphrase')

['essay_ease10_sbert768_simbow_langerr_780_paraphrase']

### load file to normalize

In [12]:
# Reload the raw feature matrix from disk, so the normalization cells below
# start from the saved values rather than from whatever x currently holds.
x = joblib.load('essay_ease10_sbert768_simbow_langerr_780_paraphrase')
x.shape

(1772, 780)

## Normalize some features

### 1. Spelling error / word count

In [13]:
# Spelling-error rate: the spelling-error column (7) divided by the
# word-count column (1).
spell_error = x[:, 7]
word_count = x[:, 1]
# Guard the division: a row with a zero word count gets a rate of 0.0
# instead of the inf/NaN (plus RuntimeWarning) that a plain `/` would produce
# and that would otherwise end up in the saved feature matrix.
spell_error_normalized = np.divide(
    spell_error,
    word_count,
    out=np.zeros(spell_error.shape, dtype=float),
    where=word_count != 0,
)
spell_error_normalized

array([0.05970149, 0.22222222, 0.03614458, ..., 0.08571429, 0.11363636,
       0.03529412])

In [14]:
# Overwrite the raw spelling-error column in place with the per-word rate.
# NOTE: the previous cell reads x[:,7], so re-running that cell and this one
# without reloading x would divide the already-normalized values again.
x[:,7] = spell_error_normalized

### 2. Language error / answer length

In [15]:
# Language-error rate: the language-error column (11) divided by the
# answer-length column (0).
lang_error = x[:, 11]
answer_length = x[:, 0]
print(lang_error)
# Guard the division: a row with a zero answer length gets a rate of 0.0
# instead of inf/NaN from a division by zero.
lang_error_normalized = np.divide(
    lang_error,
    answer_length,
    out=np.zeros(lang_error.shape, dtype=float),
    where=answer_length != 0,
)
print(lang_error_normalized)

[ 4. 11.  4. ...  8.  5.  2.]
[0.01186944 0.04888889 0.00447427 ... 0.02116402 0.02314815 0.0045045 ]


In [16]:
# Overwrite the raw language-error column in place with the per-length rate.
# NOTE: not idempotent -- the previous cell reads x[:,11], so running the pair
# twice without reloading x divides by the answer length twice.
x[:,11] = lang_error_normalized

### 3. Punctuation count / answer length

In [None]:
# Punctuation rate: the summed punctuation-count column (6) divided by the
# answer-length column (0).
punc_count = x[:, 6]
answer_length = x[:, 0]
print(punc_count)
# Guard the division: a row with a zero answer length gets a rate of 0.0
# instead of inf/NaN from a division by zero.
punc_count_normalized = np.divide(
    punc_count,
    answer_length,
    out=np.zeros(punc_count.shape, dtype=float),
    where=answer_length != 0,
)
print(punc_count_normalized)

In [None]:
# Overwrite the raw punctuation count in place with the per-length rate.
# NOTE(review): this cell has no execution count, and the check output at the
# end of the notebook shows column 6 still holding raw counts (8.0, 6.0) in the
# saved paraphrase file and (7.0, 6.0) in the non-paraphrase file. Running the
# notebook top to bottom would therefore change the saved features -- confirm
# whether this normalization is meant to be applied.
x[:,6] = punc_count_normalized

### 4. Good n-gram / answer length

In [None]:
# Good n-gram rate: column 3 divided by the answer-length column (0).
good_ngram = x[:, 3]
answer_length = x[:, 0]
print(good_ngram)
# Guard the division: a row with a zero answer length gets a rate of 0.0
# instead of inf/NaN from a division by zero.
good_ngram_normalized = np.divide(
    good_ngram,
    answer_length,
    out=np.zeros(good_ngram.shape, dtype=float),
    where=answer_length != 0,
)
print(good_ngram_normalized)

In [None]:
# Overwrite column 3 in place with the per-length rate.
# NOTE(review): this cell has no execution count, and the check output at the
# end of the notebook shows column 3 still holding raw values (94.0, 62.5) in
# the saved paraphrase file and (91.5, 62.5) in the non-paraphrase file.
# Running the notebook top to bottom would therefore change the saved
# features -- confirm whether this normalization is meant to be applied.
x[:,3] = good_ngram_normalized

### save

In [17]:
# Persist the normalized feature matrix.
# NOTE(review): in the recorded run only columns 7 and 11 were normalized
# before this save (see the check output at the end of the notebook).
joblib.dump(x, 'essay_ease10_sbert768_simbow_langerr_780_paraphrase_normalized_asap4')

['essay_ease10_sbert768_simbow_langerr_780_paraphrase_normalized_asap4']

### check

In [18]:
# `a` is another name for the in-memory matrix x (no copy is made); it is
# compared with the file written above.
a = x
a.shape

(1772, 780)

In [19]:
# Read the file that was just written, to verify the round trip.
b = joblib.load('essay_ease10_sbert768_simbow_langerr_780_paraphrase_normalized_asap4')

In [20]:
# Element-wise comparison of the in-memory matrix with the reloaded one.
np.equal(a, b)

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [21]:
# Confirm the container type of the feature matrix.
type(a)

numpy.ndarray

In [22]:
# True only if every element of the in-memory matrix equals the reloaded one.
np.all(a == b)

True

In [3]:
### CHECK FIRST 12 FEATURES (INTERPRETABLE)
# aa: normalized file without "paraphrase" in its name (not written by this
#     notebook); bb: raw paraphrase matrix saved above;
# cc: normalized paraphrase matrix saved above.
aa = joblib.load('essay_ease10_sbert768_simbow_langerr_780_normalized_asap4')
bb = joblib.load('essay_ease10_sbert768_simbow_langerr_780_paraphrase')
cc = joblib.load('essay_ease10_sbert768_simbow_langerr_780_paraphrase_normalized_asap4')

In [4]:
# First two rows, first 12 columns of the non-paraphrase normalized file.
aa[:2, :12]

array([[3.15000000e+02, 6.50000000e+01, 3.84745763e+00, 9.15000000e+01,
        4.00000000e+01, 3.10000000e+01, 7.00000000e+00, 6.15384615e-02,
        4.30000000e+01, 6.92777634e-01, 5.98295838e-01, 2.53968254e-02],
       [2.19000000e+02, 4.50000000e+01, 3.40000000e+00, 6.25000000e+01,
        2.90000000e+01, 1.50000000e+01, 6.00000000e+00, 2.22222222e-01,
        2.40000000e+01, 6.89284503e-01, 6.03016216e-01, 4.56621005e-02]])

In [5]:
# Same preview for the raw paraphrase matrix (columns 7 and 11 are still counts).
bb[:2, :12]

array([[337.        ,  67.        ,   4.06557377,  94.        ,
         38.        ,  27.        ,   8.        ,   4.        ,
         45.        ,   0.73137879,   0.45167897,   4.        ],
       [225.        ,  45.        ,   3.57142857,  62.5       ,
         27.        ,  13.        ,   6.        ,  10.        ,
         25.        ,   0.68674046,   0.4659135 ,  11.        ]])

In [6]:
# Same preview for the normalized paraphrase matrix; compared with bb, only
# columns 7 and 11 differ in the recorded output.
cc[:2, :12]

array([[3.37000000e+02, 6.70000000e+01, 4.06557377e+00, 9.40000000e+01,
        3.80000000e+01, 2.70000000e+01, 8.00000000e+00, 5.97014925e-02,
        4.50000000e+01, 7.31378794e-01, 4.51678968e-01, 1.18694362e-02],
       [2.25000000e+02, 4.50000000e+01, 3.57142857e+00, 6.25000000e+01,
        2.70000000e+01, 1.30000000e+01, 6.00000000e+00, 2.22222222e-01,
        2.50000000e+01, 6.86740458e-01, 4.65913502e-01, 4.88888889e-02]])