## This file creates a normalized version of the 780-dim feature vector

#### (the normalization steps are further below, after the paraphrase preprocessing part)

In [1]:
import joblib
import numpy as np

### Preprocess paraphrase vector to be 780-dim

In [3]:
# Load the pre-computed feature pieces for the paraphrased essays.
# x_ease is the wide feature matrix that the next cells slice column-wise.
x_ease = joblib.load('essay_ease_spelling_unique_416_paraphrase')
# Wrap the loaded S-BERT data in an ndarray so it can be concatenated later.
x_sbert = np.array(joblib.load('s-bert_asap1_paraphrased_mean_paragraph'))
bow_similarity = joblib.load('bow_similarity_asap1_paraphrase')
sbert_similarity = joblib.load('sbert_similarity_paraphrase')
# Reshape the loaded error array into a single column of shape (n_rows, 1).
lang_error = joblib.load('errors_asap1_paraphrase').reshape(-1, 1)

In [4]:
# First 12 columns of x_ease; later cells pick individual columns out of this slice.
x_12 = x_ease[:, :12]
x_12.shape

(1783, 12)

In [5]:
# Everything after the first 12 columns of x_ease.
# NOTE(review): this slice also contains the last two columns, which the next
# cell extracts separately as x_spell_error and x_unique_words - confirm that
# including them here is intended.
x_bag = x_ease[:, 12:]
x_bag.shape

(1783, 404)

In [6]:
# Keep the last two columns of x_ease as (n_rows, 1) column vectors:
# second-to-last -> x_spell_error, last -> x_unique_words.
x_spell_error = x_ease[:, -2:-1]
x_unique_words = x_ease[:, -1:]

In [7]:
# Columns 2, 3, 4 of x_12 --> comma counts, apostrophe counts, other punctuation counts.
# Add them up per row; keepdims leaves the result as an (n_rows, 1) column.
x_punc_counts = x_12[:, [2, 3, 4]].sum(axis=1, keepdims=True)

In [8]:
# X_ease_nobag
# Keep columns 0, 1, 5, 6, 8, 10 of x_12, i.e. drop columns 2, 3, 4, 7, 9, 11.
x_6 = x_12[:, [0, 1, 5, 6, 8, 10]]
print(x_6.shape)
x_6[:3]

(1783, 6)


array([[2119.        ,  417.        ,    4.34166667,  615.5       ,
         133.        ,   71.        ],
       [2488.        ,  493.        ,    4.13013699,  730.        ,
         149.        ,   83.        ],
       [1694.        ,  321.        ,    4.48797251,  474.        ,
         114.        ,   68.        ]])

In [9]:
# Column layout of x_10: 0-5 = x_6, 6 = punctuation count, 7 = spelling errors,
# 8 = unique words, 9 = S-BERT similarity.
x_10 = np.hstack(
    (x_6, x_punc_counts, x_spell_error, x_unique_words, sbert_similarity)
)
x_10.shape

(1783, 10)

In [10]:
# Final matrix, column-wise: x_10, then bow_similarity, lang_error and x_sbert.
x = np.hstack((x_10, bow_similarity, lang_error, x_sbert))
x.shape

(1783, 780)

### save

In [11]:
# Persist the assembled, not yet normalized matrix; the cell output is the value returned by joblib.dump.
joblib.dump(x, 'essay_ease10_sbert768_simbow_langerr_780_paraphrase')

['essay_ease10_sbert768_simbow_langerr_780_paraphrase']

### load file to normalize

In [12]:
# Reload the matrix from disk so the normalization below starts from the saved copy.
x = joblib.load('essay_ease10_sbert768_simbow_langerr_780_paraphrase')
x.shape

(1783, 780)

## Normalize some features

### 1. Spelling error / word count

In [13]:
# Column 7 is read as the spelling-error count, column 1 as the word count.
spell_error, word_count = x[:, 7], x[:, 1]
# Element-wise ratio: spelling errors per word.
spell_error_normalized = np.true_divide(spell_error, word_count)
spell_error_normalized

array([0.05275779, 0.07099391, 0.0529595 , ..., 0.07100592, 0.14285714,
       0.08243728])

In [14]:
# Overwrite column 7 of x in place with the ratio.
# NOTE: re-running the previous cell after this one would divide the already-normalized column again.
x[:,7] = spell_error_normalized

### 2. Language error / answer length

In [15]:
# Column 11 is read as the language-error count, column 0 as the answer length.
lang_error, answer_length = x[:, 11], x[:, 0]
lang_error_normalized = np.true_divide(lang_error, answer_length)
# Show the raw counts first, then the per-length ratios.
print(lang_error)
print(lang_error_normalized)

[16. 21. 13. ...  9.  2. 17.]
[0.00755073 0.00844051 0.00767414 ... 0.00491803 0.02247191 0.01320901]


In [16]:
# Overwrite column 11 of x in place with the ratio.
# NOTE: re-running the previous cell after this one would divide the already-normalized column again.
x[:,11] = lang_error_normalized

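### 3. Punctuation count / answer length

*Note: the two cells below carry no execution count, and the check output at the end of this notebook shows column 6 of the saved normalized file still holding the raw punctuation count.*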

In [None]:
# Column 6 is read as the punctuation count, column 0 as the answer length.
punc_count, answer_length = x[:, 6], x[:, 0]
punc_count_normalized = np.true_divide(punc_count, answer_length)
# Show the raw counts first, then the per-length ratios.
print(punc_count)
print(punc_count_normalized)

In [None]:
# Overwrite column 6 of x in place with the ratio.
# NOTE: re-running the previous cell after this one would divide the already-normalized column again.
x[:,6] = punc_count_normalized

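### 4. Good n-gram / answer length

*Note: the two cells below carry no execution count, and the check output at the end of this notebook shows column 3 of the saved normalized file still holding the raw good n-gram value.*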

In [None]:
# Column 3 is read as the good n-gram value, column 0 as the answer length.
good_ngram, answer_length = x[:, 3], x[:, 0]
good_ngram_normalized = np.true_divide(good_ngram, answer_length)
# Show the raw values first, then the per-length ratios.
print(good_ngram)
print(good_ngram_normalized)

In [None]:
# Overwrite column 3 of x in place with the ratio.
# NOTE: re-running the previous cell after this one would divide the already-normalized column again.
x[:,3] = good_ngram_normalized

### save

In [22]:
# Save the matrix with its normalized columns under a separate "_normalized_asap1" file name.
joblib.dump(x, 'essay_ease10_sbert768_simbow_langerr_780_paraphrase_normalized_asap1')

['essay_ease10_sbert768_simbow_langerr_780_paraphrase_normalized_asap1']

### check

In [23]:
# a is a second name for the in-memory matrix x (plain assignment, no copy).
a = x
a.shape

(1783, 780)

In [24]:
# Read the just-saved file back so it can be compared with the in-memory matrix.
b = joblib.load('essay_ease10_sbert768_simbow_langerr_780_paraphrase_normalized_asap1')

In [25]:
# Element-wise comparison of the in-memory matrix with the reloaded one.
np.equal(a, b)

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [26]:
# Display the Python type of the in-memory matrix.
type(a)

numpy.ndarray

In [27]:
# True only if every element of a matches the corresponding element of b.
np.all(a == b)

True

In [3]:
### CHECK FIRST 12 FEATURES (INTERPRETABLE)
# aa: normalized file whose name has no "paraphrase" part
# bb: paraphrase matrix as saved before normalization
# cc: paraphrase matrix as saved after normalization
aa = joblib.load('essay_ease10_sbert768_simbow_langerr_780_normalized_asap1')
bb = joblib.load('essay_ease10_sbert768_simbow_langerr_780_paraphrase')
cc = joblib.load('essay_ease10_sbert768_simbow_langerr_780_paraphrase_normalized_asap1')

In [4]:
# First two rows, first 12 columns of aa.
aa[:2,:12]

array([[1.91200000e+03, 3.92000000e+02, 4.06628242e+00, 5.78000000e+02,
        1.55000000e+02, 8.90000000e+01, 4.30000000e+01, 3.06122449e-02,
        1.54000000e+02, 8.17658544e-01, 5.65991405e-01, 7.84518828e-03],
       [2.31300000e+03, 4.60000000e+02, 4.05463183e+00, 6.80000000e+02,
        1.57000000e+02, 8.50000000e+01, 3.90000000e+01, 4.34782609e-02,
        1.77000000e+02, 8.55801344e-01, 5.02907054e-01, 1.03761349e-02]])

In [5]:
# First two rows, first 12 columns of bb (values before normalization).
bb[:2,:12]

array([[2.11900000e+03, 4.17000000e+02, 4.34166667e+00, 6.15500000e+02,
        1.33000000e+02, 7.10000000e+01, 4.40000000e+01, 2.20000000e+01,
        1.80000000e+02, 7.89585650e-01, 4.21219857e-01, 1.60000000e+01],
       [2.48800000e+03, 4.93000000e+02, 4.13013699e+00, 7.30000000e+02,
        1.49000000e+02, 8.30000000e+01, 4.00000000e+01, 3.50000000e+01,
        1.99000000e+02, 8.23750138e-01, 2.50489638e-01, 2.10000000e+01]])

In [6]:
# First two rows, first 12 columns of cc; compare with bb to see which columns were changed by normalization.
cc[:2,:12]

array([[2.11900000e+03, 4.17000000e+02, 4.34166667e+00, 6.15500000e+02,
        1.33000000e+02, 7.10000000e+01, 4.40000000e+01, 5.27577938e-02,
        1.80000000e+02, 7.89585650e-01, 4.21219857e-01, 7.55073148e-03],
       [2.48800000e+03, 4.93000000e+02, 4.13013699e+00, 7.30000000e+02,
        1.49000000e+02, 8.30000000e+01, 4.00000000e+01, 7.09939148e-02,
        1.99000000e+02, 8.23750138e-01, 2.50489638e-01, 8.44051447e-03]])