## This file creates a normalized version of the 780-dim feature vector

#### (the normalization steps come below, after the paraphrase preprocessing part)

In [1]:
import joblib
import numpy as np

### Preprocess paraphrase vector to be 780-dim

In [81]:
# Load the pre-computed paraphrase feature artifacts from disk.
x_ease = joblib.load('essay_ease_spelling_unique_416_paraphrase')

# The stored S-BERT object is converted to an ndarray right at load time.
x_sbert = np.array(joblib.load('s-bert_asap6_paraphrased_mean_paragraph'))

bow_similarity = joblib.load('bow_similarity_asap6_paraphrase')

# Reshape the language-error values into a single column so they can be
# concatenated column-wise with the other feature blocks later on.
lang_error = joblib.load('errors_asap6_paraphrase')
lang_error = lang_error.reshape(-1, 1)

sbert_similarity = joblib.load('sbert_similarity_paraphrase')

In [82]:
# Keep the first 12 columns of x_ease; later cells pick the count features
# and the punctuation columns out of this block.
x_12 = x_ease[:, :12]
x_12.shape

(1800, 12)

In [83]:
# Everything from column 12 to the end of x_ease.
# NOTE(review): this slice runs to the last column, so it also contains the two
# trailing columns that the next cell reads as x_spell_error / x_unique_words.
# If x_bag is meant to hold only the bag-of-words part, `12:-2` may be intended — confirm.
x_bag = x_ease[:, 12:]
x_bag.shape

(1800, 404)

In [84]:
# Pull the last two columns of x_ease out as separate (n, 1) column vectors
# (named for spelling errors and unique words). A one-wide slice keeps the
# column axis, so no reshape is needed.
x_spell_error = x_ease[:, -2:-1]
x_unique_words = x_ease[:, -1:]

In [85]:
# Columns 2, 3 and 4 hold the comma, apostrophe and other-punctuation counts.
# Collapse them into one punctuation total per row, kept as an (n, 1) column.
x_punc_counts = x_12[:, 2:5].sum(axis=1, keepdims=True)

In [86]:
# X_ease_nobag: select the six columns of the 12-column block that are kept.
# Columns 2-4 (punctuation counts, summed separately) and 7, 9, 11 are left out.
x_6 = x_12[:, [0, 1, 5, 6, 8, 10]]
print(x_6.shape)
x_6[:3]

(1800, 6)


array([[ 782.        ,  138.        ,    4.79230769,  200.5       ,
          85.        ,   51.        ],
       [1155.        ,  206.        ,    4.81382979,  302.5       ,
         136.        ,   65.        ],
       [ 993.        ,  183.        ,    4.56976744,  266.        ,
         129.        ,   82.        ]])

In [87]:
# Stack the ten scalar features side by side:
#   cols 0-5  -> x_6
#   col  6    -> punctuation total
#   col  7    -> spelling errors
#   col  8    -> unique words
#   col  9    -> S-BERT similarity
x_10 = np.hstack(
    [x_6, x_punc_counts, x_spell_error, x_unique_words, sbert_similarity]
)
x_10.shape

(1800, 10)

In [88]:
# Assemble the full feature matrix column-wise: the ten scalar features first,
# then BoW similarity, the language-error column and the S-BERT embedding.
# The normalization cells further down address language error as column 11,
# which is consistent with bow_similarity being one column wide.
x = np.hstack([x_10, bow_similarity, lang_error, x_sbert])
x.shape

(1800, 780)

### save

In [89]:
# Persist the assembled paraphrase feature matrix to disk.
joblib.dump(x, 'essay_ease10_sbert768_simbow_langerr_780_paraphrase')

['essay_ease10_sbert768_simbow_langerr_780_paraphrase']

### load file to normalize

In [38]:
# Rebind x to the stored 'asap7' feature matrix. Whatever x referred to before
# (e.g. the paraphrase matrix built above) is no longer reachable via this name.
x = joblib.load('essay_ease10_sbert768_simbow_langerr_780_asap7')
x.shape

(1569, 780)

## Normalize some features

### 1. Spelling error / word count

In [39]:
# Spelling errors per word: column 7 divided by column 1 (word count).
spell_error = x[:, 7]
word_count = x[:, 1]
# Guard against empty answers: a plain `/` yields inf/NaN (plus a RuntimeWarning)
# when the word count is 0. Those rows get a rate of 0.0 instead; all other rows
# are computed exactly as before.
spell_error_normalized = np.divide(
    spell_error,
    word_count,
    out=np.zeros_like(spell_error, dtype=float),
    where=word_count != 0,
)
spell_error_normalized

array([0.06060606, 0.0952381 , 0.05454545, ..., 0.02061856, 0.02793296,
       0.0106383 ])

In [40]:
# Overwrite column 7 in place with the per-word rate. Because x is mutated,
# re-running the previous cell and then this one would divide by the word count
# a second time; reload x first if the normalization has to be repeated.
x[:,7] = spell_error_normalized

### 2. Language error / answer length

In [41]:
# Language errors relative to answer length: column 11 divided by column 0.
lang_error = x[:, 11]
answer_length = x[:, 0]
print(lang_error)
# Guard against empty answers: a plain `/` yields inf/NaN (plus a RuntimeWarning)
# when the answer length is 0. Those rows get 0.0 instead; all other rows are
# computed exactly as before.
lang_error_normalized = np.divide(
    lang_error,
    answer_length,
    out=np.zeros_like(lang_error, dtype=float),
    where=answer_length != 0,
)
print(lang_error_normalized)

[24. 11. 22. ...  8. 31.  2.]
[0.04828974 0.02350427 0.02879581 ... 0.00901917 0.02018229 0.00261438]


In [42]:
# Overwrite column 11 in place with the length-normalized value. x is mutated,
# so repeating the previous cell and this one would normalize twice.
x[:,11] = lang_error_normalized

### 3. Punctuation count / answer length

In [None]:
# Punctuation count relative to answer length: column 6 divided by column 0.
punc_count = x[:, 6]
answer_length = x[:, 0]
print(punc_count)
# Guard against empty answers: a plain `/` yields inf/NaN (plus a RuntimeWarning)
# when the answer length is 0. Those rows get 0.0 instead; all other rows are
# computed exactly as before.
punc_count_normalized = np.divide(
    punc_count,
    answer_length,
    out=np.zeros_like(punc_count, dtype=float),
    where=answer_length != 0,
)
print(punc_count_normalized)

In [None]:
# Overwrite column 6 in place with the length-normalized punctuation count.
# x is mutated, so repeating the previous cell and this one would normalize twice.
x[:,6] = punc_count_normalized

### 4. Good n-gram / answer length

In [None]:
# Good n-gram value relative to answer length: column 3 divided by column 0.
good_ngram = x[:, 3]
answer_length = x[:, 0]
print(good_ngram)
# Guard against empty answers: a plain `/` yields inf/NaN (plus a RuntimeWarning)
# when the answer length is 0. Those rows get 0.0 instead; all other rows are
# computed exactly as before.
good_ngram_normalized = np.divide(
    good_ngram,
    answer_length,
    out=np.zeros_like(good_ngram, dtype=float),
    where=answer_length != 0,
)
print(good_ngram_normalized)

In [None]:
# Overwrite column 3 in place with the length-normalized good n-gram value.
# x is mutated, so repeating the previous cell and this one would normalize twice.
x[:,3] = good_ngram_normalized

### save

In [43]:
# Write the normalized feature matrix to disk.
# NOTE(review): the punctuation and good-n-gram cells above carry no execution
# count (In [None]), so this recorded run appears to have saved only the
# spelling-error and language-error normalizations — confirm which is intended.
joblib.dump(x, 'essay_ease10_sbert768_simbow_langerr_780_normalized_asap7')

['essay_ease10_sbert768_simbow_langerr_780_normalized_asap7']

### check

In [95]:
# `a` is a second name for the array currently bound to x (no copy is made),
# so any later in-place change to x is visible through `a` as well.
a = x
a.shape

(1800, 780)

In [96]:
# Load a previously saved normalized matrix to compare against `a`.
# NOTE(review): this file name differs from the one written by the save cell
# above ('..._normalized_asap7'); presumably the notebook was re-run once per
# data set with the names edited by hand — confirm.
b = joblib.load('essay_ease10_sbert768_simbow_langerr_780_paraphrase_normalized')

In [97]:
# Element-wise exact comparison; displays a boolean array.
# Any NaN entries would compare as False even if present in both arrays.
a == b

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [98]:
# Confirm the container type of the feature matrix.
type(a)

numpy.ndarray

In [99]:
# Single-boolean equality check between the in-memory matrix and the saved one.
# np.array_equal compares shapes first, so a row/column count mismatch gives
# False instead of raising (or being hidden by broadcasting) as `(a == b).all()`
# can; for equal shapes the result is the same exact element-wise comparison.
np.array_equal(a, b)

True