import random
import re
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

def load_data(filename='sample_data/WikiLarge_Train.csv', read_partial=False, s=100000):
    """
    Load the dataset. If read_partial is True, only a random sample of s rows
    is read; otherwise the whole file is loaded.
    """
    if read_partial:
        # n is the total number of data rows in the file (excluding the header)
        with open(filename) as f:
            n = sum(1 for line in f) - 1
        # skip a random set of rows so that exactly s rows remain
        skip = sorted(random.sample(range(1, n + 1), n - s))
        return pd.read_csv(filename, skiprows=skip)
    return pd.read_csv(filename)
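# A minimal usage sketch for load_data (assumes the default csv path exists):
#   df_full = load_data()                              # read the whole file
#   df_sample = load_data(read_partial=True, s=50000)  # keep a random 50,000-row sample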
def tokenize(df, lemmatize=True, save_as=None):
    """
    Some preprocessing before training word2vec.
    Note that tokens are NOT lower-cased here.
    If save_as is given, the tokenized result is pickled to that path.
    """
    stops = stopwords.words('english')
    re_tokenized = []
    if not lemmatize:
        for text in tqdm(df.original_text):
            re_tokenized.append([x for x in re.findall(r'(\w+)', text)
                                 if x.lower() not in stops and len(x) > 1])
        save_as = save_as or 'sample_data/re_tokenized_nolemma.pkl'
    if lemmatize:
        import spacy
        # 'en' is the legacy spaCy shortcut; newer versions use 'en_core_web_sm'
        nlp = spacy.load('en', disable=['parser', 'ner'])
        for text in tqdm(df.original_text):
            text = nlp(text)
            re_tokenized.append([token.lemma_ for token in text
                                 if str(token).lower() not in stops and len(str(token)) > 1
                                 and not str(token.lemma_).startswith('-')])
        save_as = save_as or 'sample_data/re_tokenized_lemma.pkl'
    # save for later import
    with open(save_as, 'wb') as f:
        pickle.dump(re_tokenized, f)
    return re_tokenized
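# A minimal usage sketch for tokenize (the lemmatized variant assumes a spaCy
# English model is installed):
#   tokens = tokenize(df, lemmatize=False)  # regex tokens, stopwords/1-char tokens dropped
#   tokens = tokenize(df, lemmatize=True)   # spaCy lemmas instead of raw tokens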
def load_external_resources():
    """
    Load external files for later processing:
    1. dale_chall.txt
    2. AoA_51715_words.csv
    3. Concreteness_ratings_Brysbaert_et_al_BRM.txt
    """
    # file 1: the Dale-Chall list of 'simple' words, one word per line
    with open('sample_data/dale_chall.txt', 'r') as f:
        dale = [x.strip() for x in f.readlines()]
    # file 2: age-of-acquisition ratings
    aoa_df = pd.read_csv('sample_data/AoA_51715_words.csv', encoding='iso-8859-1')
    # aoa = aoa[['Word', 'AoA_Kup_lem']]
    # aoa_dict = aoa.set_index('Word').to_dict()['AoA_Kup_lem']
    # file 3: concreteness ratings (tab-separated, first line is the header)
    with open('sample_data/Concreteness_ratings_Brysbaert_et_al_BRM.txt', 'r') as f:
        concrete = f.readlines()
    concrete_df = pd.DataFrame([x.split('\t') for x in concrete])
    concrete_df.columns = concrete_df.iloc[0]
    concrete_df = concrete_df.iloc[1:]
    # the last header keeps its trailing newline because the line was only split on tabs
    concrete_df = concrete_df.drop('Dom_Pos\n', axis=1)
    float_cols = ['Bigram', 'Conc.M', 'Conc.SD', 'Unknown', 'Total', 'Percent_known', 'SUBTLEX']
    concrete_df[float_cols] = concrete_df[float_cols].astype(float)
    return dale, aoa_df, concrete_df
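# A minimal usage sketch for load_external_resources (assumes the three resource
# files listed in the docstring are present under sample_data/):
#   dale, aoa_df, concrete_df = load_external_resources()
#   aoa_dict = aoa_df.set_index('Word').to_dict()['AoA_Kup_lem']  # word -> age of acquisition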
def syllable_count(word):
    """
    Count the number of syllables in a word with a simple vowel-group heuristic.
    """
    word = word.lower()
    if not word:
        return 0
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    # every vowel that follows a non-vowel starts a new vowel group
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    # a trailing 'e' is usually silent
    if word.endswith("e"):
        count -= 1
    # every word has at least one syllable
    if count == 0:
        count += 1
    return count
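# Worked examples of the heuristic above:
#   syllable_count("readability") -> 5  (vowel groups: ea, a, i, i, y)
#   syllable_count("make")        -> 1  (two vowel groups minus the trailing silent 'e')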
def calc_dale_score(doc_tokens, doc_sent_cnt, dale_list):
    """
    doc_tokens is a list of tokens that have been preprocessed (lower-cased, stopwords removed).
    doc_sent_cnt: how many sentences are in this document.
    dale_list is the list of 'simple words' defined by the official Dale-Chall study.
    Returns a score indicating the readability of this chunk of text, together with asw.
    (pdw: percentage of difficult words,
     asl: average sentence length in words,
     asw: average number of syllables per word,
     ref: https://readabilityformulas.com/new-dale-chall-readability-formula.php)
    """
    simple_cnt = len([x for x in doc_tokens if x.lower() in dale_list])
    total_cnt = len(doc_tokens) * 1.0
    if total_cnt == 0:
        return 0, 0
    # pdw is kept as a fraction (0-1); 0.05 corresponds to 5% difficult words
    pdw = 1 - simple_cnt / total_cnt
    asl = total_cnt / doc_sent_cnt
    score = 0.1579 * pdw + 0.0496 * asl
    if pdw > 0.05:
        score = score + 3.6365
    # join with spaces so word boundaries are preserved for syllable counting
    text = ' '.join(doc_tokens)
    sy_cnt = syllable_count(text)
    asw = sy_cnt / total_cnt
    # score2 = 206.835 - 1.015*asl - 84.6*asw
    return (score, asw)
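# Worked example of the formula above: for 10 tokens of which 8 are on the
# Dale-Chall list, spread over 2 sentences:
#   pdw = 1 - 8/10 = 0.2, asl = 10/2 = 5
#   score = 0.1579*0.2 + 0.0496*5 + 3.6365 (since pdw > 0.05) = 3.91608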
def get_score_from_dict(list_of_tokens, score_dict):
"""
    Calculate the average score for a list of cleaned tokens based on the given score dictionary.
"""
scores = []
for token in list_of_tokens:
if token.lower() in score_dict:
scores.append(score_dict[token.lower()])
if scores:
return np.nanmean(scores)
else:
return np.nan
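# Example: get_score_from_dict(['Cat', 'runs'], {'cat': 3.0}) -> 3.0
# ('runs' is missing from the dictionary, so only 'cat' contributes to the mean)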
def get_avg_word_length(list_of_tokens):
"""
Calculate the average token length for a list of cleaned tokens in a sentence.
"""
lengths = [len(token) for token in list_of_tokens]
if len(lengths) > 0:
return np.nanmean(lengths)
else:
return np.nan
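# Example: get_avg_word_length(['cat', 'banana']) -> (3 + 6) / 2 = 4.5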
def calculate_scores(df, lemma=False, token_file='re_tokenized_', train_or_test='Train'):
    """
    Calculate 11 numerical scores (10 of which are saved as csv files).
    df must have an 'original_text' column; a 're_tokened' column is created
    (or loaded from a pickle) if it is not already present.
    """
dale, aoa_df, concrete_df = load_external_resources()
aoa_dict1 = aoa_df.set_index('Word').to_dict()['AoA_Kup_lem']
concrete_dict1 = concrete_df.set_index('Word').to_dict()['Percent_known']
concrete_dict2 = concrete_df.set_index('Word').to_dict()['Conc.M']
concrete_dict3 = concrete_df.set_index('Word').to_dict()['SUBTLEX']
concrete_dict4 = concrete_df.set_index('Word').to_dict()['Total']
df['sentence_cnt'] = df.original_text.apply(lambda x: len(sent_tokenize(x)))
    if 're_tokened' not in df.columns:
        try:
            if not lemma:
                re_tokened = pickle.load(open(f'{token_file}nolemma.pkl', 'rb'))
            if lemma:
                re_tokened = pickle.load(open(f'{token_file}lemma.pkl', 'rb'))
            df['re_tokened'] = re_tokened
        except (FileNotFoundError, EOFError):
            if not lemma:
                print('tokenizing original text without lemmatization')
                df['re_tokened'] = tokenize(df, lemmatize=False, save_as=f'{token_file}nolemma.pkl')
            if lemma:
                print('tokenizing original text with lemmatization')
                df['re_tokened'] = tokenize(df, lemmatize=True, save_as=f'{token_file}lemma.pkl')
    # accumulate the row-wise scores that need a loop; the dictionary-based scores are computed below
dale_chall_scores = []
asw_scores = []
avg_word_lens = []
cnt_verbs_all = []
for i in tqdm(range(len(df))):
item = df.iloc[i]
score1, score2 = calc_dale_score(item['re_tokened'], item['sentence_cnt'], dale)
dale_chall_scores.append(score1)
asw_scores.append(score2)
avg_word_lens.append(get_avg_word_length(item['re_tokened']))
# count how many verbs appeared in this doc without removing stopwords
tags = pos_tag(word_tokenize(df['original_text'].iloc[i]))
cnt_verbs = sum([x[-1][0]=='V' for x in tags])
cnt_verbs_all.append(cnt_verbs)
df['dale_chall_score'] = dale_chall_scores
df['syllable_per_word'] = asw_scores
df['concrete_score'] = df['re_tokened'].apply(lambda x: get_score_from_dict(x, concrete_dict1))
df['conc_mean'] = df['re_tokened'].apply(lambda x: get_score_from_dict(x, concrete_dict2))
df['subtlex'] = df['re_tokened'].apply(lambda x: get_score_from_dict(x, concrete_dict3))
df['conc_total'] = df['re_tokened'].apply(lambda x: get_score_from_dict(x, concrete_dict4))
df['average_word_len'] = avg_word_lens
df['aoa'] = df['re_tokened'].apply(lambda x: get_score_from_dict(x, aoa_dict1))
df['verb2'] = cnt_verbs_all
df['word_cnt'] = df['re_tokened'].apply(len)
# ============================================
# save results as csvs for later reloading
df[['dale_chall_score']].to_csv(f'sample_data/WikiLarge_{train_or_test}_dale_chall_score2.csv')
df[['syllable_per_word']].to_csv(f'sample_data/WikiLarge_{train_or_test}_syllable_per_w2.csv')
df[['concrete_score']].to_csv(f'sample_data/WikiLarge_{train_or_test}_concrete_score2.csv')
df[['conc_mean']].to_csv(f'sample_data/WikiLarge_{train_or_test}_conc_mean_score2.csv')
df[['subtlex']].to_csv(f'sample_data/WikiLarge_{train_or_test}_conc_subtlex_score2.csv')
df[['conc_total']].to_csv(f'sample_data/WikiLarge_{train_or_test}_conc_total_scores2.csv')
df[['average_word_len']].to_csv(f'sample_data/WikiLarge_{train_or_test}_avg_word_len2.csv')
df[['aoa']].to_csv(f'sample_data/WikiLarge_{train_or_test}_aoa2.csv')
df[['verb2']].to_csv(f'sample_data/WikiLarge_{train_or_test}_verb_cnts2.csv')
df[['word_cnt']].to_csv(f'sample_data/WikiLarge_{train_or_test}_word_count2.csv')
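# Typical end-to-end usage of calculate_scores (a sketch, assuming the WikiLarge
# csv and the external resource files exist under sample_data/):
#   df = load_data('sample_data/WikiLarge_Train.csv')
#   calculate_scores(df, lemma=False, train_or_test='Train')
#   # writes one sample_data/WikiLarge_Train_*2.csv file per feature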
def load_df_and_features(path, train=True):
    """
    Load the calculated features along with the dataset.
    :param path: path to the directory that contains sample_data/
    :param train: whether it's the training dataset. If False, load the testing dataset.
    :return: df with the loaded features attached
    """
    import os
    import glob
    # assumes the test split follows the same naming scheme as the training split
    split = 'Train' if train else 'Test'
    feature_files = glob.glob(os.path.join(path, f'sample_data/WikiLarge_{split}_*2.csv'))
    features = []
    for f in feature_files:
        temp = pd.read_csv(f, index_col=0)
        features.append(temp)
    feature_df = pd.concat(features, axis=1)
    # fill na with mean
    feature_df = feature_df.fillna(feature_df.mean())
    df = load_data(filename=os.path.join(path, f'sample_data/WikiLarge_{split}.csv'), read_partial=False)
    feature_df = pd.concat([df, feature_df], axis=1)
    chosen_features = ['aoa', 'concrete_score', 'verb2', 'conc_unknown', 'aoa_perc_known_lem',
                       'conc_total', 'syllable_per_word', 'conc_mean_score', 'dale_chall_score',
                       'conc_subtlex_score']
    # keep only the features that are actually present among the loaded csvs
    chosen_features = [c for c in chosen_features if c in feature_df.columns]
    chosen_cols = ['original_text', 'label'] + chosen_features
    feature_df = feature_df[chosen_cols]
    return feature_df
if __name__ == '__main__':
test = load_df_and_features(path='')
print(test.head())
print(test.columns)