v1.3

# Objective

The objective of this notebook is to:
- calculate the readability features (rs_)

# Pre-checks

In [14]:
## Check for Google Drive Connectivity
# Mount Google Drive when running inside Colab. Anywhere else the import fails
# and the notebook falls back to local files (google_env = False).
try:
    from google.colab import drive
    drive.mount('/content/drive')
    google_env = True
except Exception:
    # `Exception` instead of a bare `except:` so that Ctrl-C / SystemExit raised
    # during the interactive mount prompt are not silently swallowed.
    print("Not a Goolge Drive Environment.")
    google_env = False

Not a Goolge Drive Environment.


In [15]:
# Start the wall-clock timer; the elapsed time is computed from t_start
# in the runtime cell near the end of the notebook.
import time
t_start = time.time()

In [16]:
# Fast-run switch: toggle by swapping the two assignments below.
# NOTE(review): in the visible cells FASTRUN only gates the final push
# notification and sample_fraction is only printed; no cell samples the
# data frames with it. Confirm whether sampling was intended.
FASTRUN = False
#FASTRUN = True

# Fraction of rows to keep when the fast run is enabled
sample_fraction = 0.1

banner = "==="*40
print(banner)
print("Fastrun enabled:", FASTRUN)
if FASTRUN:
    print("Sample size:",sample_fraction)
print(banner)


Fastrun enabled: False


# Setup Environment

## Install Modules

In [17]:
#!pip install watermark
#!pip install textstat
#!pip install fastparquet
#!pip install -q language_tool_python spacy  # -U
#!python -m spacy download en_core_web_sm

## Import Modules

In [18]:
# Base libraries
import os
import re
from datetime import date

# Scientific libraries
import numpy as np
import pandas as pd

# Specific libraries
import textstat

# Visualization
import seaborn as sns
sns.set(rc={'figure.figsize': (8, 4)})
sns.set(font_scale=0.8)

# Helper libraries
from tqdm import tqdm
# Registers the `progress_apply` method on pandas objects; every scoring cell
# below relies on it.
tqdm.pandas()
from watermark import watermark
import gc  # garbage collection to optimize memory usage, use gc.collect()
import warnings
# NOTE(review): this silences ALL warnings for the whole notebook, including
# deprecation warnings from pandas/textstat.
warnings.filterwarnings('ignore')

# Pandas options
# No truncation of rows, columns, width or cell content when frames are displayed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not just the last
# one. This is why bare `gc.collect()` calls show their return value in the
# recorded outputs and why two `.shape` lines in one cell both print.
InteractiveShell.ast_node_interactivity = 'all'

# Load magic commands
%load_ext watermark

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


## Define Parameters

In [19]:
# Global notebook parameters
seed = 42
# Leave two cores free to keep the machine responsive when fitting the models.
# os.cpu_count() may return None when the count cannot be determined, and
# subtracting 2 on a 1- or 2-core machine would give 0 or a negative number,
# so the result is clamped to at least one worker.
cpu_count = max(1, (os.cpu_count() or 1) - 2)
notebook_no = "01.02"
today = date.today()

## Helper Functions

In [20]:
import http.client
import os
import urllib.parse  # explicit submodule import; a plain `import urllib` does not guarantee `urllib.parse`


def send_push(message):
    """Send a push notification to the Pushover service (best effort).

    Parameters
    ----------
    message : str
        Text of the notification.

    Returns
    -------
    None
        Communication problems are reported with a printed message instead of
        being raised, so a failed notification never aborts the notebook.
    """
    # NOTE(review): the credentials below are hard-coded in the notebook. They
    # can now be overridden through the PUSHOVER_TOKEN / PUSHOVER_USER
    # environment variables; the literals remain only as backward-compatible
    # defaults and should be rotated and removed from source.
    conn = None
    try:
        payload = urllib.parse.urlencode({
            "token": os.environ.get("PUSHOVER_TOKEN", "ahs1q4mwpnxe3645zeaqzas69whq7a"),  # ML Notifications Channel
            "user": os.environ.get("PUSHOVER_USER", "u5vr1qkc9ghudg2ehuug153okeiz1d"),
            "message": message,
        })
        # A timeout keeps an unreachable endpoint from blocking the notebook forever.
        conn = http.client.HTTPSConnection("api.pushover.net:443", timeout=10)
        conn.request("POST", "/1/messages.json", payload,
                     {"Content-type": "application/x-www-form-urlencoded"})
        conn.getresponse()
    except Exception:
        print("There was a communication issue (pushover).")
    finally:
        # Always release the socket, on success and on failure.
        if conn is not None:
            conn.close()


# Load Data

In [21]:
# Load Data
# Resolve the project root first: the shared Google Drive folder when running
# in Colab, the parent directory otherwise.
if not google_env:
    ROOT_PATH = "../"
    print("Not a Google Drive Environment. Loading local files.")
else:
    # Location for "shared with" people
    # create a shortcut of the shared folder in your Google Drive root folder
    ROOT_PATH = "/content/drive/MyDrive/SIADS696/Environment/"

# Project sub-folders, all relative to ROOT_PATH
PATH_DATA = "data/"
PATH_DATA_RAW = "data/raw/"
PATH_DATA_INT = "data/interim/"
PATH_DATA_PRO = "data/processed/"
PATH_DATA_MOD = "models/"
PATH_DATA_REP = "reports/"
PATH_DATA_FIG = "reports/figures/"
PATH_DATA_HTML = "reports/html/"

# Input frames for the train and test split, read from the interim data folder
interim_dir = ROOT_PATH + PATH_DATA_INT
df_wiki_train = pd.read_parquet(interim_dir + "train_features_clean_stats.parquet.gzip")
df_wiki_test = pd.read_parquet(interim_dir + "test_features_clean_stats.parquet.gzip")

Not a Google Drive Environment. Loading local files.


In [22]:
# Sanity check of the loaded frames. Both shapes are displayed because
# ast_node_interactivity is set to 'all' in the import cell.
df_wiki_train.shape
df_wiki_test.shape

(416768, 37)

(119092, 38)

# 4.0 Data Cleaning and Feature Engineering

**Tips on Creating Features**
- Linear models learn sums and differences naturally, but can't learn anything more complex.
- Ratios seem to be difficult for most models to learn. Ratio combinations often lead to some easy performance gains.
- Linear models and neural nets generally do better with normalized features. Neural nets especially need features scaled to values not too far from 0. Tree-based models (like random forests and XGBoost) can sometimes benefit from normalization, but usually much less so.
- Tree models can learn to approximate almost any combination of features, but when a combination is especially important they can still benefit from having it explicitly created, especially when data is limited.
- Counts are especially helpful for tree models, since these models don't have a natural way of aggregating information across many features at once.
[Source](https://www.kaggle.com/code/ryanholbrook/creating-features)

### Calculate readability scores (rs)

#### All Languages

In [24]:
# Scores computed for every row, regardless of the detected language.
# Mapping: column name stem -> textstat function; "_all" is appended below.
rs_algo_all_dict = {
    'rs_automated_readability_index': textstat.automated_readability_index,
    'rs_lix': textstat.lix,
    'rs_rix': textstat.rix,
}

lang = 'en'
textstat.set_lang(lang)

print("==="*15, "Wiki Train", "==="*15)
gc.collect()

for score_name, score_fn in rs_algo_all_dict.items():
    target_col = score_name+"_all"
    print(target_col)
    # Placeholder first, then overwritten with the score of every row
    df_wiki_train[target_col] = -1
    all_scores = df_wiki_train['cleaned_text'].progress_apply(score_fn) #.round(1)
    df_wiki_train[target_col] = all_scores

print("==="*15, "Wiki Test", "==="*15)
gc.collect()
for score_name, score_fn in rs_algo_all_dict.items():
    target_col = score_name+"_all"
    print(target_col)
    df_wiki_test[target_col] = -1
    all_scores = df_wiki_test['cleaned_text'].progress_apply(score_fn) #.round(1)
    df_wiki_test[target_col] = all_scores



3066

rs_automated_readability_index_all


100%|██████████| 416768/416768 [00:02<00:00, 145780.42it/s]


rs_lix_all


100%|██████████| 416768/416768 [00:02<00:00, 140271.28it/s]


rs_rix_all


100%|██████████| 416768/416768 [00:02<00:00, 155959.48it/s]




0

rs_automated_readability_index_all


100%|██████████| 119092/119092 [00:00<00:00, 156223.93it/s]


rs_lix_all


100%|██████████| 119092/119092 [00:00<00:00, 144142.90it/s]


rs_rix_all


100%|██████████| 119092/119092 [00:00<00:00, 164728.07it/s]


#### English Only

In [25]:
# Dictionary with only the english supported readability scores
# Dict: new column name: textstat function
# The keys are column name stems; the scoring cell appends "_" + lang ("_en").
# Insertion order is the order in which the columns are created.
rs_algo_en_dict = {
    'rs_automated_readability_index': textstat.automated_readability_index,
    'rs_flesch_kincaid_grade': textstat.flesch_kincaid_grade,
    'rs_coleman_liau_index': textstat.coleman_liau_index,
    'rs_linsear_write_formula': textstat.linsear_write_formula,
    # score1 / score2: the two Dale-Chall functions exposed by textstat
    'rs_dale_chall_readability_score1': textstat.dale_chall_readability_score,
    'rs_dale_chall_readability_score2': textstat.dale_chall_readability_score_v2,
    'rs_gunning_fog': textstat.gunning_fog,
    'rs_difficult_words': textstat.difficult_words,
    'rs_spache_readability': textstat.spache_readability,
    }

In [27]:
# English-only scores. Each formula is evaluated on all rows; np.where then
# keeps the result for rows detected as English and the -1 placeholder elsewhere.
print("==="*15, "Wiki Train", "==="*15)
gc.collect()
lang = 'en'
textstat.set_lang(lang)
for score_name, score_fn in rs_algo_en_dict.items():
    target_col = score_name+"_"+lang
    print(target_col)
    df_wiki_train[target_col] = -1  # placeholder for rows in other languages
    is_lang = df_wiki_train['stats_language_code'] == lang
    scores = df_wiki_train['cleaned_text'].progress_apply(score_fn) #.round(1)
    df_wiki_train[target_col] = np.where(is_lang, scores, df_wiki_train[target_col])

print("==="*15, "Wiki Test", "==="*15)
gc.collect()
for score_name, score_fn in rs_algo_en_dict.items():
    target_col = score_name+"_"+lang
    print(target_col)
    df_wiki_test[target_col] = -1  # placeholder for rows in other languages
    is_lang = df_wiki_test['stats_language_code'] == lang
    scores = df_wiki_test['cleaned_text'].progress_apply(score_fn) #.round(1)
    df_wiki_test[target_col] = np.where(is_lang, scores, df_wiki_test[target_col])



48

rs_automated_readability_index_en


100%|██████████| 416768/416768 [00:02<00:00, 150651.27it/s]


rs_flesch_kincaid_grade_en


100%|██████████| 416768/416768 [00:09<00:00, 42131.12it/s]


rs_coleman_liau_index_en


100%|██████████| 416768/416768 [00:03<00:00, 117601.64it/s]


rs_linsear_write_formula_en


100%|██████████| 416768/416768 [00:08<00:00, 47360.96it/s]


rs_dale_chall_readability_score1_en


100%|██████████| 416768/416768 [00:11<00:00, 37621.71it/s]


rs_dale_chall_readability_score2_en


100%|██████████| 416768/416768 [00:10<00:00, 39302.93it/s]


rs_gunning_fog_en


100%|██████████| 416768/416768 [00:11<00:00, 37779.00it/s]


rs_difficult_words_en


100%|██████████| 416768/416768 [00:08<00:00, 50108.89it/s]


rs_spache_readability_en


100%|██████████| 416768/416768 [00:10<00:00, 39447.11it/s]




0

rs_automated_readability_index_en


100%|██████████| 119092/119092 [00:00<00:00, 157675.93it/s]


rs_flesch_kincaid_grade_en


100%|██████████| 119092/119092 [00:02<00:00, 48985.77it/s]


rs_coleman_liau_index_en


100%|██████████| 119092/119092 [00:00<00:00, 125026.64it/s]


rs_linsear_write_formula_en


100%|██████████| 119092/119092 [00:02<00:00, 58809.97it/s]


rs_dale_chall_readability_score1_en


100%|██████████| 119092/119092 [00:02<00:00, 46867.39it/s]


rs_dale_chall_readability_score2_en


100%|██████████| 119092/119092 [00:02<00:00, 48123.09it/s]


rs_gunning_fog_en


100%|██████████| 119092/119092 [00:02<00:00, 47999.72it/s]


rs_difficult_words_en


100%|██████████| 119092/119092 [00:01<00:00, 66409.16it/s]


rs_spache_readability_en


100%|██████████| 119092/119092 [00:02<00:00, 50907.46it/s]


In [28]:
print('rs_text_standard_en')
print("==="*15, "Wiki Train", "==="*15)
gc.collect()


def text_standard_float(text):
    """Return textstat's text_standard estimate for `text` as a float."""
    return textstat.text_standard(text, float_output=True)


# NOTE(review): this cell does not call textstat.set_lang itself; it relies on
# the language set by the previous cell. Confirm when running cells out of order.
df_wiki_train['rs_text_standard_en'] = -1  # placeholder for non-English rows
is_english = df_wiki_train['stats_language_code'] == 'en'
standard_scores = df_wiki_train['cleaned_text'].progress_apply(text_standard_float) #.round(1)
df_wiki_train['rs_text_standard_en'] = np.where(is_english,
                                                standard_scores,
                                                df_wiki_train['rs_text_standard_en'])

print("==="*15, "Wiki Test", "==="*15)
gc.collect()
df_wiki_test['rs_text_standard_en'] = -1  # placeholder for non-English rows
is_english = df_wiki_test['stats_language_code'] == 'en'
standard_scores = df_wiki_test['cleaned_text'].progress_apply(text_standard_float) #.round(1)
df_wiki_test['rs_text_standard_en'] = np.where(is_english,
                                               standard_scores,
                                               df_wiki_test['rs_text_standard_en'])

rs_text_standard_en


16

100%|██████████| 416768/416768 [00:32<00:00, 12796.24it/s]




0

100%|██████████| 119092/119092 [00:07<00:00, 14990.52it/s]


#### Multi-language

In [29]:
print('rs_flesch_reading_ease_multi')
print("==="*15, "Wiki Train", "==="*15)
gc.collect()

# rs_flesch_reading_ease supported languages
language_list = ['en', 'de', 'es', 'fr', 'it', 'nl', 'ru']
df_wiki_train['rs_flesch_reading_ease_multi'] = -1  # stays -1 for unsupported languages

# One pass per language: switch textstat to that language, score all rows and
# keep the result only for the rows detected as that language.
for lang in language_list:
    print("rs_flesch_reading_ease:", lang)
    textstat.set_lang(lang)
    is_lang = df_wiki_train['stats_language_code'] == lang
    ease_scores = df_wiki_train['cleaned_text'].progress_apply(textstat.flesch_reading_ease) #.round(1)
    df_wiki_train['rs_flesch_reading_ease_multi'] = np.where(is_lang,
                                                             ease_scores,
                                                             df_wiki_train['rs_flesch_reading_ease_multi'])

print("==="*15, "Wiki Test", "==="*15)
gc.collect()
df_wiki_test['rs_flesch_reading_ease_multi'] = -1  # stays -1 for unsupported languages

for lang in language_list:
    print("rs_flesch_reading_ease:", lang)
    textstat.set_lang(lang)
    is_lang = df_wiki_test['stats_language_code'] == lang
    ease_scores = df_wiki_test['cleaned_text'].progress_apply(textstat.flesch_reading_ease) #.round(1)
    df_wiki_test['rs_flesch_reading_ease_multi'] = np.where(is_lang,
                                                            ease_scores,
                                                            df_wiki_test['rs_flesch_reading_ease_multi'])

rs_flesch_reading_ease_multi


16

rs_flesch_reading_ease: en


100%|██████████| 416768/416768 [00:08<00:00, 48998.15it/s]


rs_flesch_reading_ease: de


100%|██████████| 416768/416768 [00:10<00:00, 38788.13it/s]


rs_flesch_reading_ease: es


100%|██████████| 416768/416768 [00:10<00:00, 38714.17it/s]


rs_flesch_reading_ease: fr


100%|██████████| 416768/416768 [00:10<00:00, 40618.25it/s]


rs_flesch_reading_ease: it


100%|██████████| 416768/416768 [00:10<00:00, 40339.97it/s]


rs_flesch_reading_ease: nl


100%|██████████| 416768/416768 [00:10<00:00, 40594.35it/s]


rs_flesch_reading_ease: ru


100%|██████████| 416768/416768 [00:08<00:00, 47182.68it/s]




0

rs_flesch_reading_ease: en


100%|██████████| 119092/119092 [00:02<00:00, 55732.61it/s]


rs_flesch_reading_ease: de


100%|██████████| 119092/119092 [00:02<00:00, 48069.76it/s]


rs_flesch_reading_ease: es


100%|██████████| 119092/119092 [00:02<00:00, 47823.33it/s]


rs_flesch_reading_ease: fr


100%|██████████| 119092/119092 [00:02<00:00, 50525.31it/s]


rs_flesch_reading_ease: it


100%|██████████| 119092/119092 [00:02<00:00, 48981.26it/s]


rs_flesch_reading_ease: nl


100%|██████████| 119092/119092 [00:02<00:00, 48623.29it/s]


rs_flesch_reading_ease: ru


100%|██████████| 119092/119092 [00:02<00:00, 56156.11it/s]


#### Spanish

In [30]:
# Dictionary with only the Spanish supported readability scores
# Dict: new column name: textstat function
# The keys are column name stems; the scoring cell appends "_" + lang ("_es").
# Insertion order is the order in which the columns are created.
rs_algo_es_dict = {
    'rs_fernandez_huerta': textstat.fernandez_huerta,
    'rs_gutierrez_polini': textstat.gutierrez_polini,
    'rs_crawford': textstat.crawford,
    'rs_szigriszt_pazos': textstat.szigriszt_pazos,
}

In [31]:
# Spanish-only scores. Each formula is evaluated on all rows; np.where then
# keeps the result for rows detected as Spanish and the -1 placeholder elsewhere.
print("==="*15, "Wiki Train", "==="*15)
gc.collect()

lang = 'es'
textstat.set_lang(lang)

for score_name, score_fn in rs_algo_es_dict.items():
    target_col = score_name+"_"+lang
    print(target_col)
    df_wiki_train[target_col] = -1  # placeholder for rows in other languages
    is_lang = df_wiki_train['stats_language_code'] == lang
    scores = df_wiki_train['cleaned_text'].progress_apply(score_fn) #.round(1)
    df_wiki_train[target_col] = np.where(is_lang, scores, df_wiki_train[target_col])

print("==="*15, "Wiki Test", "==="*15)
gc.collect()

for score_name, score_fn in rs_algo_es_dict.items():
    target_col = score_name+"_"+lang
    print(target_col)
    df_wiki_test[target_col] = -1  # placeholder for rows in other languages
    is_lang = df_wiki_test['stats_language_code'] == lang
    scores = df_wiki_test['cleaned_text'].progress_apply(score_fn) #.round(1)
    df_wiki_test[target_col] = np.where(is_lang, scores, df_wiki_test[target_col])



33

rs_fernandez_huerta_es


100%|██████████| 416768/416768 [00:08<00:00, 49733.43it/s]


rs_gutierrez_polini_es


100%|██████████| 416768/416768 [00:02<00:00, 149788.05it/s]


rs_crawford_es


100%|██████████| 416768/416768 [00:07<00:00, 55771.56it/s]


rs_szigriszt_pazos_es


100%|██████████| 416768/416768 [00:07<00:00, 52443.08it/s]




0

rs_fernandez_huerta_es


100%|██████████| 119092/119092 [00:02<00:00, 54708.04it/s]


rs_gutierrez_polini_es


100%|██████████| 119092/119092 [00:00<00:00, 162473.66it/s]


rs_crawford_es


100%|██████████| 119092/119092 [00:02<00:00, 57863.67it/s]


rs_szigriszt_pazos_es


100%|██████████| 119092/119092 [00:02<00:00, 56953.15it/s]


#### German

In [32]:
lang = 'de'
textstat.set_lang(lang)
gc.collect()


def wiener_sachtextformel_v2(text):
    """Return the Wiener Sachtextformel of `text`, using textstat's variant 2."""
    return textstat.wiener_sachtextformel(text, variant=2)


# German-only scores: computed on all rows, kept only where the detected
# language is German; every other row keeps the -1 placeholder.
print("==="*15, "Wiki Train", "==="*15)
print('rs_wiener_sachtextformel_de')
df_wiki_train['rs_wiener_sachtextformel_de'] = -1
is_german = df_wiki_train['stats_language_code'] == lang
wiener_scores = df_wiki_train['cleaned_text'].progress_apply(wiener_sachtextformel_v2) #.round(1)
df_wiki_train['rs_wiener_sachtextformel_de'] = np.where(is_german,
                                                        wiener_scores,
                                                        df_wiki_train['rs_wiener_sachtextformel_de'])

print('rs_lix_de')
df_wiki_train['rs_lix_de'] = -1
is_german = df_wiki_train['stats_language_code'] == lang
lix_scores = df_wiki_train['cleaned_text'].progress_apply(textstat.lix) #.round(1)
df_wiki_train['rs_lix_de'] = np.where(is_german, lix_scores, df_wiki_train['rs_lix_de'])


print("==="*15, "Wiki Test", "==="*15)
print('rs_wiener_sachtextformel_de')
df_wiki_test['rs_wiener_sachtextformel_de'] = -1
is_german = df_wiki_test['stats_language_code'] == lang
wiener_scores = df_wiki_test['cleaned_text'].progress_apply(wiener_sachtextformel_v2) #.round(1)
df_wiki_test['rs_wiener_sachtextformel_de'] = np.where(is_german,
                                                       wiener_scores,
                                                       df_wiki_test['rs_wiener_sachtextformel_de'])

print('rs_lix_de')
df_wiki_test['rs_lix_de'] = -1
is_german = df_wiki_test['stats_language_code'] == lang
lix_scores = df_wiki_test['cleaned_text'].progress_apply(textstat.lix) #.round(1)
df_wiki_test['rs_lix_de'] = np.where(is_german, lix_scores, df_wiki_test['rs_lix_de'])

0

rs_wiener_sachtextformel_de


100%|██████████| 416768/416768 [00:11<00:00, 35929.13it/s]


rs_lix_de


100%|██████████| 416768/416768 [00:02<00:00, 141498.26it/s]


rs_wiener_sachtextformel_de


100%|██████████| 119092/119092 [00:02<00:00, 44325.05it/s]


rs_lix_de


100%|██████████| 119092/119092 [00:00<00:00, 150310.62it/s]


#### Italian

In [33]:
print('rs_gulpease_index_it')
print("==="*15, "Wiki Train", "==="*15)

gc.collect()
lang = 'it'
textstat.set_lang(lang)

# Italian-only score. The column is named after the Gulpease index, so it must
# be filled with textstat.gulpease_index; the previous version applied
# textstat.wiener_sachtextformel(variant=2), copied from the German cell.
# Rows that are not Italian keep the -1 placeholder.
df_wiki_train['rs_gulpease_index_it'] = -1
df_wiki_train['rs_gulpease_index_it'] = np.where(df_wiki_train['stats_language_code'] == lang,
                                                 df_wiki_train['cleaned_text'].progress_apply(textstat.gulpease_index), #.round(1),
                                                 df_wiki_train['rs_gulpease_index_it'])

print("==="*15, "Wiki Test", "==="*15)
df_wiki_test['rs_gulpease_index_it'] = -1
df_wiki_test['rs_gulpease_index_it'] = np.where(df_wiki_test['stats_language_code'] == lang,
                                                df_wiki_test['cleaned_text'].progress_apply(textstat.gulpease_index), #.round(1),
                                                df_wiki_test['rs_gulpease_index_it'])

rs_gulpease_index_it


16

100%|██████████| 416768/416768 [00:11<00:00, 35552.22it/s]




100%|██████████| 119092/119092 [00:02<00:00, 41686.96it/s]


***

In [None]:
# Creating interaction features
def calculate_rs_additional_features(df):
    """Add word-count and syllable-count interaction columns for selected scores.

    For each listed readability score two columns are added to `df`: the score
    multiplied by 'stats_word_count' (suffix 1) and by 'stats_syllable_count'
    (suffix 2).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'stats_word_count', 'stats_syllable_count' and the score
        columns listed below.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the columns are added in place.
    """
    # NOTE(review): for the language-specific scores, rows holding the -1
    # placeholder are multiplied as well, so those rows end up with minus the
    # word/syllable count instead of -1. Confirm this is intended.
    # (source score column, word-count product, syllable-count product)
    interactions = (
        ('rs_automated_readability_index_all', 'rs_ari_all1', 'rs_ari_all2'),
        ('rs_automated_readability_index_en', 'rs_ari_en1', 'rs_ari_en2'),
        ('rs_flesch_reading_ease_multi', 'rs_fre_multi1', 'rs_fre_multi2'),
        ('rs_lix_all', 'rs_lix1', 'rs_lix2'),
        ('rs_rix_all', 'rs_rix1', 'rs_rix2'),
    )
    # Further interactions (flesch, gunning fog, dale-chall, linsear write,
    # coleman-liau, difficult words) were commented out in the earlier version
    # and stay disabled.
    for score_col, by_words_col, by_syllables_col in interactions:
        df[by_words_col] = df['stats_word_count'] * df[score_col]
        df[by_syllables_col] = df['stats_syllable_count'] * df[score_col]

    return df

In [34]:
# TODO: Disable these features in future iterations of the project; they produce redundant features
# The function adds the interaction columns to both splits in place and returns the same frame.
df_wiki_train = calculate_rs_additional_features(df_wiki_train)
df_wiki_test = calculate_rs_additional_features(df_wiki_test)

***

In [35]:
# Replace any remaining missing values with -1, the same placeholder the
# scoring cells use for rows a score does not apply to.
df_wiki_train = df_wiki_train.fillna(value=-1)
df_wiki_test = df_wiki_test.fillna(value=-1)

In [36]:
# Seeded so the preview shows the same rows on every Restart & Run All.
df_wiki_train.sample(4, random_state=seed)

Unnamed: 0,original_text,cleaned_text,label,stats_lrb_count,stats_rrb_count,stats_comma_count,stats_equalsign_count,stats_char_count,stats_image_description,stats_frac_description,stats_file_description,stats_formula_description,stats_language_code,stats_avg_char_per_word,stats_word_count,stats_letter_count,stats_long_word_count,stats_syllable_count,stats_polysyllab_count,stats_monosyllab_count,stats_reading_time,stats_avg_letter_per_word,stats_sentence_length,stats_syllable_avg,stats_mini_word_count,stats_long_numbers_count,stats_lexical_diversity,stats_frac_word_comma,stats_frac_mini_word,stats_frac_long_word,stats_frac_monosyllable,stats_frac_polysyllable,stats_max_chars_word,stats_avg_chars_word,stats_max_syllables_word,stats_avg_syllables_word,stats_language_no,rs_automated_readability_index_all,rs_lix_all,rs_rix_all,rs_automated_readability_index_en,rs_flesch_kincaid_grade_en,rs_coleman_liau_index_en,rs_linsear_write_formula_en,rs_dale_chall_readability_score1_en,rs_dale_chall_readability_score2_en,rs_gunning_fog_en,rs_difficult_words_en,rs_spache_readability_en,rs_text_standard_en,rs_flesch_reading_ease_multi,rs_fernandez_huerta_es,rs_gutierrez_polini_es,rs_crawford_es,rs_szigriszt_pazos_es,rs_wiener_sachtextformel_de,rs_lix_de,rs_gulpease_index_it,rs_ari_all1,rs_ari_all2,rs_ari_en1,rs_ari_en2,rs_fre_multi1,rs_fre_multi2,rs_lix1,rs_lix2,rs_rix1,rs_rix2
223505,"Two fossils were found in the older -LRB- lower -RRB- portion of the Yixian Formation in China , 128.2 million years ago -LRB- Ma -RRB- during the Barremian age .",two fossils were found in the older lower portion of the yixian formation in china 128 2 million years ago ma during the barremian age,0,2,2,1,0,133,0,0,0,0,en,4.4,25,110,5,33,2,19,1.62,4.4,25.0,1.3,10,0,0.88,0.04,0.4,0.2,0.76,0.08,9,4.4,3,1.32,18,11.8,45.0,5.0,11.8,9.5,8.54,14.5,11.82,7.4,13.2,4,5.74,12.0,71.48,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,295.0,389.4,295.0,389.4,1787.0,2358.84,1125.0,1485.0,125.0,165.0
37054,"Abraham Charles '' Abe '' Vigoda -LRB- born February 24 , 1921 -RRB- is an American movie and television actor .",abraham charles abe vigoda born february 24 1921 is an american movie and television actor,1,1,1,1,0,92,0,0,0,0,en,5.07,15,76,5,25,4,9,1.12,5.07,15.0,1.7,4,0,1.0,0.066667,0.266667,0.333333,0.6,0.266667,10,5.066667,3,1.666667,18,9.9,48.3,5.0,9.9,10.3,11.53,11.5,12.8,8.59,11.33,4,5.247333,12.0,47.79,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,148.5,247.5,148.5,247.5,716.85,1194.75,724.5,1207.5,75.0,125.0
36378,"These are mutually exclusive states so if you define a row with both a header and a label\/data pair , the label\/data pair is ignored .",these are mutually exclusive states so if you define a row with both a header and a label data pair the label data pair is ignored,1,0,0,1,0,111,0,0,0,0,en,4.04,26,105,3,34,2,20,1.54,4.04,26.0,1.3,11,0,0.807692,0.038462,0.423077,0.115385,0.769231,0.076923,9,4.038462,3,1.307692,18,10.6,37.5,3.0,10.6,9.9,6.45,15.0,9.18,7.96,13.48,5,6.158846,10.0,70.47,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,275.6,360.4,275.6,360.4,1832.22,2395.98,975.0,1275.0,78.0,102.0
141846,Paradou -LRB- also : Le Paradou -RRB- is a commune in the Bouches-du-Rh Ã ne department in southern France .,paradou also le paradou is a commune in the bouches du rh ne department in southern france,1,1,1,0,0,89,0,0,0,0,en,4.35,17,74,6,21,1,14,1.09,4.35,17.0,1.2,9,0,0.882353,0.0,0.529412,0.352941,0.823529,0.058824,10,4.352941,3,1.235294,18,7.6,52.3,6.0,7.6,5.2,7.65,8.5,12.84,6.34,9.15,2,4.247765,8.0,88.06,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,129.2,159.6,129.2,159.6,1497.02,1849.26,889.1,1098.3,102.0,126.0


In [37]:
# Seeded so the preview shows the same rows on every Restart & Run All.
df_wiki_test.sample(4, random_state=seed)

Unnamed: 0,id,cleaned_text,original_text,label,stats_lrb_count,stats_rrb_count,stats_comma_count,stats_equalsign_count,stats_char_count,stats_image_description,stats_frac_description,stats_file_description,stats_formula_description,stats_language_code,stats_avg_char_per_word,stats_word_count,stats_letter_count,stats_long_word_count,stats_syllable_count,stats_polysyllab_count,stats_monosyllab_count,stats_reading_time,stats_avg_letter_per_word,stats_sentence_length,stats_syllable_avg,stats_mini_word_count,stats_long_numbers_count,stats_lexical_diversity,stats_frac_word_comma,stats_frac_mini_word,stats_frac_long_word,stats_frac_monosyllable,stats_frac_polysyllable,stats_max_chars_word,stats_avg_chars_word,stats_max_syllables_word,stats_avg_syllables_word,stats_language_no,rs_automated_readability_index_all,rs_lix_all,rs_rix_all,rs_automated_readability_index_en,rs_flesch_kincaid_grade_en,rs_coleman_liau_index_en,rs_linsear_write_formula_en,rs_dale_chall_readability_score1_en,rs_dale_chall_readability_score2_en,rs_gunning_fog_en,rs_difficult_words_en,rs_spache_readability_en,rs_text_standard_en,rs_flesch_reading_ease_multi,rs_fernandez_huerta_es,rs_gutierrez_polini_es,rs_crawford_es,rs_szigriszt_pazos_es,rs_wiener_sachtextformel_de,rs_lix_de,rs_gulpease_index_it,rs_ari_all1,rs_ari_all2,rs_ari_en1,rs_ari_en2,rs_fre_multi1,rs_fre_multi2,rs_lix1,rs_lix2,rs_rix1,rs_rix2
27573,27573,even some japanese citizens see geisha in that way because of the lower class geisha who do sell their bodies and work as prostitutes the biggest misconception of geisha is that they have sex with their customers the most exclusive modern geisha do not,Even some Japanese citizens see geisha in that way because of the lower class geisha who do sell their bodies and work as prostitutes . The biggest misconception of geisha is that they have sex with their customers ; the most exclusive modern geisha do not .,-1.0,0,0,0,0,212,0,0,0,0,en,4.75,44,209,8,59,5,35,3.07,4.75,44.0,1.3,16,0,0.795455,0.0,0.363636,0.181818,0.795455,0.113636,13,4.75,4,1.340909,12,22.9,62.2,8.0,22.9,16.9,11.16,27.0,9.77,8.33,22.15,7,8.411182,10.0,52.2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1007.6,1351.1,1007.6,1351.1,2296.8,3079.8,2736.8,3669.8,352.0,472.0
95113,95113,the independence of pakistan in 1947 led to the divisions of the punjab province into two new provinces,The independence of Pakistan in 1947 led to the divisions of the Punjab province into two new provinces .,-1.0,0,0,0,0,87,0,0,0,0,en,4.78,18,86,5,25,2,14,1.26,4.78,18.0,1.4,10,0,0.833333,0.0,0.555556,0.277778,0.777778,0.111111,12,4.777778,4,1.388889,12,10.1,45.8,5.0,10.1,8.0,10.15,11.0,10.67,7.16,11.64,3,4.810333,11.0,70.13,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,181.8,252.5,181.8,252.5,1262.34,1753.25,824.4,1145.0,90.0,125.0
89294,89294,the canon eos electro optical system autofocus 35 mm film and digital slr camera system was introduced in 1987 with the canon eos 650 and is still in production as canon current dslr system,The Canon EOS -LRB- Electro-Optical System -RRB- autofocus 35 mm film and digital SLR camera system was introduced in 1987 with the Canon EOS 650 and is still in production as Canon 's current dSLR system .,-1.0,1,1,0,0,170,0,0,0,0,en,4.59,34,156,7,53,5,20,2.29,4.59,34.0,1.6,13,0,0.764706,0.0,0.382353,0.205882,0.588235,0.147059,10,4.588235,3,1.558824,12,17.2,54.6,7.0,17.2,16.6,9.93,22.0,13.22,9.97,19.48,10,8.162412,17.0,36.97,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,584.8,911.6,584.8,911.6,1256.98,1959.41,1856.4,2893.8,238.0,371.0
41539,41539,i do not have to acknowledge anyone who lives by welfare denies the legitimacy of the very state that provides that welfare refuses to care for the education of his children and constantly produces new little headscarf girls,"I do not have to acknowledge anyone who lives by welfare , denies the legitimacy of the very state that provides that welfare , refuses to care for the education of his children and constantly produces new little headscarf-girls .",-1.0,0,0,2,0,191,0,0,0,0,en,4.92,38,187,11,53,3,27,2.75,4.92,38.0,1.4,16,0,0.842105,0.052632,0.421053,0.289474,0.710526,0.078947,11,4.921053,4,1.394737,12,20.7,67.0,11.0,20.7,15.8,11.85,22.0,10.09,8.85,18.36,8,8.007526,16.0,49.83,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,786.6,1097.1,786.6,1097.1,1893.54,2640.99,2546.0,3551.0,418.0,583.0


In [38]:
# Preview the first rows not detected as English, to check the placeholders
# and the language-specific columns.
df_wiki_train.loc[df_wiki_train['stats_language_code'].ne('en')].head(10)

Unnamed: 0,original_text,cleaned_text,label,stats_lrb_count,stats_rrb_count,stats_comma_count,stats_equalsign_count,stats_char_count,stats_image_description,stats_frac_description,stats_file_description,stats_formula_description,stats_language_code,stats_avg_char_per_word,stats_word_count,stats_letter_count,stats_long_word_count,stats_syllable_count,stats_polysyllab_count,stats_monosyllab_count,stats_reading_time,stats_avg_letter_per_word,stats_sentence_length,stats_syllable_avg,stats_mini_word_count,stats_long_numbers_count,stats_lexical_diversity,stats_frac_word_comma,stats_frac_mini_word,stats_frac_long_word,stats_frac_monosyllable,stats_frac_polysyllable,stats_max_chars_word,stats_avg_chars_word,stats_max_syllables_word,stats_avg_syllables_word,stats_language_no,rs_automated_readability_index_all,rs_lix_all,rs_rix_all,rs_automated_readability_index_en,rs_flesch_kincaid_grade_en,rs_coleman_liau_index_en,rs_linsear_write_formula_en,rs_dale_chall_readability_score1_en,rs_dale_chall_readability_score2_en,rs_gunning_fog_en,rs_difficult_words_en,rs_spache_readability_en,rs_text_standard_en,rs_flesch_reading_ease_multi,rs_fernandez_huerta_es,rs_gutierrez_polini_es,rs_crawford_es,rs_szigriszt_pazos_es,rs_wiener_sachtextformel_de,rs_lix_de,rs_gulpease_index_it,rs_ari_all1,rs_ari_all2,rs_ari_en1,rs_ari_en2,rs_fre_multi1,rs_fre_multi2,rs_lix1,rs_lix2,rs_rix1,rs_rix2
90,Walter Kogler,walter kogler,1,0,0,0,0,12,0,0,0,0,de,6.0,2,12,0,4,0,0,0.18,6.0,2.0,2.0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,6,6.0,2,2.0,16,7.8,2.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,61.0,-1.0,-1.0,-1.0,-1.0,-2.4426,2.0,-1.0,15.6,31.2,-2.0,-4.0,122.0,244.0,4.0,8.0,0.0,0.0
157,ISBN 3-85052-197-4,isbn 3 85052 197 4,1,0,0,0,0,17,0,0,0,0,tt,2.8,5,14,0,5,0,5,0.21,2.8,5.0,1.0,0,1,1.0,0.0,0.0,0.0,1.0,0.0,5,2.8,1,1.0,81,-5.8,5.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-29.0,-29.0,-5.0,-5.0,-5.0,-5.0,25.0,25.0,0.0,0.0
309,Broadcast :,broadcast,1,0,0,0,0,10,0,0,0,0,fr,9.0,1,9,1,2,0,0,0.13,9.0,1.0,2.0,0,0,1.0,0.0,0.0,1.0,0.0,0.0,9,9.0,2,2.0,25,21.5,101.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,58.79,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,21.5,43.0,-1.0,-2.0,58.79,117.58,101.0,202.0,1.0,2.0
336,"eta _ -LCB- th -RCB- = 1 - frac -LCB- r ^ -LCB- 1 - gamma -RCB- -LRB- r_c ^ gamma - 1 -RRB- -RCB- -LCB- gamma -LRB- r_c - 1 -RRB- -RCB- ,",eta th 1 r 1 gamma r c gamma 1 gamma r c 1,1,2,2,1,1,104,0,0,0,0,ja,2.07,14,29,0,14,0,14,0.43,2.07,14.0,1.0,7,0,0.428571,0.071429,0.5,0.0,1.0,0.0,5,2.071429,1,1.0,42,-4.8,14.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-67.2,-67.2,-14.0,-14.0,-14.0,-14.0,196.0,196.0,0.0,0.0
445,-LRB- x + y -RRB- ^ n & = -LCB- n choose 0 -RCB- x ^ n y ^ 0 + -LCB- n choose 1 -RCB- x ^ -LCB- n-1 -RCB- y ^ 1 + -LCB- n choose 2 -RCB- x ^ -LCB- n-2 -RCB- y ^ 2 + -LCB- n choose 3 -RCB- x ^ -LCB- n-3 -RCB- y ^ 3 + cdots,x y n n choose 0 x n y 0 n choose 1 x n 1 y 1 n choose 2 x n 2 y 2 n choose 3 x n 3 y 3 cdots,1,1,1,0,1,158,0,0,0,0,es,1.69,35,59,0,35,0,35,0.87,1.69,35.0,1.0,19,0,0.257143,0.0,0.542857,0.0,1.0,0.0,6,1.685714,1,1.0,20,4.0,35.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,109.4,111.2,66.6,1.0,107.76,-1.0,-1.0,-1.0,140.0,140.0,-35.0,-35.0,3829.0,3829.0,1225.0,1225.0,0.0,0.0
594,Cugy,cugy,1,0,0,0,0,4,0,0,0,0,hr,4.0,1,4,0,1,0,1,0.06,4.0,1.0,1.0,0,0,1.0,0.0,0.0,0.0,1.0,0.0,4,4.0,1,1.0,31,-2.2,1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-2.2,-2.2,-1.0,-1.0,-1.0,-1.0,1.0,1.0,0.0,0.0
607,! '',,1,0,0,0,0,3,0,0,0,0,oc,0.0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,-1.0,0.0,0.0,0.0,0.0,0.0,-1,-1.0,-1,-1.0,61,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0
720,__ NOTOC __,notoc,1,0,0,0,0,9,0,0,0,0,es,5.0,1,5,0,2,0,0,0.07,5.0,1.0,2.0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,5,5.0,2,2.0,20,2.6,1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,85.82,85.8,46.35,-14.2,81.24,-1.0,-1.0,-1.0,2.6,5.2,-1.0,-2.0,85.82,171.64,1.0,2.0,0.0,0.0
737,Henri Legarda,henri legarda,1,0,0,0,0,12,0,0,0,0,fr,6.0,2,12,1,3,0,1,0.18,6.0,2.0,1.5,0,0,1.0,0.0,0.0,0.5,0.5,0.0,7,6.0,2,1.5,25,7.8,52.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,20.97,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,15.6,23.4,-2.0,-3.0,41.94,62.91,104.0,156.0,2.0,3.0
830,__ NOTOC __,notoc,1,0,0,0,0,9,0,0,0,0,es,5.0,1,5,0,2,0,0,0.07,5.0,1.0,2.0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,5,5.0,2,2.0,20,2.6,1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,85.82,85.8,46.35,-14.2,81.24,-1.0,-1.0,-1.0,2.6,5.2,-1.0,-2.0,85.82,171.64,1.0,2.0,0.0,0.0


# 5.0 Export

In [39]:
# Collect every readability-score column (prefix "rs_") of the train frame,
# alphabetically sorted; the same list selects the export columns of both splits.
columns = df_wiki_train.columns.to_list()
feature_columns = sorted(filter(lambda name: name.startswith("rs_"), columns))

print(len(feature_columns))
print(feature_columns)

['rs_ari_all1',
 'rs_ari_all2',
 'rs_ari_en1',
 'rs_ari_en2',
 'rs_automated_readability_index_all',
 'rs_automated_readability_index_en',
 'rs_coleman_liau_index_en',
 'rs_crawford_es',
 'rs_dale_chall_readability_score1_en',
 'rs_dale_chall_readability_score2_en',
 'rs_difficult_words_en',
 'rs_fernandez_huerta_es',
 'rs_flesch_kincaid_grade_en',
 'rs_flesch_reading_ease_multi',
 'rs_fre_multi1',
 'rs_fre_multi2',
 'rs_gulpease_index_it',
 'rs_gunning_fog_en',
 'rs_gutierrez_polini_es',
 'rs_linsear_write_formula_en',
 'rs_lix1',
 'rs_lix2',
 'rs_lix_all',
 'rs_lix_de',
 'rs_rix1',
 'rs_rix2',
 'rs_rix_all',
 'rs_spache_readability_en',
 'rs_szigriszt_pazos_es',
 'rs_text_standard_en',
 'rs_wiener_sachtextformel_de']

In [40]:
# Export rs_ features
# Each split is written twice into the interim data folder: as CSV (without
# the index) and as gzip-compressed parquet.
export_jobs = (
    (df_wiki_train, "train_features_rs.csv", 'train_features_rs.parquet.gzip'),
    (df_wiki_test, "test_features_rs.csv", 'test_features_rs.parquet.gzip'),
)
for source_frame, csv_name, parquet_name in export_jobs:
    df_export = source_frame[feature_columns]
    df_export.to_csv(ROOT_PATH+PATH_DATA_INT + csv_name, index=False)
    df_export.to_parquet(ROOT_PATH+PATH_DATA_INT + parquet_name, compression='gzip')

In [41]:
# Wall-clock runtime since t_start was taken, reported in minutes
t_end = time.time()
total_runtime = t_end - t_start
total_runtime_min = round(total_runtime / 60, 2)
print(f"{total_runtime_min} minutes")

6.38 minutes


In [42]:
# Notify only when the fast-run switch is off
if not FASTRUN:
    send_push(f"Feature Engineering: Calculate RS features finished in: {total_runtime_min} min.")

# 6.0 Watermark

In [43]:
%watermark

Last updated: 2023-02-20T15:35:16.699097+01:00

Python implementation: CPython
Python version       : 3.9.0
IPython version      : 8.9.0

Compiler    : Clang 11.0.0 
OS          : Darwin
Release     : 22.3.0
Machine     : arm64
Processor   : arm
CPU cores   : 10
Architecture: 64bit



In [44]:
%watermark --iversions

textstat: 0.7.2
re      : 2.2.1
seaborn : 0.12.2
pandas  : 1.5.3
numpy   : 1.23.5



In [45]:
# Convert the notebook to a dated HTML report under ROOT_PATH + PATH_DATA_HTML.
# NOTE(review): the notebook file name is hard-coded in both lines, while the
# recorded nbconvert log shows a "..._GC.ipynb" copy being converted. Confirm
# the name matches the file on disk before relying on this export.
output_file = f'{ROOT_PATH}{PATH_DATA_HTML}{today}_01.02_calculate_readability_score_features.html'
!jupyter nbconvert --to html "01.02_calculate_readability_score_features.ipynb" --output {output_file}

[NbConvertApp] Converting notebook 01.02_calculate_readability_score_features_GC.ipynb to html
[NbConvertApp] Writing 745188 bytes to ../reports/html/2023-02-20_01.02_calculate_readability_score_features_GC.html
