In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import time
t_start = time.time()

# 1.0 Documentation

## History

| Date | Version | Author | Comments |
|:-----|:-------:|:-------|:---------|
|2023-01-27 | 1.0 | Andre Buser | - Initial version |


## Objective

- The objective of this **01.05** notebook is to
  - calculate the CRB (Brysbaert et al. concreteness ratings) features

# 2.0 Setup Environment

## Install Modules

In [3]:
!pip install watermark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting watermark
  Downloading watermark-2.3.1-py2.py3-none-any.whl (7.2 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi, watermark
Successfully installed jedi-0.18.2 watermark-2.3.1


## Import Modules

In [4]:
# Base libraries
import os
import re

# Scientific libraries
import numpy as np
import pandas as pd

# Visualization
import seaborn as sns
sns.set(rc={'figure.figsize':(8,4)})
sns.set(font_scale=0.8)

# Helper libraries
from tqdm import tqdm
tqdm.pandas()
from watermark import watermark
import gc # garbage collection to optimize memory usage, use gc.collect()
import warnings
warnings.filterwarnings('ignore')

# Pandas options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Load magic commands
%load_ext watermark

## Define Parameters

In [5]:
# None

### Helper Functions

In [6]:
import http.client, urllib

def send_push(message):
    """Send a push notification through the Pushover HTTPS API.

    Args:
        message: Text of the notification.

    Returns:
        None. A warning is printed when the API answers with a non-200 status.

    Raises:
        OSError, http.client.HTTPException: network/protocol failures are
            propagated to the caller (unchanged from the previous behaviour).
    """
    # Function-scope imports: `os` for the credential override and
    # `urllib.parse` explicitly (a bare `import urllib` does not guarantee
    # that the `parse` submodule is loaded).
    import os
    import urllib.parse

    # SECURITY NOTE(review): these credentials are hardcoded in a shared
    # notebook. They can be overridden via the PUSHOVER_TOKEN / PUSHOVER_USER
    # environment variables; the literals are kept only as a fallback so that
    # existing runs keep working. They should be rotated and removed.
    token = os.environ.get("PUSHOVER_TOKEN", "ahs1q4mwpnxe3645zeaqzas69whq7a")  # ML Notifications Channel
    user = os.environ.get("PUSHOVER_USER", "u5vr1qkc9ghudg2ehuug153okeiz1d")

    payload = urllib.parse.urlencode({
        "token": token,
        "user": user,
        "message": message,
    })
    headers = {"Content-type": "application/x-www-form-urlencoded"}

    conn = http.client.HTTPSConnection("api.pushover.net:443")
    try:
        conn.request("POST", "/1/messages.json", payload, headers)
        response = conn.getresponse()
        response.read()  # drain the body so the connection can be closed cleanly
        if response.status != 200:
            # Previously a rejected request went unnoticed.
            print(f"Pushover notification failed with HTTP status {response.status}")
    finally:
        # The original never closed the connection (socket leak).
        conn.close()

# 3.0 Load Data

In [7]:
# Mounts Google Drive at /content/drive. The same import and mount call already
# appear in the notebook's first cell; the recorded output of this cell reports
# that the drive was already mounted.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Paths: Google Drive Setup
# Candidate project roots, tried in order:
#   1. Original Google Drive location (owner)
#   2. Location for "shared with" people: create a shortcut of the shared
#      folder in your Google Drive root folder
PROJECT_ROOT_CANDIDATES = (
    "/content/drive/MyDrive/MADS/SIADS696/Environment/",
    "/content/drive/MyDrive/SIADS696/Environment/",
)

# Pick the first root that exists instead of relying on a bare `except:`,
# which would also hide genuine load errors (corrupt pickle, missing file, ...).
PROJECT_ROOT = next(
    (root for root in PROJECT_ROOT_CANDIDATES if os.path.isdir(root)), None)
if PROJECT_ROOT is None:
    raise FileNotFoundError(
        "Project folder not found in Google Drive; tried: "
        + ", ".join(PROJECT_ROOT_CANDIDATES))
if PROJECT_ROOT != PROJECT_ROOT_CANDIDATES[0]:
    print("Using shortcut location to load data.")

PATH_DATA = PROJECT_ROOT + "data/"
PATH_DATA_RAW = PATH_DATA + "raw/"
PATH_DATA_INT = PATH_DATA + "interim/"
PATH_DATA_PRO = PATH_DATA + "processed/"
PATH_REP = PROJECT_ROOT + "reports/"
PATH_FIGS = PATH_REP + "figures/"

# NOTE(review): read_pickle executes pickled code; only load files produced by
# this project's own notebooks.
df_wiki_train = pd.read_pickle(PATH_DATA_INT+"train_features_clean_stats.pkl")
df_wiki_test = pd.read_pickle(PATH_DATA_INT+"test_features_clean_stats.pkl")
# Tab-separated ratings file
df_crb = pd.read_csv(PATH_DATA_RAW+"Concreteness_ratings_Brysbaert_et_al_BRM.txt", sep='\t')

In [9]:
df_crb.shape

(39954, 9)

In [10]:
df_wiki_train.shape

(416768, 26)

In [11]:
df_wiki_test.shape

(119092, 27)

# 4.0 Data Cleaning and Feature Engineering

**Tips on Creating Features**
- Linear models learn sums and differences naturally, but can't learn anything more complex.
- Ratios seem to be difficult for most models to learn. Ratio combinations often lead to some easy performance gains.
- Linear models and neural nets generally do better with normalized features. Neural nets especially need features scaled to values not too far from 0. Tree-based models (like random forests and XGBoost) can sometimes benefit from normalization, but usually much less so.
- Tree models can learn to approximate almost any combination of features, but when a combination is especially important they can still benefit from having it explicitly created, especially when data is limited.
- Counts are especially helpful for tree models, since these models don't have a natural way of aggregating information across many features at once.

Source: [Kaggle – Creating Features](https://www.kaggle.com/code/ryanholbrook/creating-features)

### Calculate CRB Features (crb_)

In [12]:
df_crb['Word'] = df_crb['Word'].str.lower().str.strip()

In [13]:
df_crb.sample(5)

Unnamed: 0,Word,Bigram,Conc.M,Conc.SD,Unknown,Total,Percent_known,SUBTLEX,Dom_Pos
10505,condiment,0,4.72,0.59,0,29,1.0,8,Noun
5527,dissociative,0,1.71,1.0,3,27,0.89,29,Adjective
31383,bonelike,0,3.77,1.14,1,27,0.96,0,
26879,misplace,0,2.52,1.19,0,27,1.0,23,Verb
19367,contamination,0,2.7,1.38,0,27,1.0,60,Noun


In [14]:
def calculate_crb_feature(text, column, df_crb, calculation):
    """Aggregate one CRB rating column over the words of a text.

    The text is split on whitespace and the rows of ``df_crb`` whose ``Word``
    value occurs in the text are selected. Selection is by membership, so a
    word that is repeated in the text contributes only once per matching row.

    Args:
        text: Whitespace-separated text. Non-string values (e.g. NaN) are
            treated as "no matched words".
        column: Name of the numeric ``df_crb`` column to aggregate.
        df_crb: DataFrame with a ``Word`` column and the rating columns.
        calculation: ``'sum'`` or ``'mean'``.

    Returns:
        The aggregated value rounded to 2 decimals, or ``-1`` as sentinel when
        no word matched (for ``'sum'`` also whenever the total is <= 0).

    Raises:
        ValueError: if ``calculation`` is neither ``'sum'`` nor ``'mean'``
            (previously this silently returned ``None``).
    """
    if calculation not in ('sum', 'mean'):
        raise ValueError(
            f"calculation must be 'sum' or 'mean', got {calculation!r}")

    # Missing / non-string texts cannot match anything; avoid AttributeError
    # on .split() and report the same sentinel as "no matched words".
    if not isinstance(text, str):
        return -1

    word_list = text.split()  # create word list

    # Values of the requested column for all rating rows whose word is in the text
    matched_values = df_crb.loc[df_crb['Word'].isin(word_list), column]

    if calculation == 'sum':
        result = matched_values.sum()
        # An empty selection sums to 0; report -1 so that "no match" is
        # distinguishable from a real value.
        if result <= 0:
            return -1
    else:
        # mean() of an empty selection is NaN, so it cannot be cast to int here
        result = matched_values.mean()
        if np.isnan(result):
            return -1

    # np.round works for numpy and plain Python numbers and keeps the dtype
    # (int stays int, float stays float), so no try/except is needed.
    return np.round(result, 2)

In [15]:
%%time
# MBPM1: 11 min

df_wiki_train['crb_concm_sum'] = df_wiki_train['cleaned_text'].progress_apply(
    lambda x: calculate_crb_feature(x, 'Conc.M', df_crb, 'sum'))

df_wiki_test['crb_concm_sum'] = df_wiki_test['cleaned_text'].progress_apply(
    lambda x: calculate_crb_feature(x, 'Conc.M', df_crb, 'sum'))

100%|██████████| 416768/416768 [22:13<00:00, 312.57it/s]
100%|██████████| 119092/119092 [06:22<00:00, 310.96it/s]

CPU times: user 28min 10s, sys: 18.3 s, total: 28min 28s
Wall time: 28min 36s





In [None]:
%%time
# MBPM1: 11 min

df_wiki_train['crb_concm_mean'] = df_wiki_train['cleaned_text'].progress_apply(
    lambda x: calculate_crb_feature(x, 'Conc.M', df_crb, 'mean'))

df_wiki_test['crb_concm_mean'] = df_wiki_test['cleaned_text'].progress_apply(
    lambda x: calculate_crb_feature(x, 'Conc.M', df_crb, 'mean'))

100%|██████████| 416768/416768 [22:14<00:00, 312.37it/s]
100%|██████████| 119092/119092 [05:59<00:00, 331.62it/s]

CPU times: user 27min 56s, sys: 16.8 s, total: 28min 13s
Wall time: 28min 13s





In [None]:
%%time
# MBPM1: 11 min

df_wiki_train['crb_concsd_sum'] = df_wiki_train['cleaned_text'].progress_apply(
    lambda x: calculate_crb_feature(x, 'Conc.SD', df_crb, 'sum'))

df_wiki_test['crb_concsd_sum'] = df_wiki_test['cleaned_text'].progress_apply(
    lambda x: calculate_crb_feature(x, 'Conc.SD', df_crb, 'sum'))

100%|██████████| 416768/416768 [21:18<00:00, 326.08it/s]
100%|██████████| 119092/119092 [05:53<00:00, 336.76it/s]

CPU times: user 27min 3s, sys: 15.9 s, total: 27min 19s
Wall time: 27min 11s





In [None]:
%%time
# MBPM1: 11 min

df_wiki_train['crb_concsd_mean'] = df_wiki_train['cleaned_text'].progress_apply(
    lambda x: calculate_crb_feature(x, 'Conc.SD', df_crb, 'mean'))

df_wiki_test['crb_concsd_mean'] = df_wiki_test['cleaned_text'].progress_apply(
    lambda x: calculate_crb_feature(x, 'Conc.SD', df_crb, 'mean'))

100%|██████████| 416768/416768 [21:33<00:00, 322.08it/s]
100%|██████████| 119092/119092 [06:18<00:00, 314.41it/s]

CPU times: user 27min 41s, sys: 16.8 s, total: 27min 58s
Wall time: 27min 52s





In [None]:
%%time
# MBPM1: 11 min

df_wiki_train['crb_perc_known_sum'] = df_wiki_train['cleaned_text'].progress_apply(
    lambda x: calculate_crb_feature(x, 'Percent_known', df_crb, 'sum'))

df_wiki_test['crb_perc_known_sum'] = df_wiki_test['cleaned_text'].progress_apply(
    lambda x: calculate_crb_feature(x, 'Percent_known', df_crb, 'sum'))

100%|██████████| 416768/416768 [23:47<00:00, 291.87it/s]
100%|██████████| 119092/119092 [05:58<00:00, 332.58it/s]

CPU times: user 29min 16s, sys: 18.7 s, total: 29min 35s
Wall time: 29min 46s





In [None]:
%%time
# MBPM1: 11 min

df_wiki_train['crb_perc_known_mean'] = df_wiki_train['cleaned_text'].progress_apply(
    lambda x: calculate_crb_feature(x, 'Percent_known', df_crb, 'mean'))

df_wiki_test['crb_perc_known_mean'] = df_wiki_test['cleaned_text'].progress_apply(
    lambda x: calculate_crb_feature(x, 'Percent_known', df_crb, 'mean'))

100%|██████████| 416768/416768 [20:50<00:00, 333.40it/s]
100%|██████████| 119092/119092 [06:05<00:00, 325.98it/s]

CPU times: user 26min 47s, sys: 15.7 s, total: 27min 3s
Wall time: 26min 55s





In [None]:
%%time
# MBPM1: 11 min

df_wiki_train['crb_subtlex_sum'] = df_wiki_train['cleaned_text'].progress_apply(
    lambda x: calculate_crb_feature(x, 'SUBTLEX', df_crb, 'sum'))

df_wiki_test['crb_subtlex_sum'] = df_wiki_test['cleaned_text'].progress_apply(
    lambda x: calculate_crb_feature(x, 'SUBTLEX', df_crb, 'sum'))

100%|██████████| 416768/416768 [21:37<00:00, 321.26it/s]
100%|██████████| 119092/119092 [05:59<00:00, 330.81it/s]

CPU times: user 27min 28s, sys: 16 s, total: 27min 44s
Wall time: 27min 37s





In [None]:
%%time
# MBPM1: 11 min

df_wiki_train['crb_subtlex_mean'] = df_wiki_train['cleaned_text'].progress_apply(
    lambda x: calculate_crb_feature(x, 'SUBTLEX', df_crb, 'mean'))

df_wiki_test['crb_subtlex_mean'] = df_wiki_test['cleaned_text'].progress_apply(
    lambda x: calculate_crb_feature(x, 'SUBTLEX', df_crb, 'mean'))

In [None]:
df_wiki_train.sample(4).T

In [None]:
df_wiki_test.sample(4).T

# 5.0 Export

In [None]:
# Collect the engineered CRB columns (prefix "crb_") in alphabetical order
columns = list(df_wiki_train.columns)
feature_columns = [name for name in columns if name.startswith("crb_")]
feature_columns.sort()

feature_columns

In [None]:
# Export the CRB feature columns of train and test as CSV files (without index)
# into the interim data folder
for _df_wiki, _file_name in ((df_wiki_train, "train_features_crb.csv"),
                             (df_wiki_test, "test_features_crb.csv")):
    df_export = _df_wiki[feature_columns]
    df_export.to_csv(PATH_DATA_INT+_file_name, index=False)

# 6.0 Watermark

In [None]:
%watermark

In [None]:
%watermark --iversions

In [None]:
# Total notebook runtime, measured from t_start (set in the first timing cell)
t_end = time.time()
total_runtime = t_end - t_start
total_runtime_min = round(total_runtime / 60, 2)
print(f"{total_runtime_min} minutes")

In [None]:
send_push(f"01.05 Feature Engingeering: Calculate CRB features finished in: {total_runtime_min} min.")