v1.1

## Objective

- The objective of this **01.04** notebook is to 
  - calculate the AoA features

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import time
t_start = time.time()

# Setup Environment

## Install Modules

In [3]:
!pip install watermark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting watermark
  Downloading watermark-2.3.1-py2.py3-none-any.whl (7.2 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m59.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi, watermark
Successfully installed jedi-0.18.2 watermark-2.3.1


## Import Modules

In [4]:
# Base libraries
import os
import re
from datetime import date

# Scientific libraries
import numpy as np
import pandas as pd

# Visualization
import seaborn as sns
sns.set(rc={'figure.figsize':(8,4)})
sns.set(font_scale=0.8)

# Helper libraries
from tqdm import tqdm
tqdm.pandas()
from watermark import watermark
import gc # garbage collection to optimize memory usage, use gc.collect()
import warnings
warnings.filterwarnings('ignore')

# Pandas options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Load magic commands
%load_ext watermark

## Define Parameters

In [5]:
today = date.today()

### Helper Functions

In [6]:
import http.client
import urllib


def send_push(message):
    """Send push notifications to pushover service."""
    try:
        conn = http.client.HTTPSConnection("api.pushover.net:443")
        conn.request("POST", "/1/messages.json",
                     urllib.parse.urlencode({
                         "token": "ahs1q4mwpnxe3645zeaqzas69whq7a",  # ML Notifications Channel
                         "user": "u5vr1qkc9ghudg2ehuug153okeiz1d",
                         "message": message,
                     }), {"Content-type": "application/x-www-form-urlencoded"})

        conn.getresponse()

    except:
        print("There was a communication issue (pushover).")


# Load Data

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Paths: Google Drive Setup
try: 
  # Original Google Drive location (owner)
  PATH_DATA = "/content/drive/MyDrive/MADS/SIADS696/Environment/data/"
  PATH_DATA_RAW = "/content/drive/MyDrive/MADS/SIADS696/Environment/data/raw/"
  PATH_DATA_INT = "/content/drive/MyDrive/MADS/SIADS696/Environment/data/interim/"
  PATH_DATA_PRO = "/content/drive/MyDrive/MADS/SIADS696/Environment/data/processed/"
  PATH_REP = "/content/drive/MyDrive/MADS/SIADS696/Environment/reports/"
  PATH_FIGS = "/content/drive/MyDrive/MADS/SIADS696/Environment/reports/figures/"

  df_wiki_train = pd.read_pickle(PATH_DATA_INT+"train_features_clean_stats.pkl")
  df_wiki_test = pd.read_pickle(PATH_DATA_INT+"test_features_clean_stats.pkl")
  df_aoa = pd.read_csv(PATH_DATA_RAW+"AoA_51715_words.csv", encoding = "ISO-8859-1")

except:
  # Location for "shared with" people
  # create a shortcut of the shared folder in your Google Drive root folder
  print("Using shortcut location to load data.")
  PATH_DATA = "/content/drive/MyDrive/SIADS696/Environment/data/"
  PATH_DATA_RAW = "/content/drive/MyDrive/SIADS696/Environment/data/raw/"
  PATH_DATA_INT = "/content/drive/MyDrive/SIADS696/Environment/data/interim/"
  PATH_DATA_PRO = "/content/drive/MyDrive/SIADS696/Environment/data/processed/"
  PATH_REP = "/content/drive/MyDrive/SIADS696/Environment/reports/"
  PATH_FIGS = "/content/drive/MyDrive/SIADS696/Environment/reports/figures/"

  df_wiki_train = pd.read_pickle(PATH_DATA_INT+"train_features_clean_stats.pkl")
  df_wiki_test = pd.read_pickle(PATH_DATA_INT+"test_features_clean_stats.pkl")
  df_aoa = pd.read_csv(PATH_DATA_RAW+"AoA_51715_words.csv", encoding = "ISO-8859-1")

In [9]:
df_wiki_train.shape
df_wiki_test.shape

(119092, 27)

In [10]:
df_aoa.head()
df_aoa.info()
df_aoa.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51715 entries, 0 to 51714
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Word                  51714 non-null  object 
 1   Alternative.spelling  51714 non-null  object 
 2   Freq_pm               51046 non-null  float64
 3   Dom_PoS_SUBTLEX       50963 non-null  object 
 4   Nletters              51715 non-null  int64  
 5   Nphon                 51715 non-null  int64  
 6   Nsyll                 51715 non-null  int64  
 7   Lemma_highest_PoS     51714 non-null  object 
 8   AoA_Kup               31105 non-null  float64
 9   Perc_known            31124 non-null  float64
 10  AoA_Kup_lem           51695 non-null  float64
 11  Perc_known_lem        51715 non-null  float64
 12  AoA_Bird_lem          5421 non-null   float64
 13  AoA_Bristol_lem       8061 non-null   float64
 14  AoA_Cort_lem          8045 non-null   float64
 15  AoA_Schock         

(51715, 16)

# Data Cleaning and Feature Engineering

**Tips on Creating Features**
- Linear models learn sums and differences naturally, but can't learn anything more complex.
- Ratios seem to be difficult for most models to learn. Ratio combinations often lead to some easy performance gains.
- Linear models and neural nets generally do better with normalized features. Neural nets especially need features scaled to values not too far from 0. Tree-based models (like random forests and XGBoost) can sometimes benefit from normalization, but usually much less so.
- Tree models can learn to approximate almost any combination of features, but when a combination is especially important they can still benefit from having it explicitly created, especially when data is limited.
- Counts are especially helpful for tree models, since these models don't have a natural way of aggregating information across many features at once.
[Source](https://www.kaggle.com/code/ryanholbrook/creating-features)

## Calculate AoA Features (aoa)

In [11]:
df_aoa.sample(5).T

Unnamed: 0,30369,17824,47614,25667,32409
Word,nom,flibbertigibbet,trustworthy,lamb,paroled
Alternative.spelling,nom,flibbertigibbet,trustworthy,lamb,paroled
Freq_pm,0.33,0.08,1.84,10.63,0.92
Dom_PoS_SUBTLEX,Noun,Noun,Adjective,Noun,Verb
Nletters,3,15,11,4,7
Nphon,3,13,9,3,5
Nsyll,1,5,3,1,2
Lemma_highest_PoS,nom,flibbertigibbet,trustworthy,lamb,parole
AoA_Kup,12.17,13.25,8.05,4.15,
Perc_known,0.67,0.44,1.0,1.0,


In [12]:
def calculate_aoa_feature(text, column, df_aoa, calculation):
    word_list = text.split()  # create word list

    # subset aoa dataframe to get only the words (and values)
    df_temp = df_aoa[(df_aoa['Word'].isin(word_list) | (df_aoa['Alternative.spelling'].isin(word_list)))]
    
    if calculation == 'sum':
        result = df_temp[column].sum()
        if result <= 0:
            #print(result)       # --> enable to see "dirty" samples
            #print(word_list)    # --> enable to see "dirty" samples
            return -1  # returning -1 instead 0, because 0 means no matched words
        else:
          try:
            return result.round(2)
          except:
            print("Failed to return SUM", word_list)
            return -1
            
    elif calculation == 'mean':
        result = df_temp[column].mean() # cannot convert to integer here, because of NAN returns: for no matched words
        if np.isnan(result):
            #print(result)       # --> enable to see "dirty" samples
            #print(word_list)    # --> enable to see "dirty" samples
            return -1
        else:
          try:
            return result.round(2)
          except:
            print("Failed to return MEAN", word_list)
            return -1

    elif calculation == 'max':
        result = df_temp[column].max() # cannot convert to integer here, because of NAN returns: for no matched words
        if np.isnan(result):
            #print(result)       # --> enable to see "dirty" samples
            #print(word_list)    # --> enable to see "dirty" samples
            return -1
        else:
          try:
            return result.round(2)
          except:
            print("Failed to return MAX", word_list)
            return -1


    elif calculation == 'min':
        result = df_temp[column].min() # cannot convert to integer here, because of NAN returns: for no matched words
        if np.isnan(result):
            #print(result)       # --> enable to see "dirty" samples
            #print(word_list)    # --> enable to see "dirty" samples
            return -1
        else:
          try:
            return result.round(2)
          except:
            print("Failed to return MIN", word_list)
            return -1

In [13]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [14]:
df_aoa['Dom_PoS_SUBTLEX'] = le.fit_transform(df_aoa['Dom_PoS_SUBTLEX'])

In [15]:
df_aoa.sample(4).T

Unnamed: 0,17042,6228,42079,12127
Word,faxes,cadences,smelling,democrat
Alternative.spelling,faxes,cadences,smelling,democrat
Freq_pm,0.53,0.02,4.96,1.69
Dom_PoS_SUBTLEX,56,56,62,56
Nletters,5,8,8,8
Nphon,6,8,6,8
Nsyll,2,3,2,3
Lemma_highest_PoS,fax,cadence,smell,democrat
AoA_Kup,,,,11.21
Perc_known,,,,1.0


In [16]:
aoa_feature_list = [
    'Freq_pm',
    'Dom_PoS_SUBTLEX',
    'AoA_Kup',
    'Perc_known',
    'AoA_Kup_lem',
    'Perc_known_lem',
    'AoA_Bird_lem',
    'AoA_Bristol_lem',
    'AoA_Cort_lem',
    'AoA_Schock'
]

In [17]:
calculation_list = ['sum', 'mean', 'max', 'min']

Removed first for-loop for "aoa_feature_list" to have more control over debugging and execution progress.

In [None]:
%%time
# Colab Pro:

aoa_feature = 'Freq_pm'
print("Calcualting stats for", aoa_feature)
for calc in calculation_list:
    column_name = ("aoa_"+aoa_feature+"_"+calc).lower()
    print(column_name)
    
    print("for df_wiki_train")        
    df_wiki_train[column_name] = df_wiki_train['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))

    display(df_wiki_train[column_name].sample(2))

    print("for df_wiki_test")
    df_wiki_test[column_name] = df_wiki_test['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))
    
    display(df_wiki_test[column_name].sample(2))   

Calcualting stats for Freq_pm
aoa_freq_pm_sum
for df_wiki_train


100%|██████████| 416768/416768 [46:36<00:00, 149.03it/s]


404254    57722.98
373459    75833.65
Name: aoa_freq_pm_sum, dtype: float64

for df_wiki_test


100%|██████████| 119092/119092 [13:01<00:00, 152.33it/s]


5337     78892.27
31246    34166.20
Name: aoa_freq_pm_sum, dtype: float64

aoa_freq_pm_mean
for df_wiki_train


100%|██████████| 416768/416768 [46:20<00:00, 149.91it/s]


187092    6983.83
41770     3834.28
Name: aoa_freq_pm_mean, dtype: float64

for df_wiki_test


100%|██████████| 119092/119092 [13:33<00:00, 146.43it/s]


92479    4869.24
54381    5420.65
Name: aoa_freq_pm_mean, dtype: float64

aoa_freq_pm_max
for df_wiki_train


100%|██████████| 416768/416768 [46:02<00:00, 150.84it/s]


385500    29449.18
101862    20415.27
Name: aoa_freq_pm_max, dtype: float64

for df_wiki_test


100%|██████████| 119092/119092 [13:26<00:00, 147.63it/s]


33362    29449.18
29305    20415.27
Name: aoa_freq_pm_max, dtype: float64

aoa_freq_pm_min
for df_wiki_train


 65%|██████▌   | 272438/416768 [31:10<15:24, 156.16it/s]

In [None]:
# Export AoA features
column_name = ("aoa_"+aoa_feature+"_").lower()
columns = df_wiki_train.columns.to_list()
feature_columns = sorted([x for x in columns if x.startswith(column_name)])
print(feature_columns)

df_export = df_wiki_train[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"train_features_{column_name}.csv", index=False)

df_export = df_wiki_test[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"test_features_{column_name}.csv", index=False)

In [None]:
%%time
# Colab Pro:

aoa_feature = 'Dom_PoS_SUBTLEX'
print("Calcualting stats for", aoa_feature)
for calc in calculation_list:
    column_name = ("aoa_"+aoa_feature+"_"+calc).lower()
    print(column_name)
    
    print("for df_wiki_train")        
    df_wiki_train[column_name] = df_wiki_train['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))

    display(df_wiki_train[column_name].sample(2))

    print("for df_wiki_test")
    df_wiki_test[column_name] = df_wiki_test['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))
    
    display(df_wiki_test[column_name].sample(2))   

In [None]:
# Export AoA features
column_name = ("aoa_"+aoa_feature+"_").lower()
columns = df_wiki_train.columns.to_list()
feature_columns = sorted([x for x in columns if x.startswith(column_name)])
print(feature_columns)

df_export = df_wiki_train[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"train_features_{column_name}.csv", index=False)

df_export = df_wiki_test[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"test_features_{column_name}.csv", index=False)

In [None]:
%%time
# Colab Pro:

aoa_feature = 'AoA_Kup'
print("Calcualting stats for", aoa_feature)
for calc in calculation_list:
    column_name = ("aoa_"+aoa_feature+"_"+calc).lower()
    print(column_name)
    
    print("for df_wiki_train")        
    df_wiki_train[column_name] = df_wiki_train['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))

    display(df_wiki_train[column_name].sample(2))

    print("for df_wiki_test")
    df_wiki_test[column_name] = df_wiki_test['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))
    
    display(df_wiki_test[column_name].sample(2))   

Calcualting stats for AoA_Kup
aoa_aoa_kup_sum
for df_wiki_train


100%|██████████| 416768/416768 [47:30<00:00, 146.22it/s]


190780    57.02
52903     54.66
Name: aoa_aoa_kup_sum, dtype: float64

for df_wiki_test


100%|██████████| 119092/119092 [13:24<00:00, 148.10it/s]


79816    135.98
80441     42.57
Name: aoa_aoa_kup_sum, dtype: float64

aoa_aoa_kup_mean
for df_wiki_train


100%|██████████| 416768/416768 [47:51<00:00, 145.16it/s]


224814    5.91
386185    6.08
Name: aoa_aoa_kup_mean, dtype: float64

for df_wiki_test


100%|██████████| 119092/119092 [14:09<00:00, 140.26it/s]


116062    8.03
19727     5.91
Name: aoa_aoa_kup_mean, dtype: float64

aoa_aoa_kup_max
for df_wiki_train


100%|██████████| 416768/416768 [46:40<00:00, 148.84it/s]


107685    10.85
143134    13.68
Name: aoa_aoa_kup_max, dtype: float64

for df_wiki_test


 79%|███████▉  | 94489/119092 [10:39<03:09, 129.53it/s]

In [None]:
# Export AoA features
column_name = ("aoa_"+aoa_feature+"_").lower()
columns = df_wiki_train.columns.to_list()
feature_columns = sorted([x for x in columns if x.startswith(column_name)])
print(feature_columns)

df_export = df_wiki_train[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"train_features_{column_name}.csv", index=False)

df_export = df_wiki_test[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"test_features_{column_name}.csv", index=False)

In [None]:
%%time
# Colab Pro:

aoa_feature = 'Perc_known'
print("Calcualting stats for", aoa_feature)
for calc in calculation_list:
    column_name = ("aoa_"+aoa_feature+"_"+calc).lower()
    print(column_name)
    
    print("for df_wiki_train")        
    df_wiki_train[column_name] = df_wiki_train['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))

    display(df_wiki_train[column_name].sample(2))

    print("for df_wiki_test")
    df_wiki_test[column_name] = df_wiki_test['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))
    
    display(df_wiki_test[column_name].sample(2))   

In [None]:
# Export AoA features
column_name = ("aoa_"+aoa_feature+"_").lower()
columns = df_wiki_train.columns.to_list()
feature_columns = sorted([x for x in columns if x.startswith(column_name)])
print(feature_columns)

df_export = df_wiki_train[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"train_features_{column_name}.csv", index=False)

df_export = df_wiki_test[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"test_features_{column_name}.csv", index=False)

In [None]:
%%time
# Colab Pro:

aoa_feature = 'AoA_Kup_lem'
print("Calcualting stats for", aoa_feature)
for calc in calculation_list:
    column_name = ("aoa_"+aoa_feature+"_"+calc).lower()
    print(column_name)
    
    print("for df_wiki_train")        
    df_wiki_train[column_name] = df_wiki_train['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))

    display(df_wiki_train[column_name].sample(2))

    print("for df_wiki_test")
    df_wiki_test[column_name] = df_wiki_test['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))
    
    display(df_wiki_test[column_name].sample(2))   

In [None]:
# Export AoA features
column_name = ("aoa_"+aoa_feature+"_").lower()
columns = df_wiki_train.columns.to_list()
feature_columns = sorted([x for x in columns if x.startswith(column_name)])
print(feature_columns)

df_export = df_wiki_train[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"train_features_{column_name}.csv", index=False)

df_export = df_wiki_test[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"test_features_{column_name}.csv", index=False)

In [None]:
%%time
# Colab Pro:

aoa_feature = 'Perc_known_lem'
print("Calcualting stats for", aoa_feature)
for calc in calculation_list:
    column_name = ("aoa_"+aoa_feature+"_"+calc).lower()
    print(column_name)
    
    print("for df_wiki_train")        
    df_wiki_train[column_name] = df_wiki_train['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))

    display(df_wiki_train[column_name].sample(2))

    print("for df_wiki_test")
    df_wiki_test[column_name] = df_wiki_test['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))
    
    display(df_wiki_test[column_name].sample(2))   

In [None]:
# Export AoA features
column_name = ("aoa_"+aoa_feature+"_").lower()
columns = df_wiki_train.columns.to_list()
feature_columns = sorted([x for x in columns if x.startswith(column_name)])
print(feature_columns)

df_export = df_wiki_train[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"train_features_{column_name}.csv", index=False)

df_export = df_wiki_test[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"test_features_{column_name}.csv", index=False)

In [None]:
%%time
# Colab Pro:

aoa_feature = 'AoA_Bird_lem'
print("Calcualting stats for", aoa_feature)
for calc in calculation_list:
    column_name = ("aoa_"+aoa_feature+"_"+calc).lower()
    print(column_name)
    
    print("for df_wiki_train")        
    df_wiki_train[column_name] = df_wiki_train['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))

    display(df_wiki_train[column_name].sample(2))

    print("for df_wiki_test")
    df_wiki_test[column_name] = df_wiki_test['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))
    
    display(df_wiki_test[column_name].sample(2))   

In [None]:
# Export AoA features
column_name = ("aoa_"+aoa_feature+"_").lower()
columns = df_wiki_train.columns.to_list()
feature_columns = sorted([x for x in columns if x.startswith(column_name)])
print(feature_columns)

df_export = df_wiki_train[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"train_features_{column_name}.csv", index=False)

df_export = df_wiki_test[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"test_features_{column_name}.csv", index=False)

In [None]:
%%time
# Colab Pro:

aoa_feature = 'AoA_Bristol_lem'
print("Calcualting stats for", aoa_feature)
for calc in calculation_list:
    column_name = ("aoa_"+aoa_feature+"_"+calc).lower()
    print(column_name)
    
    print("for df_wiki_train")        
    df_wiki_train[column_name] = df_wiki_train['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))

    display(df_wiki_train[column_name].sample(2))

    print("for df_wiki_test")
    df_wiki_test[column_name] = df_wiki_test['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))
    
    display(df_wiki_test[column_name].sample(2)) 

In [None]:
# Export AoA features
column_name = ("aoa_"+aoa_feature+"_").lower()
columns = df_wiki_train.columns.to_list()
feature_columns = sorted([x for x in columns if x.startswith(column_name)])
print(feature_columns)

df_export = df_wiki_train[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"train_features_{column_name}.csv", index=False)

df_export = df_wiki_test[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"test_features_{column_name}.csv", index=False)

In [None]:
%%time
# Colab Pro:

aoa_feature = 'AoA_Cort_lem'
print("Calcualting stats for", aoa_feature)
for calc in calculation_list:
    column_name = ("aoa_"+aoa_feature+"_"+calc).lower()
    print(column_name)
    
    print("for df_wiki_train")        
    df_wiki_train[column_name] = df_wiki_train['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))

    display(df_wiki_train[column_name].sample(2))

    print("for df_wiki_test")
    df_wiki_test[column_name] = df_wiki_test['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))
    
    display(df_wiki_test[column_name].sample(2)) 

In [None]:
# Export AoA features
column_name = ("aoa_"+aoa_feature+"_").lower()
columns = df_wiki_train.columns.to_list()
feature_columns = sorted([x for x in columns if x.startswith(column_name)])
print(feature_columns)

df_export = df_wiki_train[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"train_features_{column_name}.csv", index=False)

df_export = df_wiki_test[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"test_features_{column_name}.csv", index=False)

In [None]:
%%time
# Colab Pro:

aoa_feature = 'AoA_Schock'
print("Calcualting stats for", aoa_feature)
for calc in calculation_list:
    column_name = ("aoa_"+aoa_feature+"_"+calc).lower()
    print(column_name)
    
    print("for df_wiki_train")        
    df_wiki_train[column_name] = df_wiki_train['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))

    display(df_wiki_train[column_name].sample(2))

    print("for df_wiki_test")
    df_wiki_test[column_name] = df_wiki_test['cleaned_text'].progress_apply(
        lambda x: calculate_aoa_feature(x, aoa_feature, df_aoa, calc))
    
    display(df_wiki_test[column_name].sample(2)) 

In [None]:
# Export AoA features
column_name = ("aoa_"+aoa_feature+"_").lower()
columns = df_wiki_train.columns.to_list()
feature_columns = sorted([x for x in columns if x.startswith(column_name)])
print(feature_columns)

df_export = df_wiki_train[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"train_features_{column_name}.csv", index=False)

df_export = df_wiki_test[feature_columns]
df_export.to_csv(PATH_DATA_INT+f"test_features_{column_name}.csv", index=False)

In [None]:
df_wiki_train.sample(1).T

In [None]:
df_wiki_train.sample(1).T

# Export

In [None]:
columns = df_wiki_train.columns.to_list()
feature_columns = sorted([x for x in columns if x.startswith("aoa_")])

print(len(feature_columns))
print(feature_columns)

In [None]:
# Export AoA features
df_export = df_wiki_train[feature_columns]
df_export.to_csv(PATH_DATA_INT+"train_features_aoa.csv", index=False)

df_export = df_wiki_test[feature_columns]
df_export.to_csv(PATH_DATA_INT+"test_features_aoa.csv", index=False)

# Watermark

In [None]:
%watermark

In [None]:
%watermark --iversions

In [None]:
t_end = time.time()
total_runtime = t_end-t_start
total_runtime_min = round((total_runtime/60),2)
print(str(total_runtime_min)+" minutes")

In [None]:
send_push(f"Calculate AoA features finished in: {total_runtime_min}")