In [1]:
import os

# Move to the project root, identified as the closest directory (starting from
# the current one and walking upwards) that contains a README.md.
start_dir = os.getcwd()
while "README.md" not in os.listdir():
    current_dir = os.getcwd()
    parent_dir = os.path.dirname(current_dir)
    if parent_dir == current_dir:
        # Reached the filesystem root: chdir("..") would be a no-op here and
        # the search would spin forever, so restore the cwd and fail loudly.
        os.chdir(start_dir)
        raise FileNotFoundError(
            f"No README.md found in {start_dir!r} or any of its parent directories"
        )
    os.chdir(parent_dir)

# then move to feature extraction (so that the relative "../data/..." paths
# and the local module imports used below resolve from this directory)
os.chdir("feature_extraction")

print(os.getcwd())

/mnt/antares_raid/home/bramantyos/codes/timescales_filtering/feature_extraction


In [16]:
from utils import load_story_info
from hard_coded_things import featuresets_dict, train_stories, test_stories, train_stories_zh, test_stories_zh

import numpy as np

from tqdm.notebook import trange
from typing import Optional
from scipy.signal import periodogram

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from config import config_plotting

from features import Features, get_contextual_embeddings
from utils import load_generic_trfiles, load_grids_for_stories

In [4]:
# Apply the project's plotting configuration (helper imported from config.py).
# NOTE(review): presumably forwards context/palette to seaborn — confirm in config.py.
config_plotting(context="paper", palette="muted")

In [41]:
# Directories of the English / Chinese grid files, passed to
# load_grids_for_stories below. Paths are relative to the working directory
# set in the first cell (feature_extraction/).
en_grid_dir = "../data/bling/grids_w_punctuation/en"
zh_grid_dir = "../data/bling/grids_w_punctuation/zh"

# Directories of the English / Chinese TR files, passed to load_generic_trfiles below.
en_trfile_dir = "../data/bling/COL/trfiles/en/" 
zh_trfile_dir = "../data/bling/COL/trfiles/zh/"

In [47]:
def remove_punctuation(text: str):
    """Return ``text`` with every occurrence of . , ! ? ; : and 。 deleted.

    Only these seven marks are stripped; any other character (including other
    full-width CJK punctuation) is left untouched.
    """
    cleaned = text
    for mark in (".", ",", "!", "?", ";", ":", "。"):
        cleaned = cleaned.replace(mark, "")
    return cleaned

In [48]:
# Collect every token that occurs in the training stories, separately for the
# English and the Chinese version of each story.
en_words = []
zh_words = []

for story in train_stories:
    print('Processing story:', story)
    # Load one story at a time so only a single story's grids / TR files are
    # held per iteration.
    en_grid_train = load_grids_for_stories([story], en_grid_dir)
    zh_grid_train = load_grids_for_stories([story], zh_grid_dir)

    en_trfile_train = load_generic_trfiles([story], en_trfile_dir)
    zh_trfile_train = load_generic_trfiles([story], zh_trfile_dir)

    en_features = Features(en_grid_train, en_trfile_train)
    zh_features = Features(zh_grid_train, zh_trfile_train)

    # .data of the per-story word sequence is the iterable of tokens
    en_words.extend(en_features.wordseqs_with_sentence_boundaries[story].data)
    zh_words.extend(zh_features.wordseqs_with_sentence_boundaries[story].data)

# Strip punctuation and de-duplicate. Tokens that consisted only of
# punctuation become "" after stripping and are dropped, since an empty string
# is not a word. The result is sorted so the vocabulary order is the same on
# every run (the order of list(set(...)) changes between kernel sessions
# because of string hash randomization).
en_words = sorted({remove_punctuation(word) for word in en_words} - {""})
zh_words = sorted({remove_punctuation(word) for word in zh_words} - {""})

Processing story: alternateithicatom
load_grids_for_stories ['alternateithicatom'] ../data/bling/grids_w_punctuation/en
load_grids_for_stories ['alternateithicatom'] ../data/bling/grids_w_punctuation/zh
Processing story: avatar
load_grids_for_stories ['avatar'] ../data/bling/grids_w_punctuation/en
load_grids_for_stories ['avatar'] ../data/bling/grids_w_punctuation/zh
Processing story: howtodraw
load_grids_for_stories ['howtodraw'] ../data/bling/grids_w_punctuation/en
load_grids_for_stories ['howtodraw'] ../data/bling/grids_w_punctuation/zh
Processing story: legacy
load_grids_for_stories ['legacy'] ../data/bling/grids_w_punctuation/en
load_grids_for_stories ['legacy'] ../data/bling/grids_w_punctuation/zh
Processing story: life
load_grids_for_stories ['life'] ../data/bling/grids_w_punctuation/en
load_grids_for_stories ['life'] ../data/bling/grids_w_punctuation/zh
Processing story: myfirstdaywiththeyankees
load_grids_for_stories ['myfirstdaywiththeyankees'] ../data/bling/grids_w_punctuati

In [49]:
# Save both vocabularies to disk, one word per line.
# The encoding is pinned to UTF-8: without it open() uses the platform's
# locale encoding, which can raise UnicodeEncodeError on the Chinese words
# (or silently produce a file other machines cannot read).
with open("en_words.txt", "w", encoding="utf-8") as f:
    f.writelines(word + "\n" for word in en_words)

with open("zh_words.txt", "w", encoding="utf-8") as f:
    f.writelines(word + "\n" for word in zh_words)

In [9]:
# The original cell used `en_grid` / `en_trfile`, which no cell in this
# notebook defines, so it raised NameError on a fresh kernel. Load them here
# for the only story the cells below index ("wheretheressmoke").
# NOTE(review): assumes this story is available in the bling English
# directories and that no other story is needed downstream — confirm.
en_grid = load_grids_for_stories(["wheretheressmoke"], en_grid_dir)
en_trfile = load_generic_trfiles(["wheretheressmoke"], en_trfile_dir)
en_features = Features(en_grid, en_trfile)

In [12]:
# Per-story word sequences; indexed by story name in the cells below.
word_seqs = en_features.wordseqs_with_sentence_boundaries

In [23]:
# Contextual embeddings for the "wheretheressmoke" word sequence. No
# model_name is passed; the recorded output of this cell reports
# bert-base-uncased as the model that was used.
en_ds = get_contextual_embeddings(
    word_seqs["wheretheressmoke"],
    layer_num=8,
    split_type="causal_all",
    max_seq_length=10,
    use_special_tokens=True,
)
# Lanczos-interpolated result with the first 10 and last 5 entries sliced off.
# NOTE(review): presumably trims TRs at the start/end of the story — confirm
# that 10 / 5 match the trimming applied to the precomputed features.
interp = en_ds.chunksums(interp="lanczos")[10:-5]

English text
Extracting embeddings from bert-base-uncased using causal_all split type. 2007 input sequences.
Doing lanczos interpolation with cutoff=0.499 and 3 lobes.


In [24]:
# Same extraction as for `en_ds`, but with the multilingual BERT checkpoint.
# NOTE(review): use_special_tokens is False here while the bert-base-uncased
# call above passes True — confirm the difference is intentional before
# comparing the two sets of embeddings.
en_mbert_ds = get_contextual_embeddings(
    word_seqs["wheretheressmoke"],
    layer_num=8,
    split_type="causal_all",
    max_seq_length=10,
    model_name = "bert-base-multilingual-uncased",
    use_special_tokens=False,
)

# Lanczos-interpolated result with the first 10 and last 5 entries sliced off
# (same slice as applied to `interp`).
mbert_interp = en_mbert_ds.chunksums(interp="lanczos")[10:-5]

English text
Extracting embeddings from bert-base-multilingual-uncased using causal_all split type. 2007 input sequences.
Doing lanczos interpolation with cutoff=0.499 and 3 lobes.


In [14]:
# Precomputed BERT / multilingual-BERT feature archives for the moth_reading
# and bling_reading datasets.
# NOTE(review): these are absolute, machine-specific paths; the cell only
# runs where /mnt/raid/bling is mounted.
# NOTE(review): allow_pickle=True unpickles object arrays, which can execute
# arbitrary code — only load archives from a trusted source.
# NOTE(review): mBERT_moth / mBERT_bling are not used in the cells shown here.
BERT_moth = np.load('/mnt/raid/bling/data/features/moth_reading/BertFeat_moth_reading_CHE_BERT_l0-to-l12_c10_noavg_nospetok.npz', allow_pickle=True)
BERT_bling = np.load('/mnt/raid/bling/data/features/bling_reading/BertFeat_bling_reading_en_COL_BERT_l0-to-l12_c10_noavg_nospetok.npz', allow_pickle=True)
mBERT_moth = np.load('/mnt/raid/bling/data/features/moth_reading/BertMultiFeat_moth_reading_CHE_multilingualBERT_l0-to-l12_c10_noavg_nospetok.npz', allow_pickle=True)
mBERT_bling = np.load('/mnt/raid/bling/data/features/bling_reading/BertMultiFeat_bling_reading_en_COL_multilingualBERT_l0-to-l12_c10_noavg_nospetok.npz', allow_pickle=True)

In [33]:
# From each archive take the 'test_features' entry, unwrap the stored object
# with .tolist(), look up the 'BERT_8' key and keep its first element.
# NOTE(review): presumably 'BERT_8' is layer 8 and [0] is the first (only?)
# test story — confirm against the script that wrote these archives.
bert_moth_8 = BERT_moth['test_features'].tolist()['BERT_8'][0]
bert_bling_8 = BERT_bling['test_features'].tolist()['BERT_8'][0]

In [27]:
# cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
# Display the moth features to compare them by eye with bert_bling_8.
bert_moth_8

array([[ 0.95540366, -0.93575674, -1.0521592 , ..., -1.49934071,
        -1.07200984, -0.79828725],
       [ 0.45890243, -0.37319425, -0.58002195, ..., -0.50412602,
        -1.22558735, -1.32222446],
       [-0.45752433, -0.78526061, -0.53213749, ...,  0.18404357,
        -1.27370402,  0.30574224],
       ...,
       [ 0.62204686,  0.12377637,  0.02341008, ..., -0.97267267,
        -0.45797148,  0.13009126],
       [-1.01311025,  1.43927007,  0.44532056, ...,  0.11970278,
        -0.82238755, -0.39655882],
       [-0.05446936, -0.75081039, -0.9589614 , ...,  1.19362228,
        -1.31228829, -0.33641014]])

In [34]:
# Display the bling features to compare them by eye with bert_moth_8.
bert_bling_8

array([[ 0.95540432, -0.93575695, -1.05215813, ..., -1.49933939,
        -1.07201049, -0.79828698],
       [ 0.4589009 , -0.37319598, -0.58002011, ..., -0.50412585,
        -1.2255862 , -1.3222242 ],
       [-0.45752407, -0.78525995, -0.53213589, ...,  0.18404178,
        -1.27370358,  0.30573982],
       ...,
       [ 0.62204784,  0.12377702,  0.0234062 , ..., -0.97266922,
        -0.45797104,  0.13009211],
       [-1.01310825,  1.43926923,  0.44531999, ...,  0.11970295,
        -0.82238739, -0.39655904],
       [-0.05447093, -0.75081062, -0.95896271, ...,  1.19362332,
        -1.31228836, -0.33640659]])

In [36]:
# Pairwise cosine similarity: cossim[i, j] compares row i of bert_moth_8 with
# row j of bert_bling_8, so the result has one row per moth sample and one
# column per bling sample. Matching rows of the two arrays are compared on
# the diagonal.
cossim = cosine_similarity(bert_moth_8, bert_bling_8,)

import seaborn as sns

In [18]:
# Check the dimensions of the moth feature matrix (recorded output: (291, 768)).
bert_moth_8.shape

(291, 768)