In [2]:
import os, sys, gc, re, time
import numpy as np, pandas as pd

# Enable IPython's autoreload extension in mode 2, so that edited modules
# are re-imported automatically before each cell execution.
%reload_ext autoreload
%autoreload 2

# Make the directory two levels above the working directory importable
# (presumably where the `my_NLP_RNN_fr_lib` package imported below lives).
# The path is assembled with os.path.join/os.pardir rather than the literal
# "..\..": the backslash only acts as a path separator on Windows, and "\."
# is an invalid string escape sequence.
project_root = os.path.realpath(os.path.join(os.pardir, os.pardir))
if project_root not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(project_root)
from my_NLP_RNN_fr_lib.tweet_utils import load_review_fr, load_tweet_fr, \
                                    clean_tweet_fr, TWEET_MAX_CHAR_COUNT
from my_NLP_RNN_fr_lib.tokenizer_utils import tokenize_tweet_fr
from my_NLP_RNN_fr_lib.fastText import read_fastText_vecs

<div style="text-align: justify">
In this short notebook, we load the French fastText dictionary that we use throughout our NLP model use case. What we specifically do here is "shrink" the vocabulary so that it includes only the tokens that actually appear in our training dataset. By doing so, we lower the memory footprint of our RNN embedding layer.
</div>
<br />
<div style="text-align: justify">
BEWARE&nbsp;: We only use the "shrunk" version of our vocabulary during the model optimization stage. If we were to use that version of the French vocabulary in production, it would totally annihilate the "generalizing" power of our NLP model. In such circumstances, the model would be unable to generalize to words it had not seen at training time.
</div>


# REVIEWS

Let's start by identifying all the word tokens appearing in our transfer-learning "French Reviews" dataset&nbsp;:

In [3]:
# Load the "French Reviews" dataset through the project helper
# (the helper reports its own loading time).
reviews = load_review_fr()

Loaded the dataset in 0.8286 seconds


In [4]:
# Preprocessing: run the project's text-cleaning helper on the 'comment' column
# (the same helper is applied to the tweets further down).
reviews = clean_tweet_fr(reviews, col_name='comment')

In [5]:
# Tokenize the cleaned 'comment' column. The result is consumed below as a
# pandas object holding one token sequence per review (see the length filter).
tokenized_reviews = tokenize_tweet_fr(reviews, col_name='comment')

In [6]:
# Drop (in place) every review whose number of tokens exceeds
# TWEET_MAX_CHAR_COUNT, then renumber the remaining rows from 0.
# NOTE(review): the threshold is named as a *character* count while the
# comparison is made on *token* counts -- confirm this loose bound is intended.
review_token_counts = tokenized_reviews.map(len)
overlong_review_idx = tokenized_reviews[review_token_counts > TWEET_MAX_CHAR_COUNT].index
tokenized_reviews.drop(overlong_review_idx, inplace=True)
tokenized_reviews.reset_index(drop=True, inplace=True)

In [7]:
# Report how many reviews remain, with a thousands separator.
print(f"{len(tokenized_reviews):,} reviews.")

173,014 reviews.


In [8]:
# Flatten the per-review token lists into one long Series of individual tokens.
# Series.explode() emits one row per list element without first materialising
# the wide, NaN-padded DataFrame that apply(pd.Series) builds (one Series object
# per review), which is far slower and heavier in memory.
# dropna() mirrors what stack() did: rows stemming from empty token lists (or
# missing tokens) are discarded, so the token count is unchanged.
# The result carries the originating row label as a flat index rather than a
# (row, position) MultiIndex; the merge step below resets the index anyway.
reviews_tokens = tokenized_reviews.explode().dropna()

Our "French Reviews" training dataset encapsulates {{"{:,}".format(len(reviews_tokens))}} (non-unique) tokens.

In [9]:
# Release the intermediate review objects and trigger a garbage-collection pass.
# A NameError (e.g. when this cell is re-run) is deliberately ignored.
try:
    del reviews, tokenized_reviews
    dummy = gc.collect()  # assigned so the collected-object count is not echoed
except NameError:
    pass


# TWEETS

Let's now do the same with our target "French Tweets" dataset by identifying all the word tokens appearing in it&nbsp;:

In [10]:
# Load the "French Tweets" dataset through the project helper
# (the helper reports its own loading time).
tweets = load_tweet_fr()

Loaded the dataset in 2.8951 seconds


In [11]:
# Preprocessing: run the project's text-cleaning helper on the tweets,
# relying on the helper's default column name.
tweets = clean_tweet_fr(tweets)

In [12]:
# Tokenize the cleaned tweets. The result is consumed below as a pandas
# object holding one token sequence per tweet (see the length filter).
tokenized_tweets = tokenize_tweet_fr(tweets)
# (`tweets` is released in the garbage-collection cell further down)

In [13]:
# Drop (in place) every tweet whose number of tokens exceeds
# TWEET_MAX_CHAR_COUNT, then renumber the remaining rows from 0.
# NOTE(review): the threshold is named as a *character* count while the
# comparison is made on *token* counts -- confirm this loose bound is intended.
tweet_token_counts = tokenized_tweets.map(len)
overlong_tweet_idx = tokenized_tweets[tweet_token_counts > TWEET_MAX_CHAR_COUNT].index
tokenized_tweets.drop(overlong_tweet_idx, inplace=True)
tokenized_tweets.reset_index(drop=True, inplace=True)

In [14]:
# Flatten the per-tweet token lists into one long Series of individual tokens.
# Series.explode() emits one row per list element without first materialising
# the wide, NaN-padded DataFrame that apply(pd.Series) builds (one Series object
# per tweet), which is far slower and heavier in memory.
# dropna() mirrors what stack() did: rows stemming from empty token lists (or
# missing tokens) are discarded, so the token count is unchanged.
# The result carries the originating row label as a flat index rather than a
# (row, position) MultiIndex; the merge step below resets the index anyway.
tweets_tokens = tokenized_tweets.explode().dropna()

Our "French Tweets" training dataset encapsulates {{"{:,}".format(len(tweets_tokens))}} (non-unique) tokens.

In [15]:
# Release the intermediate tweet objects and trigger a garbage-collection pass.
# Only NameError is ignored (e.g. when this cell is re-run); a bare `except`
# would also swallow unrelated failures and KeyboardInterrupt.
try:
    del tweets, tokenized_tweets
    silent = gc.collect()  # assigned so the collected-object count is not echoed
except NameError:
    pass

# MERGED

The next step consists in merging the two subsets and extracting a list of unique tokens&nbsp;:

In [16]:
# Stack the review tokens and the tweet tokens end to end,
# coercing every token to a string beforehand.
tokens = pd.concat(
    [token_series.astype("str") for token_series in (reviews_tokens, tweets_tokens)]
)

In [17]:
# Keep the first occurrence of each token and renumber the survivors from 0.
# (drop_duplicates() compares values only, so no index reset is needed first.)
tokens = (
    tokens
    .drop_duplicates()
    .reset_index(drop=True)
)

Our entire training dataset encapsulates <b><u>{{"{:,}".format(len(tokens))}}</u></b> unique tokens.

In [18]:
# Release the two per-dataset token Series and trigger a garbage-collection pass.
# Only NameError is ignored (e.g. when this cell is re-run); a bare `except`
# would also swallow unrelated failures and KeyboardInterrupt.
try:
    del reviews_tokens, tweets_tokens
    gc.collect()
except NameError:
    pass

In [19]:
# Wrap the Series of unique tokens in a single-column DataFrame
# for further manipulation down the road.
# The column is named at construction time: this also copes with an empty
# token Series, where assigning `.columns` afterwards raises a length mismatch.
tokens_df = pd.DataFrame([*tokens], columns=['tokens'])

In [20]:
# Show the first 16 unique training tokens, laid out horizontally.
tokens_df.head(16).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
tokens,Une,cuisine,japonaise,sans,originalités,particulières,mais,la,meilleure,du,genre,.,Pour,de,tels,prix



# VOCAB

The next logical step consists in loading our original full-size vocabulary&nbsp;:

In [21]:
# Load the full-size French fastText vocabulary from <two levels up>/data/fastText_french.
# The parent path is assembled with os.path.join/os.pardir rather than the
# literal "..\..": the backslash only acts as a path separator on Windows, and
# "\." is an invalid string escape sequence.
full_vec_path = os.path.join(
    os.path.realpath(os.path.join(os.pardir, os.pardir)),
    'data', 'fastText_french', 'cc.fr.300.vec')
word_to_index, index_to_word, word_to_vec_map = read_fastText_vecs(full_vec_path)

Loaded the Vocabulary in 177.9774 seconds [1,234,042 word embeddings of 300 features]


In [22]:
# Put the vocabulary words (the keys of the `word_to_index` dict) in a
# one-column DataFrame so they can be matched against the training tokens.
vocab_df = pd.DataFrame(list(word_to_index))
vocab_df.columns = ['tokens']

In [23]:
# Show the first 16 vocabulary entries, laid out horizontally.
vocab_df.head(16).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
tokens,!,"""",#,$,%,&,',(,),*,+,",",-,.,...,/


# FILTERING

<div style="text-align: justify">
Let us join the set of unique training tokens with our vocabulary, to retain only the intersection. Notice below how some of the tokens of our training dataset are absent from the original full-size French vocabulary. Most of these are indeed misspelled words. For such tokens, we add an "<em>&lt;UKN&gt;</em>" ("unknown") token to the vocabulary when it is used in conjunction with an NLP model&nbsp;:
</div>

In [44]:
# NOTE(review): these imports sit mid-notebook; consider moving them to the
# import cell at the top so all dependencies are visible in one place.
from my_NLP_RNN_fr_lib.tweet_utils import html_escape
from my_NLP_RNN_fr_lib.display_helper import dataframe_pretty_print_center

# Sample (at most 16) of the tokens present in our datasets BUT absent from
# the vocabulary, passed through html_escape before being rendered.
in_vocab = tokens_df.tokens.isin(vocab_df.tokens)
oov_sample = tokens_df.tokens[~in_vocab].values[0:16]
result_df = pd.DataFrame({'tokens': [html_escape(token) for token in oov_sample]})

dataframe_pretty_print_center(result_df)

tokens
bâteux
alése
reviderai
thecniquement
impretioner
girardet
passedat
disctétion
fellini
Marqueting


# EMBEDDING VECTORS

Finally, we can now retrieve the filtered word vector coordinates in the space of {{len(word_to_vec_map['bonjour'])}} features&nbsp;:

In [35]:
# Build the reduced embedding map: one entry per training token that is listed
# in the vocabulary frame and has a vector in `word_to_vec_map`.
shared_tokens = tokens_df.tokens[tokens_df.tokens.isin(vocab_df.tokens)]
word_to_vec_map_light = {
    token: word_to_vec_map[token]
    for token in shared_tokens
    if token in word_to_vec_map
}

<div style="text-align: justify">
    There are <b><u>{{"{:,}".format(len(word_to_vec_map_light))}}</u></b> entries in the light version of the dictionary that we just created. Notice again how this represents fewer entries than the number of unique tokens in our training dataset.
</div>

With that, it is at last possible to record a local "light" version of the original french vocabulary&nbsp;:

In [36]:
# Save the filtered embeddings as a "light" .vec text file: one line per token,
# formatted as "<token> <v1> <v2> ... <vn>".
# NOTE(review): official fastText .vec files start with a "<count> <dim>" header
# line and none is written here -- confirm read_fastText_vecs copes with that.
tic = time.perf_counter()
# The parent path is assembled with os.path.join/os.pardir rather than the
# literal "..\..", which only resolves as "two levels up" on Windows.
light_vec_path = os.path.join(
    os.path.realpath(os.path.join(os.pardir, os.pardir)),
    'data', 'fastText_french', 'cc.fr.300.light.vec')
with open(light_vec_path, 'w', encoding="utf-8") as f:
    # The file object buffers its writes, so lines are streamed directly
    # instead of being batched by hand in an intermediate list.
    f.writelines(
        str(token) + " " + ' '.join(str(x) for x in vector) + '\n'
        for token, vector in word_to_vec_map_light.items())
toc = time.perf_counter()
print(f"Saved the LIGHT Vocabulary locally in {toc - tic:0.4f} seconds")

Saved the LIGHT Vocabulary locally in 22.6517 seconds


<hr style="height: 4px; width: 70%; margin:0 auto;" />

The so-created vocab can then be loaded the standard way as follows&nbsp;:

<center><div style="background-color: #f0f0f0; width: 600px; text-align: left;"><code>word_to_index_light, index_to_word_light, word_to_vec_map_light = \
    read_fastText_vecs( os.path.join(
        os.path.realpath(os.path.join("..", ".."))
    , 'data', 'fastText_french', 'cc.fr.300<b>.light</b>.vec') )
</code></div></center>