### <span style='color:blue'>**Mandatory Prepare Step**</span>: Setup Notebook
The following code cell must be executed once for each user session. The step loads utility Python code stored in separate files, and imports the external libraries that the notebook depends on.


In [30]:
# Setup
%load_ext autoreload
%autoreload 2

import sys
import os

if '..' not in sys.path: sys.path.insert(1, '..')

from common.treaty_state import load_wti_index

import ipywidgets as widgets
import common.widgets_config as widgets_config
import common.config as config
import common.utility as utility
import common.treaty_utility as treaty_utility
import treaty_corpus
import nltk

# Logger used by the cells of this notebook.
logger = utility.getLogger('corpus_text_analysis')

# Both the WTI index and the zipped treaty texts are read from this folder.
source_folder = '../data'
source_path = os.path.join(source_folder, 'treaty_text_corpora_20181018.zip')

# Load the treaty index; `treaties` starts out as an alias of the full index.
wti_index = load_wti_index(data_folder=source_folder)
treaties = wti_index

# Reader over the archive entries matching '*.txt', wrapped in a TreatyCorpus.
corpus_stream = treaty_corpus.CompressedFileReader(source_path, pattern='*.txt')
corpus = treaty_corpus.TreatyCorpus(corpus_stream)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


2018-10-21 11:23:28,768 : INFO : Data loaded!


In [33]:
# Look up the period-group definition registered under 'years_1945-1972'.
period_group = config.PERIOD_GROUPS_ID_MAP['years_1945-1972']

# Select the treaties within that division using the 'is_cultural' treaty
# filter; parties=None presumably means no restriction on parties (confirm
# against wti_index.get_treaties_within_division).
treaties = wti_index.get_treaties_within_division(
    parties=None,
    recode_is_cultural=False,
    treaty_filter='is_cultural',
    period_group=period_group,
)

In [204]:
# Open a new reader over the zipped treaty texts (same archive as the setup cell).
# `treaties` is intentionally NOT rebound here: resetting it to the full
# `wti_index` would discard a filtered selection made in an earlier cell when
# the notebook is executed top to bottom.
source_folder = '../data'
source_path = os.path.join(source_folder, 'treaty_text_corpora_20181018.zip')

corpus_stream = treaty_corpus.CompressedFileReader(source_path, pattern='*.txt')


2018-10-19 15:28:57,085 : INFO : Initializing dictionary
2018-10-19 15:28:57,103 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-10-19 15:29:04,148 : INFO : built Dictionary(20604 unique tokens: ['nucleus', 'sokpulsa', 'ltalo', 'intervenus', 'kattner']...) from 721 documents (total 934111 corpus positions)


### Task: Basic Corpus Statistics
See https://www.nltk.org/book/ch01.html

* Size of treaties over time
* Unique words, unique words per word class
* Lexical diversity
* Frequency distribution
* Average word length, sentence length


```python
 	
>>> len(texts) / count(docs)
0.06230453042623537
>>>

>>> len(set(text3)) / len(text3)
0.06230453042623537
>>>

>>> def lexical_diversity(text):
...     return len(set(text)) / len(text)
...
>>> def percentage(count, total):
...     return 100 * count / total

# Most common words
fdist1 = FreqDist(text1)
fdist1.most_common(50)

# Word length frequencies
>>> fdist = FreqDist(len(w) for w in text1)
>>> print(fdist)

```


https://colab.research.google.com/github/mdda/deep-learning-workshop/blob/master/notebooks/5-RNN/3-Text-Corpus-and-Embeddings.ipynb#scrollTo=jKrHrM7yKGwB

## GloVe Word Embeddings
Using the python package :  https://github.com/maciejkula/glove-python , and code samples from : http://developers.lyst.com/2014/11/11/word-embeddings-for-fashion/

### Create the Co-occurrence Matrix
For speed, this looks at the first 100,000 tokens in the corpus - and should create the co-occurrences in 30 seconds or so.

### <span style='color: red'>WORK IN PROGRESS</span> Task: Treaty Keyword Extraction (using TF-IDF weighting)
- [ML Wiki.org](http://mlwiki.org/index.php/TF-IDF)
- [Wikipedia](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)
- Spärck Jones, K. (1972). "A Statistical Interpretation of Term Specificity and Its Application in Retrieval".
- Manning, C.D.; Raghavan, P.; Schutze, H. (2008). "Scoring, term weighting, and the vector space model". ([PDF](http://nlp.stanford.edu/IR-book/pdf/06vect.pdf))
- https://markroxor.github.io/blog/tfidf-pivoted_norm/
$\frac{\text{tf-idf}}{\sqrt{\mathrm{rowSums}(\text{tf-idf}^2)}}$
- https://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html

Neural Network Methods in Natural Language Processing, Yoav Goldberg:
![image.png](attachment:image.png)

In [18]:
# Code
# (A bare `%timeit` line used to follow the import; with no statement to time
# it does nothing, so it has been removed.)
from scipy.sparse import csr_matrix

    
def get_top_tfidf_words(data, n_top=5):
    """Return the `n_top` highest-scoring rows of `data` for each treaty.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least the columns 'treaty_id' and 'score'.
    n_top : int, default 5
        Maximum number of rows kept per 'treaty_id'.

    Returns
    -------
    pandas.DataFrame
        The selected rows with every column of `data` (including
        'treaty_id') and their original index labels, ordered by
        'treaty_id' and, within a treaty, by descending 'score'.
    """
    # Sort + GroupBy.head replaces groupby().apply(nlargest): apply() on a
    # frame that still contains the grouping column is deprecated since
    # pandas 2.2, and once the grouping column is excluded the result would
    # silently lose 'treaty_id'. Multi-column sort_values is stable, so ties
    # keep their original relative order just like nlargest(keep='first').
    ranked = data.sort_values(['treaty_id', 'score'], ascending=[True, False])
    return ranked.groupby('treaty_id').head(n_top)

def compute_tfidf_scores(corpus, dictionary, smartirs='ntc'):
    """Compute normalized TF-IDF weights for every document of `corpus`.

    Parameters
    ----------
    corpus : iterable of bag-of-words documents
        Each document is a sequence of (token_id, count) pairs. The corpus is
        iterated here after being handed to the model, so it must support
        repeated iteration.
    dictionary : gensim dictionary or None
        Passed on to gensim's TfidfModel.
    smartirs : str, default 'ntc'
        Accepted for interface compatibility only; it is currently NOT
        forwarded to the model.

    Returns
    -------
    scipy.sparse.csr_matrix
        Sparse matrix whose row index is the position of the document in
        `corpus` and whose column index is the token id. No explicit shape is
        given, so scipy infers it from the largest row/column index present.
    """
    model = gensim.models.tfidfmodel.TfidfModel(corpus, dictionary=dictionary, normalize=True)

    rows, cols, scores = [], [], []
    for doc_index, document in enumerate(corpus):
        vector = model[document]
        # An empty document (or one whose weights were all dropped) yields an
        # empty vector; `zip(*[])` cannot be unpacked into two names, so skip.
        if len(vector) == 0:
            continue
        token_ids, weights = zip(*vector)
        rows.extend([doc_index] * len(token_ids))
        cols.extend(token_ids)
        scores.extend(weights)

    return csr_matrix((scores, (rows, cols)))
    
# Cache of computed TF-IDF score frames, keyed by language code.
# The "only create if not already in globals()" guard is disabled, so the
# cache is re-created empty every time this cell runs.
tfidf_cache = dict()
    
def display_tfidf_scores(source_folder, language, period, n_top=5, threshold=0.001):
    """Display the top TF-IDF tokens per treaty, optionally averaged per period.

    Parameters
    ----------
    source_folder : str
        Folder handed to TreatyCorpusSaveLoad when the scores for a language
        are not cached yet.
    language : sequence
        Language descriptor such as ('en', 'english'); only the first element
        is used (as corpus language and as cache key).
    period : str or None
        Name of a column in `state.treaties`. When given, the per-treaty top
        tokens are joined with that column and the mean score per
        (period, token) is displayed; when None the per-treaty top tokens are
        displayed as they are.
    n_top : int, default 5
        Number of highest-scoring tokens kept per treaty.
    threshold : float, default 0.001
        When greater than zero, rows with a score below it are dropped first.

    Side effects
    ------------
    Updates `tfw.progress`, stores computed scores in `tfidf_cache`, and
    renders the result with `display`. Any exception is logged through
    `logger.error` and not re-raised, so the widget keeps working.
    """
    lang_code = language[0]
    try:
        tfw.progress.value = 1

        if lang_code not in tfidf_cache:
            corpus = TreatyCorpusSaveLoad(source_folder=source_folder, lang=lang_code)\
                .load_mm_corpus(normalize_by_D=True)
            document_names = corpus.document_names
            dictionary = corpus.dictionary
            # gensim fills Dictionary.id2token lazily; a single lookup forces
            # the mapping to exist before it is read below.
            _ = dictionary[0]

            tfw.progress.value += 1
            A = compute_tfidf_scores(corpus, dictionary)

            tfw.progress.value += 1
            # One pass over the COO triplets instead of an A[i, j] lookup per
            # entry. The `!= 0` mask reproduces what A.nonzero() selects.
            coo = A.tocoo()
            stored = coo.data != 0
            scores = pd.DataFrame(
                [
                    (i, j, dictionary.id2token[j], value)
                    for i, j, value in zip(coo.row[stored], coo.col[stored], coo.data[stored])
                ],
                columns=['document_id', 'token_id', 'token', 'score']
            )

            tfw.progress.value += 1
            # document_names is expected to be indexed by document position and
            # to provide 'treaty_id' and 'document_name' columns.
            scores = scores.merge(document_names, how='inner', left_on='document_id', right_index=True)\
                .drop(['document_id', 'token_id', 'document_name'], axis=1)

            scores = scores[['treaty_id', 'token', 'score']]\
                .sort_values(['treaty_id', 'score'], ascending=[True, False])

            tfidf_cache[lang_code] = scores

        scores = tfidf_cache[lang_code]
        if threshold > 0:
            scores = scores.loc[scores.score >= threshold]

        tfw.progress.value += 1

        # Top n_top rows per treaty. Sort + GroupBy.head keeps the 'treaty_id'
        # column on every pandas version, unlike groupby().apply(nlargest),
        # whose handling of the grouping column is deprecated since pandas 2.2.
        scores = scores\
            .sort_values(['treaty_id', 'score'], ascending=[True, False])\
            .groupby('treaty_id').head(n_top)\
            .set_index('treaty_id')

        if period is not None:
            periods = state.treaties[period]
            # Series.mean() keeps the column name 'score'; the former
            # agg([np.mean]) produced a column called 'mean', which the
            # subsequent rename of column 0 never matched.
            scores = scores.merge(periods.to_frame(), left_index=True, right_index=True, how='inner')\
                .groupby([period, 'token'])['score'].mean()\
                .reset_index()

        display(scores)
    except Exception as ex:
        logger.error(ex)
    finally:
        # Always put the progress bar back to idle, whether or not we failed.
        tfw.progress.value = 0

#if 'tfidf_scores' not in globals():
#    tfidf_scores = compute_document_tfidf(corpus, corpus.dictionary, state.treaties)
#    tfidf_scores = tfidf_scores.sort_values(['treaty_id', 'score'], ascending=[True, False])

# Widgets driving display_tfidf_scores.
# NOTE(review): `remove_stopwords` and `output` are created here but are not
# passed to display_tfidf_scores below, so they currently have no effect.
tfw = BaseWidgetUtility(
    language=widgets.Dropdown(
        options={
            'English': ('en', 'english'),
            'French': ('fr', 'french'),
            'German': ('de', 'german'),
            'Italian': ('it', 'italian')
        },
        value=('en', 'english'),
        description='Language:', **drop_style
    ),
    remove_stopwords=widgets.ToggleButton(
        description='Remove stopwords', value=True,
        tooltip='Do not include stopwords in token toplist', **toggle_style
    ),
    n_top=widgets.IntSlider(
        value=5, min=1, max=25, step=1,
        description='Top #:',
        continuous_update=False
    ),
    # The step matches the three-decimal readout and the 0.001 default; with
    # the former step of 0.01 the default could not be selected again once
    # the slider had been moved.
    threshold=widgets.FloatSlider(
        value=0.001, min=0.0, max=0.5, step=0.001,
        description='Threshold:',
        tooltip='Word having a TF-IDF score below this value is filtered out',
        continuous_update=False,
        readout_format='.3f',
    ),
    period=widgets.Dropdown(
        options={
            '': None,
            'Year': 'signed_year',
            'Default division': 'signed_period',
            'Alt. division': 'signed_period_alt'
        },
        value='signed_period',
        description='Period:', **drop_style
    ),
    output=widgets.Dropdown(
        options={
            '': None,
            'Year': 'signed_year',
            'Default division': 'signed_period',
            'Alt. division': 'signed_period_alt'
        },
        value='signed_period',
        description='Output:', **drop_style
    ),
    progress=widgets.IntProgress(min=0, max=5, step=1, value=0)
)

# `fixed` passes a constant argument through; a bare string would make
# `interactive` create a (never displayed) Text widget for it instead.
# NOTE(review): other cells of this notebook read from '../data' - confirm
# that './data' is the intended folder for the stored corpus.
itfw = widgets.interactive(
    display_tfidf_scores,
    source_folder=widgets.fixed('./data'),
    language=tfw.language,
    n_top=tfw.n_top,
    threshold=tfw.threshold,
    period=tfw.period
)

boxes = widgets.HBox(
    [
        widgets.VBox([tfw.language, tfw.period]),
        widgets.VBox([tfw.n_top, tfw.threshold]),
        widgets.VBox([tfw.progress, tfw.output])
    ]
)

# The last child of an `interactive` container is its Output widget; only
# that is shown, since the controls are laid out manually in `boxes`.
display(widgets.VBox([boxes, itfw.children[-1]]))
itfw.update()
