# Vector Semantics Assignment

In [1]:
# Alexandria Benedict, Vector Semantics Assignment

### Imports

In [137]:
from __future__ import division
import itertools
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import re
import scipy.sparse
import time


from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE as tsne

## Import Dataset

In [180]:
#Take in data
df1 = pd.read_csv('CONcreTEXT_trial_EN.tsv', sep='\t', header=None, engine= 'python')
df2 = pd.read_csv('CONcreTEXT_trial_IT.tsv', sep='\t', header=None, engine= 'python')

#Get list of sentences
df1.head()
corpus = list(df1[3][1:])
itSent = list(df2[3][1:])


print('Sentences:', len(corpus))

for sent in corpus[:5]:
    print(sent)
    
#print(corpus)


Sentences: 100
Bring up academic achievements , awards , and other milestones in your life . 
Please list people you have helped , your personal achievements , or troublesome times you have overcome . 
Add activated carbon straight to your vodka . 
Place sensors around your garden , and when a cat comes in , the motion activates a sensor . 
Look for a partner that shares your level of adventure in pursuing new types of experiences . 


### Corpus utilities



In [173]:
#@title Utilities
import re
import time
import itertools
import numpy as np

# For pretty-printing
import pandas as pd
from IPython.display import display, HTML

UNK_TOKEN   = u"<unk>"

def flatten(list_of_lists):
    """Flatten a list-of-lists into a single list."""
    return list(itertools.chain.from_iterable(list_of_lists))

def pretty_print_matrix(M, rows=None, cols=None, dtype=float, float_fmt="{0:.04f}"):
    """Pretty-print a matrix using Pandas.

    Args:
      M : 2D numpy array
      rows : list of row labels
      cols : list of column labels
      dtype : data type (float or int)
      float_fmt : format specifier for floats
    """
    df = pd.DataFrame(M, index=rows, columns=cols, dtype=dtype)
    old_fmt_fn = pd.get_option('float_format')
    pd.set_option('float_format', lambda f: float_fmt.format(f))
    display(df)
    pd.set_option('float_format', old_fmt_fn)  # reset Pandas formatting

def pretty_timedelta(fmt="%d:%02d:%02d", since=None, until=None):
    """Pretty-print a timedelta, using the given format string."""
    since = since or time.time()
    until = until or time.time()
    delta_s = until - since
    hours, remainder = divmod(delta_s, 3600)
    minutes, seconds = divmod(remainder, 60)
    return fmt % (hours, minutes, seconds)


##
# Word processing functions
def canonicalize_digits(word):
    if any([c.isalpha() for c in word]): return word
    word = re.sub("\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word

def canonicalize_word(word, wordset=None, digits=True):
    word = word.lower()
    if digits:
        if (wordset != None) and (word in wordset): return word
        word = canonicalize_digits(word) # try to canonicalize numbers
    if (wordset == None) or (word in wordset):
        return word
    else:
        return UNK_TOKEN

def canonicalize_words(words, **kw):
    return [canonicalize_word(word, **kw) for word in words]

##
# Data loading functions
def get_corpus(name="brown"):
    import nltk
    assert(nltk.download(name))
    return nltk.corpus.__getattr__(name)

def build_vocab(corpus, V=10000):
    import vocabulary
    token_feed = (canonicalize_word(w) for w in corpus.words())
    vocab = vocabulary.Vocabulary(token_feed, size=V)
    return vocab

def get_train_test_sents(corpus, split=0.8, shuffle=True):
    """Generate train/test split for unsupervised tasks.

    Args:
      corpus: nltk.corpus that supports sents() function
      split (double): fraction to use as training set
      shuffle (int or bool): seed for shuffle of input data, or False to just
      take the training data as the first xx% contiguously.

    Returns:
      train_sentences, test_sentences ( list(list(string)) ): the train and test
      splits
    """
    sentences = np.array(list(corpus.sents()), dtype=object)
    fmt = (len(sentences), sum(map(len, sentences)))
    print ("Loaded {:,} sentences ({:g} tokens)".format(*fmt))

    if shuffle:
        rng = np.random.RandomState(shuffle)
        rng.shuffle(sentences)  # in-place
    train_frac = 0.8
    split_idx = int(train_frac * len(sentences))
    train_sentences = sentences[:split_idx]
    test_sentences = sentences[split_idx:]

    fmt = (len(train_sentences), sum(map(len, train_sentences)))
    print( "Training set: {:,} sentences ({:,} tokens)".format(*fmt))
    fmt = (len(test_sentences), sum(map(len, test_sentences)))
    print("Test set: {:,} sentences ({:,} tokens)".format(*fmt))

    return train_sentences, test_sentences

def preprocess_sentences(sentences, vocab, use_eos=False, emit_ids=True):
    """Preprocess sentences by canonicalizing and mapping to ids.

    Args:
      sentences ( list(list(string)) ): input sentences
      vocab: Vocabulary object, already initialized
      use_eos: if true, will add </s> token to end of sentence.
      emit_ids: if true, will emit as ids. Otherwise, will be preprocessed
          tokens.

    Returns:
      ids ( array(int) ): flattened array of sentences, including boundary <s>
      tokens.
    """
    # Add sentence boundaries, canonicalize, and handle unknowns
    word_preproc = lambda w: canonicalize_word(w, wordset=vocab.word_to_id)
    ret = []
    
    for s in sentences:
        canonical_words = vocab.pad_sentence(map(word_preproc, s), use_eos=use_eos)
        ret.extend(vocab.words_to_ids(canonical_words) if emit_ids else
                   canonical_words)
    if not use_eos:  # add additional <s> to end if needed
        ret.append(vocab.START_ID if emit_ids else vocab.START_TOKEN)
    return np.array(ret, dtype=(np.int32 if emit_ids else object))


def load_corpus(corpus, split=0.8, V=10000, shuffle=0):
    """Load a named corpus and split train/test along sentences.

    This is a convenience wrapper to chain together several functions from this
    module, and produce a train/test split suitable for input to most models.

    Sentences are preprocessed by canonicalization and converted to ids
    according to the constructed vocabulary, and interspersed with <s> tokens
    to denote sentence bounaries.

    Args:
        corpus: (string | corpus reader) If a string, will fetch the
            NLTK corpus of that name.
        split: (float \in (0,1]) fraction of examples in train split
        V: (int) vocabulary size (including special tokens)
        shuffle: (int) if > 0, use as random seed to shuffle sentence prior to
            split. Can change this to get different splits.

    Returns:
        (vocab, train_ids, test_ids)
        vocab: vocabulary.Vocabulary object
        train_ids: flat (1D) np.array(int) of ids
        test_ids: flat (1D) np.array(int) of ids
    """
    if isinstance(corpus, str):
        corpus = get_corpus(corpus)
    vocab = build_vocab(corpus, V)
    train_sentences, test_sentences = get_train_test_sents(corpus, split, shuffle)
    train_ids = preprocess_sentences(train_sentences, vocab)
    test_ids = preprocess_sentences(test_sentences, vocab)
    return vocab, train_ids, test_ids

##
# Window and batch functions
def rnnlm_batch_generator(ids, batch_size, max_time):
    """Convert ids to data-matrix form for RNN language modeling."""
    # Clip to multiple of max_time for convenience
    clip_len = ((len(ids)-1) / batch_size) * batch_size
    input_w = ids[:clip_len]     # current word
    target_y = ids[1:clip_len+1]  # next word
    # Reshape so we can select columns
    input_w = input_w.reshape([batch_size,-1])
    target_y = target_y.reshape([batch_size,-1])

    # Yield batches
    for i in xrange(0, input_w.shape[1], max_time):
        yield input_w[:,i:i+max_time], target_y[:,i:i+max_time]


def build_windows(ids, N, shuffle=True):
    """Build window input to the window model.

    Takes a sequence of ids, and returns a data matrix where each row
    is a window and target for the window model. For N=3:
        windows[i] = [w_3, w_2, w_1, w_0]

    For language modeling, N is the context size and you can use y = windows[:,-1]
    as the target words and x = windows[:,:-1] as the contexts.

    For CBOW, N is the window size and you can use y = windows[:,N/2] as the target words
    and x = np.hstack([windows[:,:N/2], windows[:,:N/2+1]]) as the contexts.

    For skip-gram, you can use x = windows[:,N/2] as the input words and y = windows[:,i]
    where i != N/2 as the target words.

    Args:
      ids: np.array(int32) of input ids
      shuffle: if true, will randomly shuffle the rows

    Returns:
      windows: np.array(int32) of shape [len(ids)-N, N+1]
        i.e. each row is a window, of length N+1
    """
    windows = np.zeros((len(ids)-N, N+1), dtype=int)
    for i in xrange(N+1):
        # First column: first word, etc.
        windows[:,i] = ids[i:len(ids)-(N-i)]
    if shuffle:
        # Shuffle rows
        np.random.shuffle(windows)
    return windows


def batch_generator(data, batch_size):
    """Generate minibatches from data.

    Args:
      data: array-like, supporting slicing along first dimension
      batch_size: int, batch size

    Yields:
      minibatches of maximum size batch_size
    """
    for i in xrange(0, len(data), batch_size):
        yield data[i:i+batch_size]

In [125]:
#@title Vocabulary helper functions
import collections
from collections import defaultdict

class Vocabulary(object):

    START_TOKEN = u"<s>"
    END_TOKEN   = u"</s>"
    UNK_TOKEN   = u"<unk>"

    def __init__(self, tokens, size=None):
        """Create a Vocabulary object.

        Args:
            tokens: iterator( string )
            size: None for unlimited, or int > 0 for a fixed-size vocab.
                  Vocabulary size includes special tokens <s>, </s>, and <unk>
        """
        self.unigram_counts = collections.Counter(tokens)
        self.bigram_counts = defaultdict(lambda: defaultdict(lambda: 0))
        word1 = None
        for word in tokens:
            if word1 is None:
                pass
            self.bigram_counts[word1][word] += 1
            word1 = word
        self.bigram_counts.default_factory = None  # make into a normal dict

        # Leave space for "<s>", "</s>", and "<unk>"
        top_counts = self.unigram_counts.most_common(None if size is None else (size - 3))
        vocab = ([self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN] +
                 [w for w,c in top_counts])

        # Assign an id to each word, by frequency
        self.id_to_word = dict(enumerate(vocab))
        self.word_to_id = {v:k for k,v in self.id_to_word.items()}
        self.size = len(self.id_to_word)
        if size is not None:
            assert(self.size <= size)

        # For convenience
        self.wordset = set(self.word_to_id.keys())

        # Store special IDs
        self.START_ID = self.word_to_id[self.START_TOKEN]
        self.END_ID = self.word_to_id[self.END_TOKEN]
        self.UNK_ID = self.word_to_id[self.UNK_TOKEN]

    def words_to_ids(self, words):
        return [self.word_to_id.get(w, self.UNK_ID) for w in words]

    def ids_to_words(self, ids):
        return [self.id_to_word[i] for i in ids]

    def pad_sentence(self, words, use_eos=True):
        ret = [self.START_TOKEN] + words
        if use_eos:
            ret.append(self.END_TOKEN)
        return ret

    def sentence_to_ids(self, words, use_eos=True):
        return self.words_to_ids(self.pad_sentence(words, use_eos))

    def ordered_words(self):
        """Return a list of words, ordered by id."""
        return self.ids_to_words(range(self.size))

## Question 1
Create the vocabulary, create co-occurrence matrix, then use PPMI function

In [182]:
# A function that produces a sparse co-occurrence matrix given a corpus,
# a vocabulary size V, and K (the context window is +-K).
def co_occurrence_matrix(token_ids, V, K=3):
    # We'll use this as an "accumulator" matrix.
    C = scipy.sparse.csc_matrix((V,V), dtype=np.float32)

    for k in range(1, K+1):
        print( u'Counting pairs (i, i \u00B1 %d) ...' %k)
        i = token_ids[:-k]  # current word
        j = token_ids[k:]   # k words ahead
        data = (np.ones_like(i), (i,j))  # values, indices
        Ck_plus = scipy.sparse.coo_matrix(data, shape=C.shape, dtype=np.float32)
        Ck_plus = scipy.sparse.csc_matrix(Ck_plus)
        Ck_minus = Ck_plus.T  # consider k words behind
        C += Ck_plus + Ck_minus

    print( "Co-occurrence matrix: %d words x %d words" %C.shape)
    print( "  %.02g nonzero elements" %C.nnz)
    return C

In [183]:
corpus2 = []
for sentence in corpus:
    words += sentence.split(" ")
    corpus2.append(sentence.split(" "))
    
# Get vocab, tokens, and token_ids as above.
vocab = Vocabulary(canonicalize_word(w)
                       for w in flatten(corpus2))
tokens = preprocess_sentences(corpus2, vocab,
                                  use_eos=False, emit_ids=False)
token_ids = vocab.words_to_ids(tokens)

# Build the co-occurrence matrix.
C = co_occurrence_matrix(token_ids, vocab.size, K=3)

# Display a table with the counts. The .toarray() function converts the
# sparse matrix into a dense one.
labels = vocab.ordered_words()
pretty_print_matrix(C.toarray(), rows=labels,
                    cols=labels, dtype=int)


Counting pairs (i, i ± 1) ...
Counting pairs (i, i ± 2) ...
Counting pairs (i, i ± 3) ...
Co-occurrence matrix: 652 words x 652 words
  7.4e+03 nonzero elements


Unnamed: 0,<s>,</s>,<unk>,.,Unnamed: 5,the,",",you,a,to,...,wins,men,women,wear,same,shoes,woman,whom,she,speaking
<s>,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
</s>,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
<unk>,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
.,1,0,0,0,100,26,2,20,14,8,...,1,0,0,0,0,1,0,0,1,1
,1,0,0,100,0,30,5,16,14,4,...,0,0,0,0,0,1,0,0,0,1
the,0,0,0,26,30,8,13,8,3,13,...,2,0,1,1,1,0,1,1,0,0
",",0,0,0,2,5,13,16,9,9,4,...,0,1,1,0,0,0,0,0,0,0
you,0,0,0,20,16,8,9,2,5,6,...,0,0,0,0,0,0,1,1,1,0
a,0,0,0,14,14,3,9,5,6,6,...,0,0,0,0,0,0,0,0,0,0
to,0,0,0,8,4,13,4,6,6,0,...,0,0,0,0,0,0,0,0,0,0


In [186]:
def PPMI(C):
    """Tranform a counts matrix to PPMI.
    
    Args:
      C: scipy.sparse.csc_matrix of counts C_ij
    
    Returns:
      (scipy.sparse.csc_matrix) PPMI(C) as defined above
    """
    # Total count.
    Z = float(C.sum())

    # Sum each row (along columns).
    Zr = np.array(C.sum(axis=1), dtype=np.float64).flatten()
    
    # Get indices of relevant elements.
    ii, jj = C.nonzero()  # row, column indices
    Cij = np.array(C[ii,jj], dtype=np.float64).flatten()
    
    # PMI equation.
    pmi = np.log(Cij * Z / (Zr[ii] * Zr[jj]))

    # Truncate to positive only.
    ppmi = np.maximum(0, pmi)  # take positive only
    
    # Re-format as sparse matrix.
    ret = scipy.sparse.csc_matrix((ppmi, (ii,jj)), shape=C.shape,
                                  dtype=np.float64)
    ret.eliminate_zeros()  # remove zeros
    return ret

# Display the PPMI'd version of the co-occurrence matrix.
pretty_print_matrix(PPMI(C).toarray(), rows=labels, 
                    cols=labels, dtype=float)

Unnamed: 0,<s>,</s>,<unk>,.,Unnamed: 5,the,",",you,a,to,...,wins,men,women,wear,same,shoes,woman,whom,she,speaking
<s>,0.0000,0.0000,0.0000,1.6542,1.6558,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,6.2577
</s>,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
<unk>,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
.,1.6542,0.0000,0.0000,0.0000,0.9644,0.0760,0.0000,0.1501,0.0000,0.0000,...,0.9610,0.0000,0.0000,0.0000,0.0000,0.9610,0.0000,0.0000,0.9610,0.9610
,1.6558,0.0000,0.0000,0.9644,0.0000,0.2208,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.9627,0.0000,0.0000,0.0000,0.9627
the,0.0000,0.0000,0.0000,0.0760,0.2208,0.0000,0.0000,0.0000,0.0000,0.4028,...,2.1145,0.0000,1.4214,1.4214,1.4214,0.0000,1.4214,1.4214,0.0000,0.0000
",",0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.3591,0.0000,0.1092,0.0000,...,0.0000,1.5755,1.5755,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
you,0.0000,0.0000,0.0000,0.1501,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.7579,1.7579,1.7579,0.0000
a,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.1092,0.0000,0.0292,0.1092,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
to,0.0000,0.0000,0.0000,0.0000,0.0000,0.4028,0.0000,0.0000,0.1092,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000


## Question 2
Briefly describe the algorithm for forming the PPMI matrix. What is the time complexity of your algorithm? Write at least 50 words.

In the PPMI algorithm, we first want to get the pairwise and singleton probabilities. To do this, we get the total number of tokens and the total count of tokens. To calculate the PPMI, we use the formula: 

PPMI = max (0, log ( P(word1, word2) / P (word1) * P (word2) ) )

We take the max of the PMI and truncate to ignore negative values. We would result in a matrix which contains the PMI's of associated words and zero values for those which do not associate. I believe the time complexity would be O(N^2)?

## Question 3
How would you test if the representation of the words in your PPMI matrix reflects some fact about the relationship between words in the real world? For example, if two words are expected to co-occur together a lot, the PPMI value should be high (and vice versa). Write at least 50 words in your answer and give at least 2 pairs of examples from your PPMI matrix.


One pair of words I found to have a high PPMI in the matrix was "whom" and "woman". This seems accurate as it can be used in a lot of contexts saying "a woman whom..". Another pair that shows up with a high PPMI in the matrix is both "highest" and "wins" and "score" and "wins". These all correlate when talking about highest scores, winning due to high scores, etc. To test the representation of words in the matrix and words in the real world, words can be tested that show up only in situations together. For example, in the video it describes how "San Francisco" would most likely only show up in the same contexts together, as they form the name of a city. 

## Question 4

Repeat Question 1 for the first 100 sentences in the Italian language data file.

In [185]:
itcorpus = []
for sentence in itSent:
    words += sentence.split(" ")
    itcorpus.append(sentence.split(" "))
    
# Get vocab, tokens, and token_ids as above.
itvocab = Vocabulary(canonicalize_word(w)
                       for w in flatten(itcorpus))
ittokens = preprocess_sentences(itcorpus, itvocab,
                                  use_eos=False, emit_ids=False)
ittoken_ids = vocab.words_to_ids(ittokens)

# Build the co-occurrence matrix.
itC = co_occurrence_matrix(ittoken_ids, itvocab.size, K=3)

# Display a table with the counts. The .toarray() function converts the
# sparse matrix into a dense one.
itlabels = itvocab.ordered_words()
pretty_print_matrix(itC.toarray(), rows=itlabels,
                    cols=itlabels, dtype=int)



Counting pairs (i, i ± 1) ...
Counting pairs (i, i ± 2) ...
Counting pairs (i, i ± 3) ...
Co-occurrence matrix: 723 words x 723 words
  68 nonzero elements


Unnamed: 0,<s>,</s>,<unk>,Unnamed: 4,.,di,",",e,la,un,...,felicità,conigli,hanno,ottimo,udito,un',ottima,individuare,predatori,facilmente
<s>,0,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
</s>,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
<unk>,1,0,6138,431,480,0,228,0,84,0,...,0,0,0,0,0,0,0,0,0,0
,1,0,431,0,91,0,3,0,7,0,...,0,0,0,0,0,0,0,0,0,0
.,1,0,480,91,0,0,4,0,7,0,...,0,0,0,0,0,0,0,0,0,0
di,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
",",0,0,228,3,4,0,4,0,3,0,...,0,0,0,0,0,0,0,0,0,0
e,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
la,0,0,84,7,7,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
un,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [187]:
# Display the PPMI'd version of the co-occurrence matrix. - Italian
pretty_print_matrix(PPMI(itC).toarray(), rows=itlabels, 
                    cols=itlabels, dtype=float)

Unnamed: 0,<s>,</s>,<unk>,Unnamed: 4,.,di,",",e,la,un,...,felicità,conigli,hanno,ottimo,udito,un',ottima,individuare,predatori,facilmente
<s>,0.0000,0.0000,0.0000,1.7319,1.6391,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
</s>,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
<unk>,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.1279,0.0000,0.0097,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
,1.7319,0.0000,0.0000,0.0000,0.9478,0.0000,0.0000,0.0000,0.1514,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
.,1.6391,0.0000,0.0000,0.9478,0.0000,0.0000,0.0000,0.0000,0.0586,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
di,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
",",0.0000,0.0000,0.1279,0.0000,0.0000,0.0000,0.0000,0.0000,0.0996,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
e,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
la,0.0000,0.0000,0.0097,0.1514,0.0586,0.0000,0.0996,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
un,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
