# Vector Space Models

* **Name:** Aldo Barriente
* **Course:** DS 5001
* **Instructor:** Professor Rafael Alvarado

## Set up

In [6]:
# File-name prefix shared by every corpus table, and the folders the tables
# are read from and written to.
data_prefix = 'zapatistas'
data_in, data_out = './data_in/', './data_out/'


In [7]:
import pandas as pd
import numpy as np

## Importing `LIB`

In [22]:
# Read the library table. No index column is set here, so LIB keeps the
# default integer index that read_csv assigns.
LIB = pd.read_csv(f'{data_in}/{data_prefix}-LIB.csv')

## Importing `TOKEN`

In [8]:
# Load the token table, take its first five column names as the OHCO
# hierarchy, and move those columns into the table's index.
TOKEN = pd.read_csv(f'{data_in}/{data_prefix}-TOKEN.csv')
OHCO = list(TOKEN.columns[:5])
TOKEN = TOKEN.set_index(OHCO)

In [26]:
# Prefixes of OHCO of increasing depth: TEXTS is OHCO[:1], SECS is OHCO[:2],
# PARAS is OHCO[:3] and SENTS is OHCO[:4].
TEXTS, SECS, PARAS, SENTS = (OHCO[:depth] for depth in range(1, 5))

## Importing `VOCAB`

In [10]:
# Load the vocabulary, discard every row that contains a missing value,
# and index the table by the term string.
VOCAB = (
    pd.read_csv(f'{data_in}/{data_prefix}-VOCAB.csv')
    .dropna()
    .set_index('term_str')
)

## Adding basic statistics to `VOCAB` table

In [13]:
# Build "<term>/<pos_max>" labels with vectorized string concatenation
# rather than a Python-level lambda applied to every row.
VOCAB['term_code'] = VOCAB.index.to_series() + '/' + VOCAB['pos_max']

In [15]:
# Vocabulary size, and the probability of a single term if all terms
# were equally likely.
N_vocab = len(VOCAB)
U_vocab = 1 / N_vocab

In [16]:
# Per-term measures derived from the raw counts in column `n`.
VOCAB['p'] = VOCAB['n'] / VOCAB['n'].sum()   # share of all counted occurrences
VOCAB['s'] = 1 / VOCAB['p']                  # surprise: inverse of p
VOCAB['i'] = np.log2(VOCAB['s'])             # information: log2 of the surprise
VOCAB['h'] = VOCAB['p'] * VOCAB['i']         # p * i, the term's share of the entropy sum

In [17]:
# Length in characters of each term; the terms are the index of VOCAB.
VOCAB['wlen'] = VOCAB.index.str.len()

In [18]:
# Corpus-level summaries, each rounded to two decimals.
# R_vocab and R_wlen are derived from the already-rounded H_vocab and W_len.
H_vocab = round(VOCAB['h'].sum(), 2)        # sum of the per-term p * i values
W_len = round(VOCAB['wlen'].mean(), 2)      # mean term length in characters
R_vocab = round((1 - H_vocab / np.log2(len(VOCAB))) * 100, 2)   # % below log2(vocabulary size)
R_wlen = round((1 - W_len / np.log2(len(VOCAB))) * 100, 2)      # same ratio using mean length
HX_vocab = round(U_vocab * VOCAB['i'].sum(), 2)                 # mean information with uniform weights

## Adding `term_rank` and Zipf's k to each term

In [19]:
# Order terms from most to least frequent, then number them 1..N in that order.
VOCAB = VOCAB.sort_values(by='n', ascending=False)
VOCAB['term_rank'] = range(1, len(VOCAB) + 1)

In [20]:
# Count times rank for every term, and the rounded mean of that product.
VOCAB['zipf_k'] = VOCAB['n'] * VOCAB['term_rank']
ZK = round(VOCAB['zipf_k'].mean(), 2)

In [21]:
# Count each rank would have if count * rank were exactly the constant ZK.
VOCAB['zipf_y'] = ZK / VOCAB['term_rank']

## `TFIDF` Function

In [24]:
def get_tfidf(tokens, vocab, bag, tf_type='n', item_type='term_str', alpha=.4, new_col_suffix=''):
    """Compute TF-IDF for `tokens` grouped into documents and store the results in `vocab`.

    Parameters
    ----------
    tokens : pandas.DataFrame
        Token table. Every name in `bag` and `item_type` must be usable as a
        ``groupby`` key (an index level name or a column name).
    vocab : pandas.DataFrame
        Vocabulary table whose index holds the item values and whose index
        name matches `item_type`, so that per-item values align with the bag
        of words. It is modified in place: the columns ``df``, ``idf`` and
        ``'tfidf_sum' + new_col_suffix`` are created or overwritten.
    bag : list of str (or a single str)
        Keys that together identify one document.
    tf_type : str, default 'n'
        How term frequency is computed from the per-document count ``n``:

        * ``'n'``     - raw count
        * ``'sum'``   - count / total count in the document
        * ``'l2'``    - count / Euclidean norm of the document's counts
        * ``'max'``   - ``alpha + (1 - alpha) * count / max count in the document``
        * ``'log'``   - ``log2(1 + count)``
        * ``'sub'``   - ``1 + log2(count)``
        * ``'bool'``  - 1 for every item present in the document
        * ``'bool2'`` - 1 / number of distinct items in the document
    item_type : str, default 'term_str'
        Name of the item to count.
    alpha : float, default 0.4
        Smoothing constant, used only by ``tf_type='max'``.
    new_col_suffix : str, default ''
        Suffix appended to ``'tfidf_sum'`` to name the aggregate column.

    Returns
    -------
    pandas.DataFrame
        The same `vocab` object that was passed in.

    Raises
    ------
    ValueError
        If `tf_type` is not one of the values listed above.
    """
    valid_tf_types = ('n', 'sum', 'l2', 'max', 'log', 'sub', 'bool', 'bool2')
    if tf_type not in valid_tf_types:
        raise ValueError(f"Unknown tf_type {tf_type!r}; expected one of {valid_tf_types}")

    # Accept a single key as well as a list of keys.
    bag = [bag] if isinstance(bag, str) else list(bag)

    # Bag of words: one row per (document, item) holding its occurrence count.
    BOW = tokens.groupby(bag + [item_type])[item_type].count().to_frame('n')
    BOW['c'] = 1

    # Counts grouped by document. Per-document statistics are obtained with
    # `transform`, which returns values on BOW's own index; `apply` prepends
    # the group keys in recent pandas versions (or returns one value per
    # group) and the result then no longer aligns with BOW.
    D = BOW.groupby(bag)['n']
    if tf_type == 'n':
        BOW['tf'] = BOW['n']
    elif tf_type == 'sum':
        BOW['tf'] = BOW['n'] / D.transform('sum')  # P(w|d)
    elif tf_type == 'l2':
        BOW['tf'] = BOW['n'] / D.transform(lambda x: np.sqrt((x ** 2).sum()))
    elif tf_type == 'max':
        BOW['tf'] = alpha + (1 - alpha) * (BOW['n'] / D.transform('max'))
    elif tf_type == 'log':
        BOW['tf'] = np.log2(1 + BOW['n'])
    elif tf_type == 'sub':
        BOW['tf'] = 1 + np.log2(BOW['n'])
    elif tf_type == 'bool':
        BOW['tf'] = BOW['c']
    else:  # 'bool2'
        BOW['tf'] = 1 / D.transform('count')

    # Document frequency: number of documents in which each item occurs.
    # Grouped by `item_type` so that non-default item types work too.
    vocab['df'] = BOW.groupby(item_type)['n'].count()
    N_docs = D.ngroups
    vocab['idf'] = np.log2(N_docs / vocab['df'])

    # TF-IDF per (document, item); idf is aligned on the item level of BOW's index.
    BOW['tfidf'] = BOW['tf'] * vocab['idf']

    # Aggregate: sum of TF-IDF over all documents for each item.
    col = 'tfidf_sum' + new_col_suffix
    vocab[col] = BOW.groupby(item_type)['tfidf'].sum()

    return vocab

## Top words by bag

### By texts

In [30]:
# TF-IDF with each text as one document; with tf_type='max' and alpha=0 the
# term frequency is the count divided by the largest count in the document.
VOCAB = get_tfidf(
    TOKEN,
    VOCAB,
    bag=TEXTS,
    tf_type='max',
    alpha=0,
    new_col_suffix='_text_max',
)

In [31]:
# Show the 20 terms with the largest summed TF-IDF when documents are texts.
VOCAB['tfidf_sum_text_max'].sort_values(ascending=False).head(20)

term_str
q             0.231874
votán         0.156191
boys          0.103164
april         0.093273
she           0.081875
guardian      0.081814
m             0.077948
thats         0.074018
law           0.067266
girls         0.061228
addressees    0.060179
beetle        0.059501
resist        0.058690
theyre        0.055799
newspaper     0.054907
convention    0.054495
tobacco       0.053866
morquecho     0.052679
revolution    0.049517
tender        0.047707
Name: tfidf_sum_text_max, dtype: float64

### By sections

In [32]:
# TF-IDF with each section as one document; with tf_type='max' and alpha=0 the
# term frequency is the count divided by the largest count in the document.
VOCAB = get_tfidf(
    TOKEN,
    VOCAB,
    bag=SECS,
    tf_type='max',
    alpha=0,
    new_col_suffix='_secs_max',
)

In [33]:
# Show the 20 terms with the largest summed TF-IDF when documents are sections.
VOCAB['tfidf_sum_secs_max'].sort_values(ascending=False).head(20)

term_str
i           8.024289
we          4.333476
he          4.101807
was         4.062942
us          3.852278
you         3.713446
your        3.458170
they        3.402461
sisters     3.254994
brothers    3.253932
there       3.241337
were        3.125123
dialogue    3.091010
law         3.055964
q           2.970630
or          2.959872
but         2.927209
what        2.918446
because     2.842126
mr          2.834232
Name: tfidf_sum_secs_max, dtype: float64

### By paragraphs

In [34]:
# TF-IDF with each paragraph as one document; with tf_type='max' and alpha=0 the
# term frequency is the count divided by the largest count in the document.
VOCAB = get_tfidf(
    TOKEN,
    VOCAB,
    bag=PARAS,
    tf_type='max',
    alpha=0,
    new_col_suffix='_paras_max',
)

In [35]:
# Show the 20 terms with the largest summed TF-IDF when documents are paragraphs.
VOCAB['tfidf_sum_paras_max'].sort_values(ascending=False).head(20)

term_str
the     776.028383
to      749.063612
we      734.768893
you     729.274686
and     689.418311
of      683.136032
that    666.446760
a       601.800394
in      589.271990
is      525.756475
are     518.938764
they    512.388466
for     497.112083
our     471.630890
it      455.551066
have    452.831023
will    447.619703
q       423.811232
1994    422.147424
be      421.485241
Name: tfidf_sum_paras_max, dtype: float64

## Exporting

In [40]:
# Write the tables to the output folder. TOKEN (indexed by OHCO) and VOCAB
# (indexed by term_str) are saved together with their indexes. LIB still has
# the default integer index it received from read_csv, so it is written with
# index=False; otherwise the file would gain an extra unnamed column.
TOKEN.to_csv(f'{data_out}/{data_prefix}-TOKEN.csv')
LIB.to_csv(f'{data_out}/{data_prefix}-LIB.csv', index=False)
VOCAB.to_csv(f'{data_out}/{data_prefix}-VOCAB.csv')