# Explore different options for document vectorization

In [None]:
from yelp.loader import load_sample, stars
import pandas as pd

In [None]:
limit = 3_000
data = list(stars(load_sample()))[:limit]
documents, y = list(zip(*data))

## Tokenization and normalization

### Example 1 (SpaCy)
- simple string cleaning
- lowercase
- lemmatization
- POS filtering

In [None]:
import spacy
import sys 
sys.path.append('../nlp')
from nlp.vectorize import spacy_tokenizer

In [None]:
nlp = spacy.load('en_core_web_lg')

**Example**

In [None]:
text = documents[10]
print(text)
print(spacy_tokenizer(nlp, text, lowercase=True, lemma=True, pos_filter=['PUNCT', 'DET']))

#### Count Vectorizer
From [sklearn CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)

In [None]:
from nlp.vectorize import spacy_count_vectorizer

In [None]:
X, vectorizer = spacy_count_vectorizer(nlp, documents, lowercase=True, lemma=True, pos_filter=['PUNCT', 'DET'], min_df=2)

In [None]:
features = vectorizer.get_feature_names_out()
print(len(features))
print(features[:10])

In [None]:
Xdf = pd.DataFrame(X.toarray(), columns=features)
Xdf.head()

In [None]:
test_doc = 10
print(documents[test_doc])
print([(w, score) for w, score in Xdf.iloc[test_doc].sort_values(ascending=False).head(20).items()])

### Example 2 (word_tokenizer)
- simple string cleaning
- lowercase
- no lemmatization
- no POS filtering

In [None]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
simple_tokenizer = lambda x: [w.lower() for w in word_tokenize(x)]

In [None]:
vectorizer = CountVectorizer(tokenizer=simple_tokenizer, token_pattern=None, min_df=2)
W = vectorizer.fit_transform(documents)
Wdf = pd.DataFrame(W.toarray(), columns=vectorizer.get_feature_names_out())

In [None]:
Wdf 

## Features exploration

Let's define the notion of document frequency as:
$$
df(w) = \mid \{d: w \in d\} \mid
$$

In [None]:
df = Wdf.astype(bool).sum(axis=0)

#### Inverse document frequency

In [None]:
import numpy as np 

In [None]:
df / Wdf.shape[0]

In [None]:
idf = np.log(Wdf.shape[0] / df)

In [None]:
idf.sort_values(ascending=True)

#### Plot words in the space of documents by showing their DF

In [None]:
from sklearn.manifold import TSNE 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
tsne = TSNE(n_components=2)
W2d = tsne.fit_transform(Wdf.T)

In [None]:
high_df_words_indexes = [i for i, (k, w) in enumerate(df.items()) if w > 300]
low_df_words_indexes = [i for i, (k, w) in enumerate(df.items()) if 20 < w < 300]

In [None]:
fig, ax = plt.subplots(figsize=(12, 8), ncols=3, nrows=2)
sns.scatterplot(x=W2d[:,0], y=W2d[:,1], ax=ax[0, 0], alpha=.2, hue=df, palette="rocket", size=df)
sns.scatterplot(x=W2d[high_df_words_indexes,0], y=W2d[high_df_words_indexes,1], 
                ax=ax[0, 1], alpha=.2, hue=df.values[high_df_words_indexes], palette="rocket", size=df.values[high_df_words_indexes])
sns.scatterplot(x=W2d[low_df_words_indexes,0], y=W2d[low_df_words_indexes,1], 
                ax=ax[0, 2], alpha=.2, hue=df.values[low_df_words_indexes], palette="rocket", size=df.values[low_df_words_indexes])
ax[0, 0].set_title('All words')
ax[0, 1].set_title('DF > 300')
ax[0, 2].set_title('50 < DF < 300')
leg = ax[0, 0].get_legend()
leg.set_title("")

sns.lineplot(x=range(df.shape[0]), y=df.sort_values(ascending=False).values, ax=ax[1, 0], color='#cc0000')
sns.lineplot(x=range(df.shape[0]), y=df.sort_values(ascending=False).values, ax=ax[1, 1], color='#cc0000')
sns.scatterplot(x=Wdf.mean(axis=0), y=df, ax=ax[1, 2], alpha=.6, color='#cc0000')
ax[1, 1].set_yscale('log')
ax[1, 0].set_xlabel('Words')
ax[1, 0].set_ylabel('DF')
ax[1, 1].set_xlabel('Words')
ax[1, 1].set_ylabel('DF (log)')
ax[1, 2].set_xlabel('Occurrences per document (mean)')
ax[1, 2].set_ylabel('DF')
plt.tight_layout()
plt.show()

In [None]:
test_document = 10
print(documents[test_document])

In [None]:
Wdf.iloc[test_document].sort_values(ascending=False)

In [None]:
data = {}
for w, s in Wdf.iloc[test_document].items():
    if s > 0:
        data[w] = s * idf[w]
tfidf = pd.Series(data)

In [None]:
tfidf.sort_values(ascending=False)

## Count VS TdfIdf vectorizers
Lets' check the different effect of **TF** VS **TfIdf** on the classification tasks

In [8]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from yelp.loader import load_sample, stars
import pandas as pd
import re 

In [9]:
limit = 10_000
data = list(stars(load_sample()))[:limit]
documents, y = list(zip(*data))

In [None]:
def tokenizer(x):
    t = re.sub("\s\s+" , " ", x)
    t = re.sub("[\n]+", " ", t)
    t = re.sub("[\r\n]+", " ", t)
    return word_tokenize(t)

In [None]:
count_vectorizer = CountVectorizer(tokenizer=tokenizer, token_pattern=None, min_df=2)
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None, min_df=2)

In [None]:
C = count_vectorizer.fit_transform(documents)
T = tfidf_vectorizer.fit_transform(documents)

In [None]:
print(C.shape, T.shape)

In [None]:
C_train, C_test, c_train, c_test = train_test_split(C, y)
T_train, T_test, t_train, t_test = train_test_split(T, y)

#### Classification test
We use **Born** in order to explore feature selection

In [18]:
from bornrule import BornClassifier

In [19]:
born_c = BornClassifier()
born_c.fit(C_train, c_train)
c_pred = born_c.predict(C_test)

In [20]:
born_t = BornClassifier()
born_t.fit(T_train, t_train)
t_pred = born_t.predict(T_test)

#### Evaluation and explanation

In [21]:
from sklearn.metrics import classification_report

In [22]:
print('Count Vectorizer')
print(classification_report(c_test, c_pred, zero_division=0))
print('TfIdf Vectorizer')
print(classification_report(t_test, t_pred, zero_division=0))

Count Vectorizer
              precision    recall  f1-score   support

           1       0.63      0.81      0.71       362
           2       0.41      0.22      0.29       202
           3       0.48      0.20      0.28       294
           4       0.53      0.27      0.35       579
           5       0.65      0.92      0.76      1063

    accuracy                           0.61      2500
   macro avg       0.54      0.48      0.48      2500
weighted avg       0.58      0.61      0.56      2500

TfIdf Vectorizer
              precision    recall  f1-score   support

           1       0.60      0.83      0.69       337
           2       0.40      0.24      0.30       222
           3       0.36      0.22      0.27       286
           4       0.48      0.34      0.40       604
           5       0.68      0.83      0.75      1051

    accuracy                           0.59      2500
   macro avg       0.50      0.49      0.48      2500
weighted avg       0.56      0.59      0.56

In [23]:
E = pd.DataFrame(born_c.explain().toarray(), index=count_vectorizer.get_feature_names_out(), columns=range(1, 6))

In [27]:
E#.sort_values(5, ascending=False).head(20)

Unnamed: 0,1,2,3,4,5
!,0.012075,0.007701,0.008109,0.011959,0.017360
#,0.000547,0.000291,0.000574,0.000516,0.000532
$,0.001977,0.002233,0.001842,0.001674,0.001215
%,0.000384,0.000349,0.000269,0.000270,0.000357
&,0.000793,0.000685,0.000765,0.000881,0.001060
...,...,...,...,...,...
étions,0.000000,0.003869,0.000000,0.000000,0.000000
étoiles,0.000000,0.004887,0.000000,0.000000,0.000000
été,0.001286,0.000000,0.000000,0.001499,0.000761
être,0.000000,0.000000,0.000000,0.001086,0.000881


**Mutual Information**

In [10]:
import nltk 
from collections import defaultdict
import numpy as np 

In [11]:
def bigrams(text):
    tokens = word_tokenize(text.lower())
    return tokens, list(nltk.ngrams(tokens, n=2))

def counter(corpus):
    unigram, bigram = defaultdict(lambda: 0), defaultdict(lambda: 0)
    for text in corpus:
        tokens, bigrams_ = bigrams(text)
        for token in tokens:
            unigram[token] += 1
        for bi in bigrams_:
            bigram[bi] += 1
    return unigram, bigram

In [12]:
U, B = counter(documents)

In [13]:
U = pd.Series(U)
B = pd.Series(B)

In [27]:
mu = defaultdict(lambda: 0)
b_total = B.sum()
u_total = U.sum()
for (x, y), count in B.items():
    if count < 10 or count > 1000:
        pass 
    else:
        if U[x] > 2 and U[y] > 2:
            p_b = count / b_total
            p_x = U[x] / u_total
            p_y = U[y] / u_total
            mu[(x, y)] = p_b * np.log(p_b / (p_x * p_y))

In [28]:
MU = pd.Series(mu)

In [29]:
for b, m in MU.sort_values(ascending=False).head(100).items():
    print(b, m)

('ca', "n't") 0.003132475752508614
('you', 'can') 0.0026713143708323147
('as', 'well') 0.002647506096591929
('a', 'little') 0.0025655714144917955
('have', 'been') 0.002505590545834812
('i', 'am') 0.002444293148099181
('you', "'re") 0.0023766671837774766
('a', 'few') 0.002336602034943331
('customer', 'service') 0.0023290891306705246
('they', 'were') 0.002293980461441209
('a', 'lot') 0.0021172697221911153
('they', 'have') 0.0020615338788516095
('to', 'go') 0.0019992113238811752
('we', 'had') 0.0019621656170151444
('was', 'very') 0.0019598608811676878
('highly', 'recommend') 0.0019119761592289318
('when', 'i') 0.0018217573515197539
('a', 'bit') 0.0017996985759788683
("'ve", 'been') 0.0017519001741248355
('they', 'are') 0.0017466979337219112
('will', 'be') 0.0017204536093814824
('a', 'great') 0.0016446674696337692
('ice', 'cream') 0.0016129710151046469
('place', 'is') 0.001599425498537736
('wo', "n't") 0.001596063603480355
('first', 'time') 0.0015702445222478556
('at', 'least') 0.001549189

In [23]:
B[('.', 'i')]

9581