# Explore different options for document vectorization

In [1]:
from yelp.loader import load_sample, stars
import pandas as pd

In [2]:
from itertools import islice

# Number of items to keep for this exploratory run.
limit = 3_000
# Take only the first `limit` items instead of materialising the whole sample
# and slicing it afterwards.
data = list(islice(stars(load_sample()), limit))
# Transpose the (document, label) pairs into two parallel tuples.
documents, y = zip(*data)

## Tokenization and normalization

### Example 1 (SpaCy)
- simple string cleaning
- lowercase
- lemmatization
- POS filtering

In [3]:
import spacy
import sys 
sys.path.append('../nlp')
from nlp.vectorize import spacy_tokenizer

In [None]:
# Load the spaCy pipeline named 'en_core_web_lg'; the resulting `nlp` object is
# passed as the first argument to spacy_tokenizer and spacy_count_vectorizer.
nlp = spacy.load('en_core_web_lg')

**Example**

In [None]:
# Show one raw document followed by its tokenized form.
text = documents[10]
print(text)
tokens = spacy_tokenizer(
    nlp,
    text,
    lowercase=True,
    lemma=True,
    pos_filter=['PUNCT', 'DET'],
)
print(tokens)

#### Count Vectorizer
From [sklearn CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)

In [4]:
from nlp.vectorize import spacy_count_vectorizer

In [None]:
# Tokenizer settings forwarded to the vectorizer helper as keyword arguments.
tokenizer_options = {'lowercase': True, 'lemma': True, 'pos_filter': ['PUNCT', 'DET']}
# The helper returns the document-term matrix together with the vectorizer.
X, vectorizer = spacy_count_vectorizer(nlp, documents, min_df=2, **tokenizer_options)

In [None]:
# Feature names of the vectorizer: print how many there are, then the first ten.
features = vectorizer.get_feature_names_out()
print(len(features), features[:10], sep='\n')

In [None]:
# Build a DataFrame from X with one column per feature name, so rows can be
# inspected by word.
# NOTE(review): .toarray() materialises the full matrix — presumably fine at
# limit = 3_000; confirm before scaling up.
Xdf = pd.DataFrame(X.toarray(), columns=features)
# Last expression of the cell: displays the first rows.
Xdf.head()

In [None]:
test_doc = 10
print(documents[test_doc])
# The 20 largest values in the document's row, as (feature, value) pairs.
top_counts = Xdf.iloc[test_doc].sort_values(ascending=False).head(20)
print(list(top_counts.items()))

### Example 2 (NLTK `word_tokenize`)
- simple string cleaning
- lowercase
- no lemmatization
- no POS filtering

In [None]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def simple_tokenizer(x):
    """Tokenize *x* with ``word_tokenize`` and lowercase every token.

    Args:
        x: The text to tokenize.

    Returns:
        A list with the lowercased tokens, in the order produced by
        ``word_tokenize``.
    """
    return [w.lower() for w in word_tokenize(x)]

In [None]:
# Count vectorizer driven by simple_tokenizer; note this rebinds `vectorizer`.
# token_pattern=None since a custom tokenizer is supplied.
vectorizer = CountVectorizer(
    tokenizer=simple_tokenizer,
    token_pattern=None,
    min_df=2,
)
W = vectorizer.fit_transform(documents)
# Table of counts: one row per document, one column per feature name.
dense_counts = W.toarray()
vocabulary = vectorizer.get_feature_names_out()
Wdf = pd.DataFrame(dense_counts, columns=vocabulary)

In [None]:
# Display the document-term count table built from W.
Wdf 

## Features exploration

Let's define the notion of document frequency as:
$$
df(w) = \mid \{d: w \in d\} \mid
$$

In [None]:
# Document frequency: for every word (column), the number of documents (rows)
# whose count is non-zero.
df = Wdf.ne(0).sum(axis=0)

#### Inverse document frequency

In [None]:
import numpy as np 

In [None]:
# Document frequency divided by the number of rows of Wdf, i.e. the fraction
# of documents that contain each word.
df / Wdf.shape[0]

In [None]:
# Inverse document frequency: log(N / df), with N the number of rows of Wdf.
n_documents = Wdf.shape[0]
idf = np.log(n_documents / df)

In [None]:
# Smallest idf first; since idf = log(N / df), these are the words with the
# highest document frequency.
idf.sort_values(ascending=True)

#### Plot words in the space of documents by showing their DF

In [None]:
from sklearn.manifold import TSNE 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Embed each word (a row of Wdf.T, i.e. a column of Wdf) in 2 dimensions.
# NOTE(review): no random_state is passed, so the layout presumably differs
# between runs — confirm whether reproducibility matters here.
tsne = TSNE(n_components=2)
W2d = tsne.fit_transform(Wdf.T)

In [None]:
# Positional indexes of the words in `df`, bucketed by document frequency.
# Both bounds are strict, so a word with DF exactly 300 lands in neither list.
high_df_words_indexes = [pos for pos, count in enumerate(df.to_numpy()) if count > 300]
low_df_words_indexes = [pos for pos, count in enumerate(df.to_numpy()) if 20 < count < 300]

In [None]:
# 2x3 grid. Top row: the 2-D word embedding for all words, for the high-DF
# words and for the mid-DF words. Bottom row: the DF distribution.
fig, ax = plt.subplots(figsize=(12, 8), ncols=3, nrows=2)
sns.scatterplot(x=W2d[:, 0], y=W2d[:, 1], ax=ax[0, 0], alpha=.2, hue=df, palette="rocket", size=df)
# The two subset panels differ only in the axes and in the selected positions.
for panel, indexes in ((ax[0, 1], high_df_words_indexes), (ax[0, 2], low_df_words_indexes)):
    subset_df = df.values[indexes]
    sns.scatterplot(x=W2d[indexes, 0], y=W2d[indexes, 1],
                    ax=panel, alpha=.2, hue=subset_df, palette="rocket", size=subset_df)
ax[0, 0].set_title('All words')
ax[0, 1].set_title('DF > 300')
# The title matches the filter that builds low_df_words_indexes (20 < DF < 300).
ax[0, 2].set_title('20 < DF < 300')
leg = ax[0, 0].get_legend()
leg.set_title("")

# DF sorted from largest to smallest, computed once and drawn on a linear
# and on a logarithmic y axis.
word_ranks = range(df.shape[0])
sorted_df = df.sort_values(ascending=False).values
sns.lineplot(x=word_ranks, y=sorted_df, ax=ax[1, 0], color='#cc0000')
sns.lineplot(x=word_ranks, y=sorted_df, ax=ax[1, 1], color='#cc0000')
# Mean count per document (x) against document frequency (y), one point per word.
sns.scatterplot(x=Wdf.mean(axis=0), y=df, ax=ax[1, 2], alpha=.6, color='#cc0000')
ax[1, 1].set_yscale('log')
ax[1, 0].set_xlabel('Words')
ax[1, 0].set_ylabel('DF')
ax[1, 1].set_xlabel('Words')
ax[1, 1].set_ylabel('DF (log)')
ax[1, 2].set_xlabel('Occurrences per document (mean)')
ax[1, 2].set_ylabel('DF')
plt.tight_layout()
plt.show()

In [None]:
# Index of the document used for the worked TF-IDF example in the next cells.
test_document = 10
print(documents[test_document])

In [None]:
# Counts in the test document's row of Wdf, largest first.
Wdf.iloc[test_document].sort_values(ascending=False)

In [None]:
# TF-IDF weights of the test document: count times idf, restricted to the
# words that occur in it (count > 0). Built with a comprehension so the `data`
# variable holding the loaded dataset is not overwritten by a scratch dict.
test_row = Wdf.iloc[test_document]
tfidf = pd.Series({w: s * idf[w] for w, s in test_row.items() if s > 0})

In [None]:
# TF-IDF weights of the test document, largest first.
tfidf.sort_values(ascending=False)

## Count VS TfIdf vectorizers
Let's check the different effects of **TF** VS **TfIdf** on the classification task

In [None]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
import re 

In [None]:
from itertools import islice

# Larger subset for the classification comparison.
limit = 10_000
# Take only the first `limit` items instead of materialising the whole sample
# and slicing it afterwards.
data = list(islice(stars(load_sample()), limit))
# Transpose the (document, label) pairs into two parallel tuples.
documents, y = zip(*data)

In [None]:
def tokenizer(x):
    """Normalize whitespace in *x* and tokenize it with ``word_tokenize``.

    Runs of two or more whitespace characters, and any remaining line breaks,
    are each replaced by a single space before tokenizing.

    Args:
        x: The text to tokenize.

    Returns:
        Whatever ``word_tokenize`` returns for the cleaned text.
    """
    # Raw strings: "\s" in a plain string literal is an invalid escape sequence.
    t = re.sub(r"\s\s+", " ", x)
    # One pass covers both "\n" and "\r"; a separate "[\n]+" pass is redundant.
    t = re.sub(r"[\r\n]+", " ", t)
    return word_tokenize(t)

In [None]:
# Both vectorizers get exactly the same settings, so only the weighting differs.
shared_options = {'tokenizer': tokenizer, 'token_pattern': None, 'min_df': 2}
count_vectorizer = CountVectorizer(**shared_options)
tfidf_vectorizer = TfidfVectorizer(**shared_options)

In [None]:
# Fit each vectorizer on the same documents: C from the count vectorizer,
# T from the tf-idf vectorizer.
C = count_vectorizer.fit_transform(documents)
T = tfidf_vectorizer.fit_transform(documents)

In [None]:
# Print the shapes of both matrices side by side.
print(C.shape, T.shape)

In [None]:
# Split both representations in ONE call so they share the same train/test
# rows; two independent unseeded calls would evaluate the two models on
# different documents. The fixed seed makes the split reproducible.
C_train, C_test, T_train, T_test, y_train, y_test = train_test_split(C, T, y, random_state=42)
# The labels are identical for both representations; keep the original names.
c_train, c_test = y_train, y_test
t_train, t_test = y_train, y_test

#### Classification test
We use **Born** in order to explore feature selection

In [None]:
from bornrule import BornClassifier

In [None]:
# Fit a BornClassifier on the count-vectorized training split, then predict
# the held-out split.
born_c = BornClassifier()
born_c.fit(C_train, c_train)
c_pred = born_c.predict(C_test)

In [None]:
# Same procedure with a separate BornClassifier on the tf-idf representation.
born_t = BornClassifier()
born_t.fit(T_train, t_train)
t_pred = born_t.predict(T_test)

#### Evaluation and explanation

In [None]:
from sklearn.metrics import classification_report

In [None]:
# One report per representation, each preceded by its title.
reports = (
    ('Count Vectorizer', c_test, c_pred),
    ('TfIdf Vectorizer', t_test, t_pred),
)
for title, truth, predicted in reports:
    print(title)
    print(classification_report(truth, predicted, zero_division=0))

In [None]:
# Table built from the count-based model's explain() output, indexed by the
# count vectorizer's feature names.
# NOTE(review): columns=range(1, 6) presumably assumes the classes are the
# ratings 1..5 in that order — confirm against the classifier's class order.
E = pd.DataFrame(born_c.explain().toarray(), index=count_vectorizer.get_feature_names_out(), columns=range(1, 6))

In [None]:
# The 20 rows of E with the largest values in column 5.
E.sort_values(5, ascending=False).head(20)