[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/boruizhang/representations/blob/main/notebook.ipynb)

### Bag-of-Words (BoW) is one of the simplest and most interpretable text representations. Text data is converted into a numerical representation in which each sentence becomes a vector of word counts.


In [None]:
# Install scikit-learn, the only package this notebook installs explicitly.
# %pip (rather than !pip) installs into the environment of the running kernel.
# User guide: https://scikit-learn.org/stable/user_guide.html
%pip install scikit-learn

In [None]:
# Toy corpus: six short sentences built from a small, overlapping set of words.
docs = [
    "Dog bites man.",
    "Man bites dog.",
    "Dog eats meat.",
    "Man eats food.",
    "Cat likes milk and fish.",
    "Dog likes meat and food.",
]

# Normalise every sentence: drop the periods and lowercase the text.
processed_docs = [sentence.replace(".", "").lower() for sentence in docs]

# Look at the cleaned documents list
print("Our corpus: ", processed_docs)


## We use CountVectorizer from sklearn to learn a vocabulary and count word occurrences

In [None]:
"""
  Build a BoW representation for the corpus:
  Step 1 (fit): scan all documents, collect the unique words, build the vocabulary, and assign column indices
  Step 2 (transform): use the learned vocabulary to produce a document × term matrix
  In short, preprocessing 'texts --> numbers'
"""
# Feature-extraction guide: https://scikit-learn.org/stable/modules/feature_extraction.html
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer() # create the CountVectorizer object; it has no vocabulary yet, that is learned in .fit
bow_rep = count_vect.fit_transform(processed_docs) # two steps in one line: fit(processed_docs) and transform(processed_docs)
bow_rep.shape   # dimension check; as the last expression of the cell, the shape is displayed

In [None]:
import matplotlib.pyplot as plt

# Draw the dense BoW count matrix as an image: one row per document,
# one column per vocabulary word, with a colorbar for the cell values.
fig, ax = plt.subplots()
image = ax.imshow(bow_rep.toarray())
fig.colorbar(image, ax=ax)
ax.set_xlabel("Word index")
ax.set_ylabel("Document index")
ax.set_title("Bag-of-Words Matrix")
plt.show()

In [None]:
# Look at the vocabulary mapping learned by the vectorizer (word -> column index)
print("Our vocabulary: ", count_vect.vocabulary_)

In [None]:
# Look at the word <-> index mapping: list the vocabulary in column order,
# printing each column index next to its word.
feature_names = count_vect.get_feature_names_out()
for column, term in enumerate(feature_names):
    print(column, term)

In [None]:
# See the BoW representation of the first and last documents of the corpus
print("BoW representation for 'dog bites man': ", bow_rep[0].toarray())  # sparse row -> dense array
print("BoW representation for 'dog likes meat and food': ", bow_rep[-1].toarray())

# Represent a new text with the already-learned vocabulary (transform only, no re-fit).
# Every word of this sentence is in the vocabulary.
temp = count_vect.transform(["dog and dog eats fish and food"])
print("BoW representation for 'dog and dog eats fish and food':", temp.toarray())

# Represent a new text that is mostly outside the vocabulary: words the vectorizer
# never saw during fit are ignored, so only "and" is counted here.
temp = count_vect.transform(["human and animal are friends"])
print("BoW representation for 'human and animal are friends':", temp.toarray())

## TF-IDF (Term Frequency-Inverse Document Frequency)

In [None]:
# CountVectorizer is re-imported here but not used in this cell; seaborn (sns) is
# imported here for the heatmap cell that follows.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import seaborn as sns

# Fit a TF-IDF vectorizer on the same preprocessed corpus and transform it in one call
tfidf_vect = TfidfVectorizer()
tfidf_rep = tfidf_vect.fit_transform(processed_docs)

# Inspect the result: matrix dimensions and the learned word -> column index mapping
print(f"TF-IDF Matrix Shape: {tfidf_rep.shape}")
print("TF-IDF vocabulary: ", tfidf_vect.vocabulary_)


In [None]:
# TF-IDF heatmap via seaborn https://seaborn.pydata.org/generated/seaborn.heatmap.html
# Rows are documents, columns are vocabulary words, and every cell is annotated
# with its TF-IDF score rounded to two decimals.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(
    tfidf_rep.toarray(),
    annot=True,
    fmt='.2f',
    cmap='viridis',
    xticklabels=tfidf_vect.get_feature_names_out(),
    # Row labels are sized from the plotted matrix itself, so they always match
    # its rows regardless of which document list it was built from.
    yticklabels=[f"Doc {i+1}" for i in range(tfidf_rep.shape[0])],
    cbar_kws={'label': 'TF-IDF Score'},
    ax=ax,
)
ax.set_xlabel("Words", fontsize=12)
ax.set_ylabel("Documents", fontsize=12)
ax.set_title("TF-IDF Matrix Heatmap", fontsize=14, fontweight='bold')
# Slant the word labels so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()



In [None]:
# Existing text: rows of the fitted matrix for the first and last corpus documents
first_doc_tfidf = tfidf_rep[0].toarray()
last_doc_tfidf = tfidf_rep[-1].toarray()
print("\nTF-IDF for 'dog bites man': ", first_doc_tfidf)
print("TF-IDF for 'dog likes meat and food': ", last_doc_tfidf)


# New text: reuse the fitted vectorizer (transform only, no re-fit)
temp_tfidf = tfidf_vect.transform(["dog and dog eats fish and food"])
print("\nTF-IDF for 'dog and dog eats fish and food':", temp_tfidf.toarray())

temp_tfidf = tfidf_vect.transform(["human and animal are friends"])
print("TF-IDF for 'human and animal are friends':", temp_tfidf.toarray())

In [None]:
# Check document similarity: cosine similarity between the TF-IDF vectors of the documents
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd  # used by the next cell to tabulate the similarities


# Called with a single matrix, so the rows of tfidf_rep are compared with each other
sim_matrix = cosine_similarity(tfidf_rep)
print("Similarity matrix shape:", sim_matrix.shape)
print("\nSimilarity raw matrix:\n", sim_matrix)

In [None]:
# Optionally make the display nicer: wrap the raw similarity matrix in a DataFrame
# whose rows and columns are labelled "Doc 1", "Doc 2", ...
# Labels are sized from the matrix itself so they always match its dimensions.
labels = [f"Doc {i+1}" for i in range(sim_matrix.shape[0])]
sim_df = pd.DataFrame(sim_matrix, index=labels, columns=labels)
print("\nSimilarity table:")
# End the cell with the frame instead of print() so Jupyter renders it as a rich table
sim_df.round(3)

### These representations treat every word as independent. Later, we'll see how Skip-gram and CBOW learn vectors where similar words are close together.