# Counter Vectorizer

## The problem

Classify each document in a list under one of three labels: "sports", "arts", or "science".

In [17]:
# Corpus: five example sentences for each of the three topics.

sports = [
    "The New York Yankees won the World Series.",
    "The Golden State Warriors won the NBA championship.",
    "Usain Bolt won the gold medal in the 100-meter dash.",
    "Serena Williams won the Wimbledon women's singles championship.",
    "Lionel Messi won the Ballon d'Or award.",
]

science = [
    "Scientists discovered a new planet outside of our solar system.",
    "Researchers developed a new vaccine for malaria.",
    "Engineers created a new type of solar cell that is more efficient.",
    "Physicists found a new way to create fusion energy.",
    "Biologists sequenced the genome of the human microbiome.",
]

art = [
    "Pablo Picasso painted Guernica.",
    "Vincent van Gogh painted The Starry Night.",
    "Michelangelo sculpted David.",
    "Leonardo da Vinci painted the Mona Lisa.",
    "Frida Kahlo painted Self-Portrait with Thorn Necklace and Hummingbird.",
]

# Full corpus, ordered sports -> science -> art.
documents = [*sports, *science, *art]

# Keywords associated with each topic.
sports_words = ["won", "championship", "medal"]
science_words = ["discovered", "energy", "planet", "cell", "vaccine"]
art_words = ["painted", "sculpted"]

# Combined keyword list, ordered sports -> science -> art.
vocabulary = [*sports_words, *science_words, *art_words]

In [18]:
# Counting the number of occurrences of each word in the documents
from collections import Counter
from scipy.sparse import lil_matrix
import numpy as np
import string

# Document-term count matrix: one row per document, one column per vocabulary word.
# No dtype is passed, so counts are stored as floats (visible in the printed output).
bag_words = lil_matrix((len(documents), len(vocabulary)))

# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every get_words call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def get_words(text: str) -> list:
    """Split *text* into lowercase words with ASCII punctuation removed.

    Punctuation characters are deleted, not replaced by spaces, so
    "women's" becomes "womens" and "100-meter" becomes "100meter".

    Args:
        text: The string to tokenize.

    Returns:
        The whitespace-separated words of the cleaned, lowercased text;
        an empty list when nothing remains.
    """
    return text.translate(_PUNCTUATION_TABLE).lower().split()

# Fill one matrix row per document with the count of each vocabulary word.
for row, text in enumerate(documents):
    word_counts = Counter(get_words(text))
    # Counter yields 0 for words that do not occur in the document.
    bag_words[row, :] = [word_counts[term] for term in vocabulary]

# Convert the filled LIL matrix to CSR, then print it as a dense array.
bag_words = bag_words.tocsr()
print(bag_words.toarray())
    
    

[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]]
