# Tutorial 01: Implementing Bag-of-Words Model for Information Retrieval

## Step 1: Preprocessing

### Importing the necessary libraries

In [1]:
import nltk
from nltk.tokenize import word_tokenize

### Downloading the required resources from the NLTK library

In [2]:
# Download the 'punkt' tokenizer package; word_tokenize (used below) needs it.
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource
# instead -- confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

### Defining a function for preprocessing a single document

In [5]:
def preprocess_document(document):
    """Turn a raw text string into a list of lowercase word tokens.

    The text is split with NLTK's ``word_tokenize``. Only tokens consisting
    entirely of alphabetic characters are kept, so punctuation, numbers and
    mixed tokens are discarded. Every kept token is lowercased.

    Args:
        document: The raw document text.

    Returns:
        list[str]: The lowercased alphabetic tokens, in their original order.
    """
    words = []
    for raw_token in word_tokenize(document):
        # str.isalpha() is False for punctuation, digits and mixed tokens.
        if raw_token.isalpha():
            words.append(raw_token.lower())
    return words

## Step 2: Creating the Bag-of-Words Representation

In [6]:
def create_bow(document, vocabulary):
    """Build the bag-of-words count vector of a tokenized document.

    Args:
        document: Iterable of hashable tokens (typically strings).
        vocabulary: Sequence of terms. Position ``i`` of the result counts
            how often ``vocabulary[i]`` occurs in ``document``.

    Returns:
        list[int]: A vector of length ``len(vocabulary)`` holding the term
        counts. Tokens that are not in the vocabulary are ignored.
    """
    # Map each term to its position once, instead of scanning the vocabulary
    # list twice per token (membership test + .index()).
    term_positions = {}
    for position, term in enumerate(vocabulary):
        # setdefault keeps the first position of a repeated term, which is
        # the position list.index() would return.
        term_positions.setdefault(term, position)

    bow_vector = [0] * len(vocabulary)
    for token in document:
        position = term_positions.get(token)
        if position is not None:
            bow_vector[position] += 1
    return bow_vector

In [7]:
def create_vocabulary(documents):
    """Collect the distinct tokens of all documents into a sorted list.

    Args:
        documents: Iterable of tokenized documents, each an iterable of
            tokens.

    Returns:
        list: Every distinct token across all documents, in sorted order.
    """
    unique_terms = {term for tokens in documents for term in tokens}
    return sorted(unique_terms)

## Step 3: Implementation

In [8]:
# Sample corpus: four short English sentences, one string per document.
# The list order determines the order of the per-document results below.
documents = [
    "I love natural language processing",
    "Information retrieval is an interesting topic",
    "Natural language processing and information retrieval are related",
    "I enjoy working on information retrieval projects"
]

In [13]:
# Tokenize and normalize every raw document, then show the token lists.
preprocessed_documents = list(map(preprocess_document, documents))
print(preprocessed_documents)

# Build the sorted vocabulary shared by all documents.
vocabulary = create_vocabulary(preprocessed_documents)

# Count the vocabulary terms in each document: one vector per document,
# in the same order as `documents`.
bow_vectors = [
    create_bow(tokens, vocabulary) for tokens in preprocessed_documents
]
print(bow_vectors)

[['i', 'love', 'natural', 'language', 'processing'], ['information', 'retrieval', 'is', 'an', 'interesting', 'topic'], ['natural', 'language', 'processing', 'and', 'information', 'retrieval', 'are', 'related'], ['i', 'enjoy', 'working', 'on', 'information', 'retrieval', 'projects']]
[[0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0], [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1]]


In [16]:
import pandas as pd

# Tabular view of the BoW vectors: one row per document, one column per
# vocabulary position (column labels are positional integers, not the terms).
df = pd.DataFrame(data=bow_vectors)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,0,0
1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,1,1,0
2,0,1,1,0,0,1,0,0,1,0,1,0,1,0,1,1,0,0
3,0,0,0,1,1,1,0,0,0,0,0,1,0,1,0,1,0,1
