In [1]:
#Text Mining and Bag of Words

#This notebook demonstrates basic text mining concepts and shows how the Bag of Words representation converts textual data into numerical features.

In [2]:
### Problem Statement

#The objective of this task is to understand how textual data can be converted into numerical form using the Bag of Words model, which can then be used in machine learning algorithms.


In [3]:
# Toy corpus used throughout the notebook: each string is one document.
documents = [
    "I love data science",
    "Data science is very interesting",
    "I love machine learning",
    "Machine learning is part of data science"
]

In [4]:
#Tokenization

In [5]:
# Normalise case first, then split each document on whitespace so that
# every document becomes a list of lowercase word tokens.
lowercased_docs = map(str.lower, documents)
tokenized_docs = list(map(str.split, lowercased_docs))
tokenized_docs

[['i', 'love', 'data', 'science'],
 ['data', 'science', 'is', 'very', 'interesting'],
 ['i', 'love', 'machine', 'learning'],
 ['machine', 'learning', 'is', 'part', 'of', 'data', 'science']]

In [6]:
#Creating Vocabulary

In [7]:
# The vocabulary is the set of distinct tokens across all documents,
# sorted alphabetically so the column order of the matrix is stable.
vocabulary = sorted(set().union(*tokenized_docs))
vocabulary

['data',
 'i',
 'interesting',
 'is',
 'learning',
 'love',
 'machine',
 'of',
 'part',
 'science',
 'very']

In [8]:
from collections import Counter

import pandas as pd

# Build the Bag of Words matrix: one row per document, one column per
# vocabulary word, each cell holding how often that word occurs in the document.
bow_matrix = []

for doc in tokenized_docs:
    # Count every token once per document instead of rescanning the whole
    # document for each vocabulary word.
    term_counts = Counter(doc)
    # A Counter returns 0 for words that do not occur in this document.
    bow_matrix.append([term_counts[word] for word in vocabulary])

# Label the columns with the vocabulary so each count is traceable to its word.
bow_df = pd.DataFrame(bow_matrix, columns=vocabulary)
bow_df

Unnamed: 0,data,i,interesting,is,learning,love,machine,of,part,science,very
0,1,1,0,0,0,1,0,0,0,1,0
1,1,0,1,1,0,0,0,0,0,1,1
2,0,1,0,0,1,1,1,0,0,0,0
3,1,0,0,1,1,0,1,1,1,1,0


In [9]:
### Conclusion

#The Bag of Words model provides a simple yet effective way to convert text into numerical form. Although it ignores word order and context, it serves
#as a foundation for many text mining and natural language processing tasks.
