In [1]:
# Toy corpus shared by every vectorization / embedding example below:
# three short English sentences, one string per document.
documents: list[str] = [
    "Natural language processing is fun",
    "I love learning NLP",
    "NLP helps computers understand language",
]


In [2]:
import numpy as np
import pandas as pd


In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-Words: learn the vocabulary from `documents` and count how often each
# vocabulary term occurs in each document (one row per document).
count_vectorizer = CountVectorizer()
bow_counts = count_vectorizer.fit_transform(documents)

# Label the columns with the learned terms so the table is readable.
bow_vocabulary = count_vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(data=bow_counts.toarray(), columns=bow_vocabulary)

print("Bag-of-Words (Count Occurrence):")
bow_df


Bag-of-Words (Count Occurrence):


Unnamed: 0,computers,fun,helps,is,language,learning,love,natural,nlp,processing,understand
0,0,1,0,1,1,0,0,1,0,1,0
1,0,0,0,0,0,1,1,0,1,0,0
2,1,0,1,0,1,0,0,0,1,0,1


In [4]:
# Normalized Bag-of-Words: convert each document's raw term counts into relative
# frequencies by dividing every row by that row's total count.
bow_dense = bow_counts.toarray()  # densify once; reused as numerator and for the row totals
row_totals = bow_dense.sum(axis=1, keepdims=True)  # keepdims keeps a column vector that broadcasts across terms

# A document with no in-vocabulary tokens has a row total of 0; a plain division
# would produce NaN and a RuntimeWarning. `where=` skips those rows, so they keep
# the zeros pre-filled in `out`.
normalized_bow = np.divide(
    bow_dense,
    row_totals,
    out=np.zeros(bow_dense.shape, dtype=float),
    where=row_totals != 0,
)

normalized_bow_df = pd.DataFrame(
    normalized_bow,
    columns=count_vectorizer.get_feature_names_out()
)

print("Normalized Bag-of-Words:")
normalized_bow_df


Normalized Bag-of-Words:


Unnamed: 0,computers,fun,helps,is,language,learning,love,natural,nlp,processing,understand
0,0.0,0.2,0.0,0.2,0.2,0.0,0.0,0.2,0.0,0.2,0.0
1,0.0,0.0,0.0,0.0,0.0,0.333333,0.333333,0.0,0.333333,0.0,0.0
2,0.2,0.0,0.2,0.0,0.2,0.0,0.0,0.0,0.2,0.0,0.2


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF: fit a default TfidfVectorizer on `documents` and transform the same
# corpus into a document-by-term weight matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Label the columns with the learned terms so the table is readable.
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=tfidf_terms)

print("TF-IDF Matrix:")
tfidf_df


TF-IDF Matrix:


Unnamed: 0,computers,fun,helps,is,language,learning,love,natural,nlp,processing,understand
0,0.0,0.467351,0.0,0.467351,0.355432,0.0,0.0,0.467351,0.0,0.467351,0.0
1,0.0,0.0,0.0,0.0,0.0,0.622766,0.622766,0.0,0.47363,0.0,0.0
2,0.490479,0.0,0.490479,0.0,0.373022,0.0,0.0,0.0,0.373022,0.0,0.490479


In [7]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the tokenizer resources requested by this notebook (downloaded in the
# same order as before: 'punkt' first, then 'punkt_tab').
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)

# Lower-case each document and split it into word tokens; the result is one
# token list per document.
tokenized_docs = [word_tokenize(text.lower()) for text in documents]
print(tokenized_docs)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...


[['natural', 'language', 'processing', 'is', 'fun'], ['i', 'love', 'learning', 'nlp'], ['nlp', 'helps', 'computers', 'understand', 'language']]


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [9]:
!pip install gensim
from gensim.models import Word2Vec

w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=50,
    window=5,
    min_count=1,
    workers=2
)

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m64.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [10]:
# Look up the learned vector for the token 'nlp' and show it as the cell output.
print("Word2Vec embedding for 'nlp':")
nlp_embedding = w2v_model.wv['nlp']
nlp_embedding


Word2Vec embedding for 'nlp':


array([-1.0724545e-03,  4.7286271e-04,  1.0206699e-02,  1.8018546e-02,
       -1.8605899e-02, -1.4233618e-02,  1.2917745e-02,  1.7945977e-02,
       -1.0030856e-02, -7.5267432e-03,  1.4761009e-02, -3.0669428e-03,
       -9.0732267e-03,  1.3108104e-02, -9.7203208e-03, -3.6320353e-03,
        5.7531595e-03,  1.9837476e-03, -1.6570430e-02, -1.8897636e-02,
        1.4623532e-02,  1.0140524e-02,  1.3515387e-02,  1.5257311e-03,
        1.2701781e-02, -6.8107317e-03, -1.8928028e-03,  1.1537147e-02,
       -1.5043275e-02, -7.8722071e-03, -1.5023164e-02, -1.8600845e-03,
        1.9076237e-02, -1.4638334e-02, -4.6675373e-03, -3.8754821e-03,
        1.6154874e-02, -1.1861792e-02,  9.0324880e-05, -9.5074680e-03,
       -1.9207101e-02,  1.0014586e-02, -1.7519170e-02, -8.7836506e-03,
       -7.0199967e-05, -5.9236289e-04, -1.5322480e-02,  1.9229487e-02,
        9.9641159e-03,  1.8466286e-02], dtype=float32)