In [1]:
# !pip install gensim

In [2]:
from gensim.models import Word2Vec

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
corpus = ["Natural language processing is a field of artificial intelligence.",
         "It involves the interaction between computers and humans using natural language.",
         "NLP techniques are used in various applications like chatbots and language translation.",
         "Word embeddings capture semantic relationships between words in a continuous vector space.",
         ]

In [5]:
def one_hot_encoding(text):
    words = text.split()
    print(words)
    word_to_index = {word: i for i, word in enumerate(set(words))}
    print(word_to_index)

In [6]:
def one_hot_encoding(text):
    words = text.split()
    vocab_size = len(set(words))
    word_to_index = {word: i for i, word in enumerate(set(words))}
    one_hot_vector = [0] * vocab_size
    for word in words:
        index = word_to_index[word]
        one_hot_vector[index] = 1
    return one_hot_vector

In [7]:
text = corpus[0]

In [8]:
text

'Natural language processing is a field of artificial intelligence.'

In [9]:
one_hot_vector=one_hot_encoding(text)

In [10]:
one_hot_vector

[1, 1, 1, 1, 1, 1, 1, 1, 1]

In [11]:
def bag_of_words(corpus):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    return X.toarray(), vectorizer.get_feature_names_out()

In [12]:
bow_vector, feature_names = bag_of_words(corpus)

In [13]:
bow_vector

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
        0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1]], dtype=int64)

In [14]:
feature_names

array(['and', 'applications', 'are', 'artificial', 'between', 'capture',
       'chatbots', 'computers', 'continuous', 'embeddings', 'field',
       'humans', 'in', 'intelligence', 'interaction', 'involves', 'is',
       'it', 'language', 'like', 'natural', 'nlp', 'of', 'processing',
       'relationships', 'semantic', 'space', 'techniques', 'the',
       'translation', 'used', 'using', 'various', 'vector', 'word',
       'words'], dtype=object)

In [16]:
# !pip install jupyterthemes

In [20]:
def ngram(text,n):
    words = text.split()
    ngrams = [tuple(words[i:i+n]) for i in range(len(words)-n+1)] 
    return ngrams

ngram = ngram(text,3)
print(ngram)

[('Natural', 'language', 'processing'), ('language', 'processing', 'is'), ('processing', 'is', 'a'), ('is', 'a', 'field'), ('a', 'field', 'of'), ('field', 'of', 'artificial'), ('of', 'artificial', 'intelligence.')]


In [21]:
def word_embeddings(sentences, vector_size=100, window=5, min_count=1, workers=4):
    model = Word2Vec(sentences, vector_size=vector_size,window=window,min_count=min_count,workers=workers)
    return model 

In [22]:
embedding_model = word_embeddings([text.split()])

In [23]:
text

'Natural language processing is a field of artificial intelligence.'

In [26]:
embedding_model.wv['artificial']

array([-8.6196875e-03,  3.6657380e-03,  5.1898835e-03,  5.7419385e-03,
        7.4669183e-03, -6.1676754e-03,  1.1056137e-03,  6.0472824e-03,
       -2.8400505e-03, -6.1735227e-03, -4.1022300e-04, -8.3689485e-03,
       -5.6000124e-03,  7.1045388e-03,  3.3525396e-03,  7.2256695e-03,
        6.8002474e-03,  7.5307419e-03, -3.7891543e-03, -5.6180597e-04,
        2.3483764e-03, -4.5190323e-03,  8.3887316e-03, -9.8581640e-03,
        6.7646410e-03,  2.9144168e-03, -4.9328315e-03,  4.3981876e-03,
       -1.7395747e-03,  6.7113843e-03,  9.9648498e-03, -4.3624435e-03,
       -5.9933780e-04, -5.6956373e-03,  3.8508223e-03,  2.7866268e-03,
        6.8910765e-03,  6.1010956e-03,  9.5384968e-03,  9.2734173e-03,
        7.8980681e-03, -6.9895042e-03, -9.1558648e-03, -3.5575271e-04,
       -3.0998408e-03,  7.8943167e-03,  5.9385742e-03, -1.5456629e-03,
        1.5109634e-03,  1.7900408e-03,  7.8175711e-03, -9.5101865e-03,
       -2.0553112e-04,  3.4691966e-03, -9.3897223e-04,  8.3817719e-03,
      

In [29]:
def skip_gram(sentences, vector_size=100, window=5, min_count=1, workers=4):
    model = Word2Vec(sentences, vector_size=vector_size, window= window, min_count= min_count, workers= workers)
    return model


skip_gram_vector = skip_gram([text.split()])


In [31]:
skip_gram_vector.wv['artificial']

array([-8.6196875e-03,  3.6657380e-03,  5.1898835e-03,  5.7419385e-03,
        7.4669183e-03, -6.1676754e-03,  1.1056137e-03,  6.0472824e-03,
       -2.8400505e-03, -6.1735227e-03, -4.1022300e-04, -8.3689485e-03,
       -5.6000124e-03,  7.1045388e-03,  3.3525396e-03,  7.2256695e-03,
        6.8002474e-03,  7.5307419e-03, -3.7891543e-03, -5.6180597e-04,
        2.3483764e-03, -4.5190323e-03,  8.3887316e-03, -9.8581640e-03,
        6.7646410e-03,  2.9144168e-03, -4.9328315e-03,  4.3981876e-03,
       -1.7395747e-03,  6.7113843e-03,  9.9648498e-03, -4.3624435e-03,
       -5.9933780e-04, -5.6956373e-03,  3.8508223e-03,  2.7866268e-03,
        6.8910765e-03,  6.1010956e-03,  9.5384968e-03,  9.2734173e-03,
        7.8980681e-03, -6.9895042e-03, -9.1558648e-03, -3.5575271e-04,
       -3.0998408e-03,  7.8943167e-03,  5.9385742e-03, -1.5456629e-03,
        1.5109634e-03,  1.7900408e-03,  7.8175711e-03, -9.5101865e-03,
       -2.0553112e-04,  3.4691966e-03, -9.3897223e-04,  8.3817719e-03,
      