In [3]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# Fetch the NLTK resources used below (nothing is re-downloaded when a package
# is already up to date).
# 'punkt_tab' is the tokenizer data that sent_tokenize/word_tokenize load on
# recent NLTK releases (3.9+); 'punkt' is kept for older releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ankit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ankit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Toy corpus: two English sentences about NLP. This is the only text the
# later cells tokenize and train on. The literal starts and ends with a newline.
corpus = """
Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language.
The ultimate goal of NLP is to enable computers to understand, interpret, and generate human language.
"""

In [5]:
# Split the raw corpus string into sentences; each sentence is preprocessed separately later on.
sentences = sent_tokenize(corpus)

In [6]:
# Quick inspection of the tokenizer's return type (the recorded output shows `list`).
type(sentences)

list

In [7]:
# English stopwords held in a set, so the per-token membership test in the
# preprocessing step is a hash lookup rather than a list scan.
stopW = set(stopwords.words('english'))

# Container for the tokenized, stopword-free sentences.
processed_sentences = []


In [9]:
def preprocess_sentence(sentence, stop_words):
    """Normalize one sentence into a list of content-word tokens.

    Args:
        sentence: Raw sentence string.
        stop_words: Collection of lowercase words to drop.

    Returns:
        The sentence's tokens, lowercased, with punctuation and stopwords
        removed, in their original order.
    """
    # Lowercase first, then strip every character that is neither a word
    # character nor whitespace.
    cleaned = re.sub(r'[^\w\s]', '', sentence.lower())
    return [token for token in word_tokenize(cleaned) if token not in stop_words]


# Assign a fresh list instead of appending to an existing one, so re-running
# this cell cannot duplicate sentences in the training data.
processed_sentences = [preprocess_sentence(sent, stopW) for sent in sentences]

# Final processed data
print(processed_sentences)

[['natural', 'language', 'processing', 'nlp', 'field', 'artificial', 'intelligence', 'focuses', 'interaction', 'computers', 'humans', 'natural', 'language'], ['ultimate', 'goal', 'nlp', 'enable', 'computers', 'understand', 'interpret', 'generate', 'human', 'language']]


In [12]:
# gensim is required by the cells below; if it is missing, install it into the kernel's environment with: %pip install gensim


In [11]:
from gensim.models import Word2Vec

# Train a skip-gram Word2Vec model on the tokenized sentences.
# Training is stochastic: per the gensim docs, repeatable vectors need a fixed
# seed *and* a single worker thread (plus PYTHONHASHSEED set before the kernel
# starts, for repeatability across interpreter launches).
model = Word2Vec(
    sentences=processed_sentences,  # list of tokenized sentences
    vector_size=100,                # dimensionality of word vectors
    window=2,                       # context window size
    min_count=1,                    # keep every word, even those seen only once
    sg=1,                           # 1 = Skip-Gram, 0 = CBOW
    workers=1,                      # single thread: no thread-scheduling jitter
    seed=1,                         # explicit RNG seed (gensim's default value)
    epochs=100                      # # of training iterations
)


In [13]:
# Persist the trained model; the relative path resolves against the current working directory.
model.save("skipgram_word2vec.model")


In [14]:
# Load the saved model back and rebind `model`, so the queries that follow run against the reloaded copy.
model = Word2Vec.load("skipgram_word2vec.model")

In [15]:
# Show every word the model learned a vector for.
# NOTE(review): the recorded order looks most-frequent-first — confirm against the gensim docs.
print(model.wv.index_to_key)  # List of words in vocab

['language', 'natural', 'nlp', 'computers', 'focuses', 'processing', 'field', 'artificial', 'intelligence', 'human', 'generate', 'humans', 'ultimate', 'goal', 'enable', 'understand', 'interpret', 'interaction']


In [16]:
# Look up the learned embedding for one vocabulary word (vector_size=100 was set at training time).
print(model.wv['language'])  # Vector for the word 'language'


[-1.3187095e-03  1.2350722e-03  5.8473037e-03  9.2951693e-03
 -8.8877045e-03 -7.7542122e-03  7.1425787e-03  1.0321400e-02
 -5.8120610e-03 -4.5881798e-03  7.7912915e-03 -2.6766502e-03
 -4.8213499e-03  6.3572335e-03 -4.6341312e-03 -1.9697361e-03
  3.3729256e-03  1.1767640e-03 -9.0509467e-03 -1.1164593e-02
  7.8347856e-03  5.6690876e-03  8.3271936e-03  6.7085651e-04
  5.6954161e-03 -2.6823948e-03 -1.1062155e-03  5.9890305e-03
 -8.0015352e-03 -3.9047145e-03 -7.4699936e-03 -1.7046328e-03
  9.8675909e-03 -8.0135381e-03 -2.4315338e-03 -1.2301764e-03
  8.8280523e-03 -5.9090531e-03 -8.3310518e-04 -4.6948809e-03
 -8.8789547e-03  4.5614010e-03 -9.4836541e-03 -3.5064893e-03
  8.3853287e-04 -3.5647498e-04 -8.4183803e-03  9.1853468e-03
  5.3080167e-03  9.2860665e-03 -8.2492856e-03  4.1164495e-03
 -3.8698900e-03  1.6463343e-04  8.0873501e-03 -4.7026007e-03
  4.8787110e-03 -7.1137217e-03 -3.2413711e-03  9.8163206e-03
 -1.7456070e-03 -1.9136902e-04 -3.1132421e-03 -7.5114374e-03
 -1.6218456e-03  3.10179

In [17]:
# Five vocabulary words whose vectors are closest to 'language', as
# (word, similarity) pairs, best match first.
similar_words = model.wv.most_similar(positive=['language'], topn=5)
print(similar_words)


[('interpret', 0.26646050810813904), ('human', 0.21585491299629211), ('artificial', 0.1391703337430954), ('ultimate', 0.12347730249166489), ('humans', 0.11402726918458939)]


In [18]:
# Similarity between the vectors of two specific vocabulary words.
word_a, word_b = 'language', 'computers'
score = model.wv.similarity(word_a, word_b)
print(score)


-0.032543037


In [20]:
# Analogy-style query: words close to 'intelligence' + 'human' - 'computers'.
added_terms = ['intelligence', 'human']
removed_terms = ['computers']
result = model.wv.most_similar(positive=added_terms, negative=removed_terms, topn=3)
print(result)


[('language', 0.19086071848869324), ('nlp', 0.10178055614233017), ('understand', 0.08204130828380585)]


In [21]:
# Vocabulary membership test: 'data' does not occur in the training corpus, so this prints False.
print('data' in model.wv.key_to_index)


False
