In [76]:
import re

import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

## Paragraph

In [77]:
# Raw sample corpus for the notebook: a short passage about Python.
# NOTE(review): a few sentences have no terminating period before the line break
# (e.g. "...large-scale projects", "...standard library") and two have no space
# after the period ('release."No more', 'it.With'); this likely influences where
# the sentence tokenizer splits the text later on.
paragraph = """Python is an interpreted, high-level, general-purpose programming language. 
Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability 
with its notable use of significant whitespace. Its language constructs and object-oriented approach aim to 
help programmers write clear, logical code for small and large-scale projects
Python is dynamically typed and garbage-collected. It supports multiple programming paradigms, 
including structured (particularly, procedural), object-oriented, and functional programming. 
Python is often described as a "batteries included" language due to its comprehensive standard library
Python was conceived in the late 1980s as a successor to the ABC language. Python 2.0, released in 2000, 
introduced features like list comprehensions and a garbage collection system capable of 
collecting reference cycles. Python 3.0, released in 2008, was a major revision of the 
language that is not completely backward-compatible, and much Python 2 code does not run unmodified on Python 3.
The Python 2 language was officially discontinued in 2020 (first planned for 2015), and "Python 2.7.18 is the 
last Python 2.7 release and therefore the last Python 2 release."No more security patches or other 
improvements will be released for it.With Python 2's end-of-life, only Python and later are supported.
Python interpreters are available for many operating systems. A global community of programmers develops and 
maintains CPython, an open source[34] reference implementation. A non-profit organization, the Python Software 
Foundation, manages and directs resources for Python and CPython development."""

## Pre-Processing The Data

#### Removing Citation Markers And Digits, Lowercasing, And Collapsing Whitespace

In [78]:
# Normalise the raw paragraph before tokenisation. Requires the `re` module
# (imported with the other imports at the top of the notebook).
#
# 1. Drop bracketed citation markers such as "[34]" first, so the digit pass
#    below does not leave empty "[ ]" brackets behind.
text = re.sub(r'\[[0-9]*\]', ' ', paragraph)
# 2. Lowercase, then blank out every remaining digit (years, version numbers).
text = re.sub(r'\d', ' ', text.lower())
# 3. Collapse all whitespace runs (newlines plus the gaps left by steps 1-2)
#    into single spaces. One final pass is enough: an earlier collapse would be
#    undone by the digit replacement and repeated here anyway.
text = re.sub(r'\s+', ' ', text)

#### Tokenizing Into Sentences

In [79]:
# Split the cleaned text into a list of sentence strings.
# NOTE(review): sent_tokenize presumably needs NLTK's Punkt tokenizer data to be
# installed locally (nltk.download) — confirm on a fresh environment.
sentences = nltk.sent_tokenize(text)
# Last expression of the cell: shows how many sentences were found.
len(sentences)

17

#### Removing Stop Words And Applying Lemmatization

In [80]:
# Single lemmatizer instance, reused for every token in the preprocessing loop.
# NOTE(review): presumably requires the NLTK WordNet corpus to be downloaded — confirm.
lemmatizer = WordNetLemmatizer()

In [81]:
# Build the stop-word lookup once. The previous version evaluated
# stopwords.words('english') inside the comprehension condition, i.e. once per
# token, and then scanned the resulting list linearly. A set built up front
# gives the same filtering with constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

# One inner list per sentence: word-tokenise, drop stop words, lemmatize the rest.
# Punctuation tokens are not filtered here and therefore remain in the output.
new_para = [
    [lemmatizer.lemmatize(token)
     for token in nltk.word_tokenize(sentence)
     if token not in english_stopwords]
    for sentence in sentences
]

# new_para has the shape [[tok, tok, ...], [tok, ...], ...] — a list of token lists.

## Word2Vec

#### Creating Model

In [86]:
# Train Word2Vec on the tokenised sentences; the input must be a list of token
# lists: [[...], [...], ...].
#
# min_count=1 : words occurring fewer than `min_count` times are ignored; the
#               corpus is tiny, so keep every word, even those seen only once.
# seed=1      : gensim's default seed, spelled out so the choice is visible.
# workers=1   : a single training thread removes thread-scheduling jitter, so
#               re-running the cell yields the same vectors. (Reproducibility
#               across interpreter launches on Python 3 additionally needs the
#               PYTHONHASHSEED environment variable to be fixed.)
model = Word2Vec(new_para, min_count=1, seed=1, workers=1)

In [87]:
# Display the model object's repr as a quick check that it was created.
model

<gensim.models.word2vec.Word2Vec at 0x7ff35e69d1d0>

#### Listing All Words Taken By Model

In [92]:
# Vocabulary learned by the model, as a dict keyed by word.
# gensim >= 4.0 removed `KeyedVectors.vocab` (accessing it raises AttributeError)
# and exposes the vocabulary as `key_to_index` (word -> integer index);
# gensim 3.x only provides `vocab` (word -> Vocab object). Prefer the new
# attribute when present and fall back to the old one otherwise.
words = model.wv.key_to_index if hasattr(model.wv, "key_to_index") else model.wv.vocab

In [93]:
# Display the vocabulary mapping; note that punctuation tokens such as ',' and
# '.' appear as entries because they were not filtered during preprocessing.
words

{'python': <gensim.models.keyedvectors.Vocab at 0x7ff35e6957d0>,
 'interpreted': <gensim.models.keyedvectors.Vocab at 0x7ff35e695950>,
 ',': <gensim.models.keyedvectors.Vocab at 0x7ff35e695c10>,
 'high-level': <gensim.models.keyedvectors.Vocab at 0x7ff35e6956d0>,
 'general-purpose': <gensim.models.keyedvectors.Vocab at 0x7ff35e6711d0>,
 'programming': <gensim.models.keyedvectors.Vocab at 0x7ff35e69d310>,
 'language': <gensim.models.keyedvectors.Vocab at 0x7ff35e69d410>,
 '.': <gensim.models.keyedvectors.Vocab at 0x7ff35e69d450>,
 'created': <gensim.models.keyedvectors.Vocab at 0x7ff35e695c50>,
 'guido': <gensim.models.keyedvectors.Vocab at 0x7ff35e69d350>,
 'van': <gensim.models.keyedvectors.Vocab at 0x7ff35e69d490>,
 'rossum': <gensim.models.keyedvectors.Vocab at 0x7ff35e69d4d0>,
 'first': <gensim.models.keyedvectors.Vocab at 0x7ff35e69d510>,
 'released': <gensim.models.keyedvectors.Vocab at 0x7ff35e69d550>,
 "'s": <gensim.models.keyedvectors.Vocab at 0x7ff35e69d590>,
 'design': <gens

#### Getting Vector Of Any Word

In [94]:
# Look up the embedding vector the model learned for the token 'programming'.
vector = model.wv['programming']

In [95]:
# Display the embedding as a NumPy array of floats.
vector

array([ 0.00237744,  0.00464998, -0.00496389, -0.00418612,  0.0028864 ,
       -0.0013614 , -0.00193879,  0.00154607,  0.00059281, -0.00328708,
       -0.00039325,  0.00345769,  0.00281165,  0.00245763,  0.00287415,
        0.00390994,  0.0026428 , -0.00363835,  0.00486249,  0.00271834,
       -0.0029558 , -0.00302681,  0.00171569, -0.00370322,  0.00280445,
       -0.00195872,  0.00109434, -0.00298335, -0.00317063,  0.00226641,
       -0.00231398, -0.00124258,  0.00483412, -0.00222271,  0.00360735,
       -0.00209982, -0.00172873,  0.00170555,  0.00119465, -0.00352196,
        0.00100923,  0.00189963, -0.00036752,  0.0002633 , -0.00154922,
       -0.0033674 , -0.00250891, -0.00311234, -0.00454363, -0.00083514,
        0.00234391,  0.00042497, -0.00378361,  0.00278021, -0.00389331,
       -0.00126199,  0.00341026, -0.00482291,  0.00279927,  0.00133561,
        0.00473418,  0.0045723 , -0.00220584,  0.00471628,  0.0020957 ,
       -0.00271961,  0.00060845,  0.00433557, -0.00419068, -0.00

#### Getting Similar Words To Any Word

In [96]:
# Ask the model for the vocabulary entries closest to 'programming'; the result
# is a list of (word, similarity score) pairs, highest score first.
similar_words = model.wv.most_similar('programming')

In [98]:
similar_words # words that lie nearest to 'programming' in the vector space

[('resource', 0.22052551805973053),
 ('often', 0.20188382267951965),
 ("'s", 0.1920350044965744),
 ('discontinued', 0.16033625602722168),
 ('completely', 0.14215143024921417),
 ('maintains', 0.14009740948677063),
 ('emphasizes', 0.1390928477048874),
 ('reference', 0.1388399749994278),
 ('procedural', 0.13238975405693054),
 ('organization', 0.13017770648002625)]