# 1. Corpus Preprocessing

In [2]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

corpus = '''India, officially the Republic of India (Hindi: Bhārat Gaṇarājya),[25] is a country in South Asia...'''

# Remove the citation markers "[25]" and "[f]" and every ")" character.
# Only the closing parenthesis is stripped; any "(" stays in the text.
for unwanted in ("[25]", "[f]", ")"):
    corpus = corpus.replace(unwanted, "")
print(corpus)

India, officially the Republic of India (Hindi: Bhārat Gaṇarājya, is a country in South Asia...


# 2. Stop Words Removal

In [3]:
# Build the stop-word lookup once. The original called stopwords.words('english')
# for every token and then searched the returned list linearly; a set built a
# single time gives the same membership answers without the repeated work.
english_stopwords = set(stopwords.words('english'))

# Keep the lower-cased form of every token that is not a stop word and is at
# least two characters long (the length test drops one-character tokens such
# as "," or "(").
words = []
for word in word_tokenize(corpus):
    lowered = word.lower()
    if lowered not in english_stopwords and len(word) >= 2:
        words.append(lowered)

print("Filtered Words:", words)

Filtered Words: ['india', 'officially', 'republic', 'india', 'hindi', 'bhārat', 'gaṇarājya', 'country', 'south', 'asia', '...']


# 3. Building Vocabulary

In [4]:
# De-duplicate the filtered tokens. A set has no defined iteration order, so
# the position of each word in `vocab` (and therefore the sample shown below)
# is not guaranteed to repeat from one kernel session to the next.
vocab = [*set(words)]
print("Vocabulary Size:", len(vocab))
print("Sample Vocabulary:", vocab[0:5])

Vocabulary Size: 10
Sample Vocabulary: ['country', 'india', 'south', 'republic', 'bhārat']


# 4. Creating Encoders and Decoders

In [6]:
# Give every vocabulary entry a 1-based integer id (in `vocab` order) and build
# the two lookup tables: word -> id for encoding, id -> word for decoding.
ids = range(1, len(vocab) + 1)
word_to_num = dict(zip(vocab, ids))
num_to_word = dict(zip(ids, vocab))

print("Word-to-Number:", word_to_num['south'])
print("Number-to-Word:", num_to_word[2])

Word-to-Number: 3
Number-to-Word: india


# Sentence Tokenizer

In [7]:
# Split the corpus into sentences and show each one on its own line.
sentences = sent_tokenize(corpus)
for sentence in sentences:
    print(sentence)

India, officially the Republic of India (Hindi: Bhārat Gaṇarājya, is a country in South Asia...


In [8]:
# Tokenize sentence by sentence and print the token list of each sentence.
for sentence in sent_tokenize(corpus):
    tokens = word_tokenize(sentence)
    print(tokens)

['India', ',', 'officially', 'the', 'Republic', 'of', 'India', '(', 'Hindi', ':', 'Bhārat', 'Gaṇarājya', ',', 'is', 'a', 'country', 'in', 'South', 'Asia', '...']


In [9]:
# Build the stop-word lookup once instead of calling stopwords.words('english')
# for every token inside the nested loops; membership results are unchanged.
english_stopwords = set(stopwords.words('english'))

# For each sentence, print the tokens (original casing) that are not stop words
# and have at least two characters, separated by spaces, one sentence per line.
for sent in sent_tokenize(corpus):
    for word in word_tokenize(sent):
        if word.lower() not in english_stopwords and len(word) >= 2:
            print(word, end=' ')
    print()

India officially Republic India Hindi Bhārat Gaṇarājya country South Asia ... 


In [10]:
# Rebuild the filtered word list, the vocabulary and both lookup tables.

# Build the stop-word lookup once; the original re-requested the stop-word list
# for every token and searched it linearly.
english_stopwords = set(stopwords.words('english'))

# Lower-cased tokens that are not stop words and are at least 2 characters long.
words = []
for word in word_tokenize(corpus):
    lowered = word.lower()
    if lowered not in english_stopwords and len(word) >= 2:
        words.append(lowered)

# Unique tokens. The bare `len(vocab)` expression that used to follow was not
# the last statement of the cell, so it displayed nothing and has been removed.
vocab = list(set(words))

# 1-based ids in `vocab` order: word -> id for encoding, id -> word for decoding.
word_to_num = {}
num_to_word = {}
for num, word in enumerate(vocab, start=1):
    word_to_num[word] = num
    num_to_word[num] = word

In [11]:
# Build the stop-word lookup once instead of calling stopwords.words('english')
# for every token inside the nested loops; membership results are unchanged.
english_stopwords = set(stopwords.words('english'))

# For each sentence, print every kept token (original casing) followed by the
# id of its lower-cased form, all separated by spaces, one sentence per line.
for sent in sent_tokenize(corpus):
    for word in word_tokenize(sent):
        lowered = word.lower()
        if lowered not in english_stopwords and len(word) >= 2:
            print(word, word_to_num[lowered], end=' ')
    print()

India 2 officially 7 Republic 4 India 2 Hindi 8 Bhārat 5 Gaṇarājya 9 country 1 South 3 Asia 10 ... 6 


In [12]:
# Build the stop-word lookup once instead of calling stopwords.words('english')
# for every token inside the nested loops; membership results are unchanged.
english_stopwords = set(stopwords.words('english'))

# Encode the corpus sentence by sentence: each sentence becomes the list of ids
# of its kept tokens (not a stop word, at least 2 characters, lower-cased for
# the lookup). Each encoded sentence is printed, followed by a blank line.
data = []
for sent in sent_tokenize(corpus):
    temp = [
        word_to_num[word.lower()]
        for word in word_tokenize(sent)
        if word.lower() not in english_stopwords and len(word) >= 2
    ]
    print(temp)
    data.append(temp)
    print()

[2, 7, 4, 2, 8, 5, 9, 1, 3, 10, 6]



In [13]:
# Show every encoded sentence (a list of word ids) on its own line.
for encoded_sentence in data:
    print(encoded_sentence)

[2, 7, 4, 2, 8, 5, 9, 1, 3, 10, 6]


In [14]:
# Decode: map every id back to its word and print one sentence per line,
# with a space after each word.
for encoded_sentence in data:
    for word_id in encoded_sentence:
        decoded = num_to_word[word_id]
        print(decoded, end=' ')
    print()

india officially republic india hindi bhārat gaṇarājya country south asia ... 


# Text Encoding - Decoding | Without Stop Word Removal

In [15]:
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords

corpus = '''India, officially the Republic of India (Hindi: Bhārat Gaṇarājya),[25] is a country in South Asia. It is the seventh-largest country by area, the second-most populous country, and the most populous democracy in the world. Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west;[f] China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand, Myanmar, and Indonesia.'''

# Remove the citation markers "[25]" and "[f]" and every ")" character.
# Only the closing parenthesis is stripped; any "(" stays in the text.
corpus = (
    corpus
    .replace("[25]", "")
    .replace("[f]", "")
    .replace(")", "")
)

In [16]:
# Collect the lower-cased form of every token. A one-character token is kept
# only when it is an ASCII letter (A-Z or a-z); any other one-character token,
# e.g. punctuation, is skipped. Tokens of any other length are always kept.
words = []
for token in word_tokenize(corpus):
    if len(token) == 1 and not ('a' <= token <= 'z' or 'A' <= token <= 'Z'):
        continue
    words.append(token.lower())

In [17]:
# Unique tokens of the corpus. Set iteration order is arbitrary, so the order
# of `vocab` is not guaranteed to repeat between kernel sessions.
vocab = [*set(words)]
vocab_size = len(vocab)
print(vocab_size)

61


In [18]:
# Number the vocabulary from 1 in `vocab` order and build both lookup tables:
# word -> id for encoding and id -> word for decoding.
word_to_num = {word: num for num, word in enumerate(vocab, start=1)}
num_to_word = {num: word for num, word in enumerate(vocab, start=1)}

In [19]:
# Encode the corpus sentence by sentence. Every token is replaced by the id of
# its lower-cased form; one-character tokens that are not ASCII letters
# (A-Z / a-z) are skipped, tokens of any other length are always encoded.
data = []
for sentence in sent_tokenize(corpus):
    encoded_sentence = [
        word_to_num[token.lower()]
        for token in word_tokenize(sentence)
        if len(token) != 1 or 'a' <= token <= 'z' or 'A' <= token <= 'Z'
    ]
    data.append(encoded_sentence)
print(data)

[[19, 9, 28, 21, 47, 19, 31, 25, 11, 35, 61, 48, 4, 54, 43], [56, 35, 28, 39, 48, 3, 8, 28, 42, 12, 48, 44, 28, 20, 12, 40, 4, 28, 52], [49, 3, 28, 7, 22, 46, 28, 54, 28, 50, 59, 46, 28, 14, 44, 28, 30, 47, 16, 46, 28, 15, 56, 45, 38, 51, 53, 27, 17, 28, 24, 60, 37, 44, 18, 17, 28, 33, 44, 26, 44, 58, 17, 28, 1], [4, 28, 7, 22, 19, 35, 4, 28, 2, 47, 13, 6, 44, 28, 34, 55, 10, 44, 57, 36, 29, 61, 41, 5, 53, 32, 58, 44, 23]]


In [20]:
# Decode: turn every id back into its word and print one sentence per line,
# with a space after each word.
for encoded_sentence in data:
    for word_id in encoded_sentence:
        decoded = num_to_word[word_id]
        print(decoded, end=' ')
    print()

india officially the republic of india hindi bhārat gaṇarājya is a country in south asia 
it is the seventh-largest country by area the second-most populous country and the most populous democracy in the world 
bounded by the indian ocean on the south the arabian sea on the southwest and the bay of bengal on the southeast it shares land borders with pakistan to the west china nepal and bhutan to the north and bangladesh and myanmar to the east 
in the indian ocean india is in the vicinity of sri lanka and the maldives its andaman and nicobar islands share a maritime border with thailand myanmar and indonesia 
