## Q1 Write a python code to remove punctuations, URLs and stop words.

In [None]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
example_input = "I love reading articles on AI, ML, and robotics. One of my favorite websites for this is https://www.example-ai-ml.com. The content there is fantastic! But sometimes, the articles can be a bit lengthy and repetitive. I often visit the website to learn new things and improve my knowledge in these fields."

In [None]:
def remove_punctuations(text):
    # Remove punctuations using regular expressions
    cleaned_text = re.sub(r'[^\w\s]', '', text)
    return cleaned_text

# Remove punctuations
cleaned_text = remove_punctuations(example_input)
print("Text without punctuations:")
print(cleaned_text)

Text without punctuations:
I love reading articles on AI ML and robotics One of my favorite websites for this is httpswwwexampleaimlcom The content there is fantastic But sometimes the articles can be a bit lengthy and repetitive I often visit the website to learn new things and improve my knowledge in these fields


In [None]:
def remove_urls(text):
    # Remove URLs using regular expressions
    cleaned_text = re.sub(r'http\S+|www\S+', '', text)
    return cleaned_text

# Remove URLs
cleaned_text = remove_urls(cleaned_text)
print("Text without URLs:")
print(cleaned_text)

Text without URLs:
I love reading articles on AI ML and robotics One of my favorite websites for this is  The content there is fantastic But sometimes the articles can be a bit lengthy and repetitive I often visit the website to learn new things and improve my knowledge in these fields


In [None]:
def remove_stop_words(text):
    # Remove stop words using nltk library
    stop_words = set(stopwords.words('english'))
    words = text.split()
    cleaned_words = [word for word in words if word.lower() not in stop_words]
    cleaned_text = ' '.join(cleaned_words)
    return cleaned_text

# Remove stop words
cleaned_text = remove_stop_words(cleaned_text)
print("Text without stop words:")
print(cleaned_text)

Text without stop words:
love reading articles AI ML robotics One favorite websites content fantastic sometimes articles bit lengthy repetitive often visit website learn new things improve knowledge fields


## Q2 Write a python code perform stemmer operation using Porterstemmer, Snowballstemmer, Lancasterstemmer, RegExpStemmer

In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import RegexpStemmer

In [None]:
# PorterStemmer
porter = PorterStemmer()
words = ["Connects" ,"Connecting","Connections","Connected","Connection","Connectings","Connect"]
for word in words:
  print(word,"\t---> ",porter.stem(word))

Connects 	--->  connect
Connecting 	--->  connect
Connections 	--->  connect
Connected 	--->  connect
Connection 	--->  connect
Connectings 	--->  connect
Connect 	--->  connect


In [None]:
snowball = SnowballStemmer(language="english")
words = ["generous","generate","generously","generation"]
for word in words:
  print(word,"\t---> ",snowball.stem(word))

generous 	--->  generous
generate 	--->  generat
generously 	--->  generous
generation 	--->  generat


In [None]:
lancaster = LancasterStemmer()
words = ["eating","eats","eaten","puts","putting"]
for word in words:
  print(word,"\t---> ",lancaster.stem(word))

eating 	--->  eat
eats 	--->  eat
eaten 	--->  eat
puts 	--->  put
putting 	--->  put


In [None]:
regexp = RegexpStemmer('ing$|s$|e$|able$', min=4)
words = ['mass','was','bee','computer','advisable']
for word in words:
  print(word,"\t--->",regexp.stem(word))

mass 	---> mas
was 	---> was
bee 	---> bee
computer 	---> computer
advisable 	---> advis


## Q3 Write a python code to demonstrate the comparative study of all 4 stemmers for a given text corpus.

In [None]:
porter = PorterStemmer()
snowball = SnowballStemmer(language="english")
lancaster = LancasterStemmer()
regexp = RegexpStemmer('ing$|s$|e$|able$', min=4)

words = ['computer','advisable','eating','dancing','generous','joyfully']
stemmed_words = []
for word in words:
  stemmed_words.append([word, porter.stem(word), snowball.stem(word), lancaster.stem(word), regexp.stem(word)])

from tabulate import tabulate

headers = ["WORD", 'PORTER STEM', 'SNOWBALL STEM', 'LANCASTER STEM', 'REGEXP STEM']
print(tabulate(stemmed_words, headers = headers))

WORD       PORTER STEM    SNOWBALL STEM    LANCASTER STEM    REGEXP STEM
---------  -------------  ---------------  ----------------  -------------
computer   comput         comput           comput            computer
advisable  advis          advis            adv               advis
eating     eat            eat              eat               eat
dancing    danc           danc             dant              danc
generous   gener          generous         gen               generou
joyfully   joy            joy              joy               joyfully


## Q4 Write a python code perform lemmatization using NLTK library.

In [2]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('wordnet')
from tabulate import tabulate

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [4]:
wnl = WordNetLemmatizer()
words = ['dances', 'employees', 'rashes', 'experiments', 'converts']
headers = ['word', 'lemmatized word']
lemm_words = []
for word in words:
  lemm_words.append([word,wnl.lemmatize(word)])
print(tabulate(lemm_words, headers = headers))


word         lemmatized word
-----------  -----------------
dances       dance
employees    employee
rashes       rash
experiments  experiment
converts     convert


## Q5 Write a python code perform lemmatization using Spacy library.

In [6]:
import spacy

In [7]:
nlp = spacy.load('en_core_web_sm')
words = ['happening', 'employments', 'rashes', 'experimental', 'conversion']
lemm_words = []
for word in words:
  doc = nlp(word)
  lemm_words.append([word, doc[0].lemma_])
print(tabulate(lemm_words, headers = headers))

word          lemmatized word
------------  -----------------
happening     happen
employments   employment
rashes        rashe
experimental  experimental
conversion    conversion


##Q6 Compare the results lemmatization with Spacy and NLTK for the corpus given below-
##walking, is , main, animals , foxes, are, jumping , sleeping.
##Write your conclusion for the results obtained.

In [None]:
wnl = WordNetLemmatizer()
words = ['walking', 'is ', 'main', 'animals' , 'foxes', 'are', 'jumping ', 'sleeping']
lemm_words = []
headers = ["WORD", 'NLTK (ADJECTIVE)', 'NLTK (NOUN)', 'NLTK (VERB)', 'NLTK (ADVERB)', 'SPACY']
for word in words:
  doc = nlp(word)
  lemm_words.append([word, wnl.lemmatize(word, pos = 'a'), wnl.lemmatize(word, pos = 'n'), wnl.lemmatize(word, pos = 'v'), wnl.lemmatize(word, pos = 'r'), doc[0].lemma_])

print(tabulate(lemm_words, headers = headers))

WORD      NLTK (ADJECTIVE)    NLTK (NOUN)    NLTK (VERB)    NLTK (ADVERB)    SPACY
--------  ------------------  -------------  -------------  ---------------  -------
walking   walking             walking        walk           walking          walk
is        is                  is             is             is               be
main      main                main           main           main             main
animals   animals             animal         animals        animals          animal
foxes     foxes               fox            fox            foxes            fox
are       are                 are            be             are              be
jumping   jumping             jumping        jumping        jumping          jump
sleeping  sleeping            sleeping       sleep          sleeping         sleep


#Post Lab Questions:
## What all python Libraries are available to work with Indian languages like Hindi, Punjabi, Marathi..etc?

##Marathi

In [13]:
!pip install inltk --quiet
!pip install torch==1.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html --quiet

[31mERROR: Could not find a version that satisfies the requirement torch==1.3.1+cpu (from versions: 1.11.0, 1.11.0+cpu, 1.11.0+cu102, 1.11.0+cu113, 1.11.0+cu115, 1.11.0+rocm4.3.1, 1.11.0+rocm4.5.2, 1.12.0, 1.12.0+cpu, 1.12.0+cu102, 1.12.0+cu113, 1.12.0+cu116, 1.12.0+rocm5.0, 1.12.0+rocm5.1.1, 1.12.1, 1.12.1+cpu, 1.12.1+cu102, 1.12.1+cu113, 1.12.1+cu116, 1.12.1+rocm5.0, 1.12.1+rocm5.1.1, 1.13.0, 1.13.0+cpu, 1.13.0+cu116, 1.13.0+cu117, 1.13.0+cu117.with.pypi.cudnn, 1.13.0+rocm5.1.1, 1.13.0+rocm5.2, 1.13.1, 1.13.1+cpu, 1.13.1+cu116, 1.13.1+cu117, 1.13.1+cu117.with.pypi.cudnn, 1.13.1+rocm5.1.1, 1.13.1+rocm5.2, 2.0.0, 2.0.0+cpu, 2.0.0+cpu.cxx11.abi, 2.0.0+cu117, 2.0.0+cu117.with.pypi.cudnn, 2.0.0+cu118, 2.0.0+rocm5.3, 2.0.0+rocm5.4.2, 2.0.1, 2.0.1+cpu, 2.0.1+cpu.cxx11.abi, 2.0.1+cu117, 2.0.1+cu117.with.pypi.cudnn, 2.0.1+cu118, 2.0.1+rocm5.3, 2.0.1+rocm5.4.2)[0m[31m
[0m[31mERROR: No matching distribution found for torch==1.3.1+cpu[0m[31m
[0m

In [14]:
import collections.abc
collections.Iterable = collections.abc.Iterable
collections.Mapping = collections.abc.Mapping
collections.MutableSet = collections.abc.MutableSet
collections.MutableMapping = collections.abc.MutableMapping

In [None]:
from inltk.inltk import setup
setup('mr')

In [15]:
import inltk
from inltk.inltk import tokenize
marathi_text = "केलेल्या अनेक कष्टांना विचारता आपले जीवन सुंदर झाले, याचं सारंगी आपल्या मनाला सुगंधित करणारं एक आनंददायी अनुभव आहे."
tokens = tokenize(marathi_text, "mr")
print(tokens)

['▁केलेल्या', '▁अनेक', '▁कष्ट', 'ांना', '▁विचार', 'ता', '▁आपले', '▁जीवन', '▁सुंदर', '▁झाले', ',', '▁या', 'चं', '▁सारंगी', '▁आपल्या', '▁मनाला', '▁सुगंध', 'ित', '▁करणार', 'ं', '▁एक', '▁आनंददाय', 'ी', '▁अनुभव', '▁आहे', '.']


###Hindi

In [17]:
from inltk.inltk import setup
setup('hi')

RuntimeError: ignored

Downloading Model. This might take time, depending on your internet connection. Please be patient.
We'll only do this for the first time.


In [19]:
import inltk
from inltk.inltk import tokenize
hindi_text = "मैंने अपने दोस्त को पुरानी यादें ताजा करने के लिए एक साथीक यात्रा पर बुलाया, जहां हमने बहुत मजेदार और रोमांचक लम्हों को जीवंत किया।"
tokens = tokenize(hindi_text, "hi")
print(tokens)

['▁मैंने', '▁अपने', '▁दोस्त', '▁को', '▁पुरानी', '▁याद', 'ें', '▁ताजा', '▁करने', '▁के', '▁लिए', '▁एक', '▁साथ', 'ीक', '▁यात्रा', '▁पर', '▁बुलाया', ',', '▁जहां', '▁हमने', '▁बहुत', '▁मजेदार', '▁और', '▁रोमांचक', '▁लम्', 'हों', '▁को', '▁जीवंत', '▁किया', '।']


###Gujarati

In [22]:
from inltk.inltk import setup
setup('gu')

RuntimeError: ignored

Done!


In [23]:
import inltk
from inltk.inltk import tokenize
gujarati_text = "મારા મિત્રોને જુઓની યાદોને તાજી કરવા માટે મારી સાથીની યાત્રાએ બોજ મોજનારા અને ઉલ્લાસદાયી પલાઓ જીવંત બનાવ્યા."
tokens = tokenize(gujarati_text , "gu")
print(tokens)

['▁મારા', '▁મિત્રો', 'ને', '▁જુઓ', 'ની', '▁યાદ', 'ોને', '▁તાજી', '▁કરવા', '▁માટે', '▁મારી', '▁સાથી', 'ની', '▁યાત્રા', 'એ', '▁બોજ', '▁મોજ', 'નારા', '▁અને', '▁ઉલ્લાસ', 'દા', 'યી', '▁પલા', 'ઓ', '▁જીવંત', '▁બનાવ્યા', '.']
