In [1]:
!pip install nltk spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import nltk
import spacy
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords


In [4]:
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
medical_text = """
Diabetes mellitus is a chronic condition characterized by elevated blood glucose levels.
Patients with diabetes often require insulin therapy and lifestyle modifications.
Early diagnosis and proper management can prevent complications such as neuropathy,
retinopathy, and cardiovascular diseases.
"""


In [8]:
nltk.download('punkt_tab')
sentences_nltk = sent_tokenize(medical_text)
print("NLTK Sentence Tokenization:")
sentences_nltk

[nltk_data] Downloading package punkt_tab to /root/nltk_data...


NLTK Sentence Tokenization:


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['\nDiabetes mellitus is a chronic condition characterized by elevated blood glucose levels.',
 'Patients with diabetes often require insulin therapy and lifestyle modifications.',
 'Early diagnosis and proper management can prevent complications such as neuropathy,\nretinopathy, and cardiovascular diseases.']

In [9]:
sentences_nltk = sent_tokenize(medical_text)
print("NLTK Sentence Tokenization:")
sentences_nltk


NLTK Sentence Tokenization:


['\nDiabetes mellitus is a chronic condition characterized by elevated blood glucose levels.',
 'Patients with diabetes often require insulin therapy and lifestyle modifications.',
 'Early diagnosis and proper management can prevent complications such as neuropathy,\nretinopathy, and cardiovascular diseases.']

In [10]:
words_nltk = word_tokenize(medical_text)
print("NLTK Word Tokenization:")
words_nltk


NLTK Word Tokenization:


['Diabetes',
 'mellitus',
 'is',
 'a',
 'chronic',
 'condition',
 'characterized',
 'by',
 'elevated',
 'blood',
 'glucose',
 'levels',
 '.',
 'Patients',
 'with',
 'diabetes',
 'often',
 'require',
 'insulin',
 'therapy',
 'and',
 'lifestyle',
 'modifications',
 '.',
 'Early',
 'diagnosis',
 'and',
 'proper',
 'management',
 'can',
 'prevent',
 'complications',
 'such',
 'as',
 'neuropathy',
 ',',
 'retinopathy',
 ',',
 'and',
 'cardiovascular',
 'diseases',
 '.']

In [11]:
nlp = spacy.load("en_core_web_sm")

doc = nlp(medical_text)

sentences_spacy = [sent.text for sent in doc.sents]
print("spaCy Sentence Tokenization:")
sentences_spacy


spaCy Sentence Tokenization:


['\nDiabetes mellitus is a chronic condition characterized by elevated blood glucose levels.\n',
 'Patients with diabetes often require insulin therapy and lifestyle modifications.\n',
 'Early diagnosis and proper management can prevent complications such as neuropathy,\nretinopathy, and cardiovascular diseases.\n']

In [12]:
words_spacy = [token.text for token in doc if not token.is_punct]
print("spaCy Word Tokenization:")
words_spacy


spaCy Word Tokenization:


['\n',
 'Diabetes',
 'mellitus',
 'is',
 'a',
 'chronic',
 'condition',
 'characterized',
 'by',
 'elevated',
 'blood',
 'glucose',
 'levels',
 '\n',
 'Patients',
 'with',
 'diabetes',
 'often',
 'require',
 'insulin',
 'therapy',
 'and',
 'lifestyle',
 'modifications',
 '\n',
 'Early',
 'diagnosis',
 'and',
 'proper',
 'management',
 'can',
 'prevent',
 'complications',
 'such',
 'as',
 'neuropathy',
 '\n',
 'retinopathy',
 'and',
 'cardiovascular',
 'diseases',
 '\n']

In [13]:
stemmer = PorterStemmer()

stemmed_words = [stemmer.stem(word) for word in words_nltk if word.isalpha()]
print("Stemmed Words:")
stemmed_words


Stemmed Words:


['diabet',
 'mellitu',
 'is',
 'a',
 'chronic',
 'condit',
 'character',
 'by',
 'elev',
 'blood',
 'glucos',
 'level',
 'patient',
 'with',
 'diabet',
 'often',
 'requir',
 'insulin',
 'therapi',
 'and',
 'lifestyl',
 'modif',
 'earli',
 'diagnosi',
 'and',
 'proper',
 'manag',
 'can',
 'prevent',
 'complic',
 'such',
 'as',
 'neuropathi',
 'retinopathi',
 'and',
 'cardiovascular',
 'diseas']

In [14]:
lemmatized_words = [token.lemma_ for token in doc if token.is_alpha]
print("Lemmatized Words:")
lemmatized_words


Lemmatized Words:


['diabetes',
 'mellitus',
 'be',
 'a',
 'chronic',
 'condition',
 'characterize',
 'by',
 'elevated',
 'blood',
 'glucose',
 'level',
 'patient',
 'with',
 'diabete',
 'often',
 'require',
 'insulin',
 'therapy',
 'and',
 'lifestyle',
 'modification',
 'early',
 'diagnosis',
 'and',
 'proper',
 'management',
 'can',
 'prevent',
 'complication',
 'such',
 'as',
 'neuropathy',
 'retinopathy',
 'and',
 'cardiovascular',
 'disease']