In [9]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------- -------------------------------- 2.4/12.8 MB 12.2 MB/s eta 0:00:01
     -------------- ------------------------- 4.7/12.8 MB 11.9 MB/s eta 0:00:01
     ---------------------- ----------------- 7.3/12.8 MB 11.6 MB/s eta 0:00:01
     --------------------------- ------------ 8.7/12.8 MB 11.7 MB/s eta 0:00:01
     --------------------------- ------------ 8.7/12.8 MB 11.7 MB/s eta 0:00:01
     --------------------------- ------------ 8.7/12.8 MB 11.7 MB/s eta 0:00:01
     -------------------------------- ------- 10.5/12.8 MB 7.2 MB/s eta 0:00:01
     ---------------------------------------  12.6/12.8 MB 7.6 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 7.6 MB/s eta 0:00:00
Installing collected packages: e

In [5]:
pip install spacy


Collecting spacy
  Downloading spacy-3.8.4-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.12-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.11-cp312-cp312-win_amd64.whl.metadata (8.8 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.4-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.5.1-cp312-cp312-win_amd64

In [15]:
import nltk
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\CVR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CVR\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
import spacy
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

In [19]:
# Load spaCy's NLP model
nlp = spacy.load("en_core_web_sm")
 

In [21]:
# Input text
text = "John works at Google in California. He loves programming and playing football."

In [23]:
# 1. Segmentation (sentence tokenization)
sentences = sent_tokenize(text)
print("Segmentation:", sentences)
 

Segmentation: ['John works at Google in California.', 'He loves programming and playing football.']


In [25]:
# 2. Tokenization (word tokenization)
tokens = [word_tokenize(sentence) for sentence in sentences]
print("Tokenization:", tokens)

Tokenization: [['John', 'works', 'at', 'Google', 'in', 'California', '.'], ['He', 'loves', 'programming', 'and', 'playing', 'football', '.']]


In [27]:
# 3. Stemming
stemmer = PorterStemmer()
stemmed_tokens = [[stemmer.stem(token) for token in sentence] for sentence in tokens]
print("Stemming:", stemmed_tokens)

Stemming: [['john', 'work', 'at', 'googl', 'in', 'california', '.'], ['he', 'love', 'program', 'and', 'play', 'footbal', '.']]


In [29]:
# 4. Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [[lemmatizer.lemmatize(token) for token in sentence] for sentence in tokens]
print("Lemmatization:", lemmatized_tokens)
 

Lemmatization: [['John', 'work', 'at', 'Google', 'in', 'California', '.'], ['He', 'love', 'programming', 'and', 'playing', 'football', '.']]


In [31]:
# 5. POS Tagging
doc = nlp(text)
pos_tags = [(token.text, token.pos_) for token in doc]
print("POS Tagging:", pos_tags)
 

POS Tagging: [('John', 'PROPN'), ('works', 'VERB'), ('at', 'ADP'), ('Google', 'PROPN'), ('in', 'ADP'), ('California', 'PROPN'), ('.', 'PUNCT'), ('He', 'PRON'), ('loves', 'VERB'), ('programming', 'VERB'), ('and', 'CCONJ'), ('playing', 'VERB'), ('football', 'NOUN'), ('.', 'PUNCT')]


In [33]:
# 6. Named Entity Recognition (NER)
entities = [(entity.text, entity.label_) for entity in doc.ents]
print("Named Entities:", entities)

Named Entities: [('John', 'PERSON'), ('Google', 'ORG'), ('California', 'GPE')]


In [35]:
# 7. Parsing (Dependency Parsing)
for sent in doc.sents:
    for token in sent:
        print(f'{token.text:10} -> {token.dep_:10} -> {token.head.text}')

John       -> nsubj      -> works
works      -> ROOT       -> works
at         -> prep       -> works
Google     -> pobj       -> at
in         -> prep       -> works
California -> pobj       -> in
.          -> punct      -> works
He         -> nsubj      -> loves
loves      -> ROOT       -> loves
programming -> xcomp      -> loves
and        -> cc         -> programming
playing    -> conj       -> programming
football   -> dobj       -> playing
.          -> punct      -> loves
