In [3]:
pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: nltk
[0mSuccessfully installed nltk-3.9.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
import nltk
# import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
# nlp = spacy.load('en_core_web_sm')

# English sample text: two sentences separated by a single newline.
document = (
    "Natural language processing (NLP) is a sub-field of artificial intelligence (AI) "
    "that focuses on the interaction between computers and human language.\n"
    "The ultimate goal of NLP is to enable computers to better understand, interpret, "
    "and generate human language."
)

# French rendering of the same two sentences, also newline-separated.
document_french = (
    "Le traitement du langage naturel (TALN) est un sous-domaine de l'intelligence artificielle (IA) "
    "qui se concentre sur l'interaction entre les ordinateurs et le langage humain.\n"
    "L'objectif ultime du TALN est de permettre aux ordinateurs de mieux comprendre, "
    "interpréter et générer le langage humain."
)

In [None]:
# Fetch the NLTK data this cell relies on so it also runs on a fresh
# environment (resource names as used by NLTK 3.9): the Punkt sentence models
# behind word_tokenize, the English perceptron POS tagger, and the mapping
# from Penn Treebank tags to the coarse "universal" tagset.
for resource in ('punkt_tab', 'averaged_perceptron_tagger_eng', 'universal_tagset'):
    nltk.download(resource, quiet=True)

# Tokenize the French text with the French sentence model; word_tokenize
# otherwise defaults to language='english'.
tokens = word_tokenize(document_french, language='french')

# POS-tag the English text. The spaCy pipeline `nlp` is never loaded in this
# notebook (its import and load call are commented out), so calling it raised
# NameError. NLTK's tagger is used instead and still yields (text, tag) pairs;
# tagset='universal' asks for coarse tags (NOUN, VERB, ...) instead of Penn
# Treebank tags.
pos_tags = nltk.pos_tag(word_tokenize(document), tagset='universal')

In [None]:
# Make sure the stop-word corpus is available on a fresh environment.
nltk.download('stopwords', quiet=True)

# `tokens` is produced from the French document, so the French stop-word list
# is the matching one (the English list is the wrong language for these tokens).
stop_words = set(stopwords.words('french'))

# Case-insensitive filter: a token is dropped when its lowercase form is a stop word.
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

In [None]:
# Stem the stop-word-filtered tokens with two different stemmers.

# Porter stemmer.
# NOTE(review): Porter's rules target English, while the tokens here come from
# the French text — confirm this comparison is intentional.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))

# Snowball stemmer configured for French.
snow_stemmer = SnowballStemmer(language="french")
snow_stemmed_tokens = list(map(snow_stemmer.stem, filtered_tokens))

In [None]:
# The WordNet corpus must be present before the first lemmatize() call; fetch
# it so the cell also runs on a fresh environment.
nltk.download('wordnet', quiet=True)

# NOTE(review): WordNetLemmatizer looks words up in the English WordNet, so the
# French tokens presumably pass through unchanged — confirm whether a French
# lemmatizer is wanted here.
lemmatizer = WordNetLemmatizer()

# lemmatize() is called with its default part of speech for every token.
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

In [None]:
# Report every stage of the preprocessing pipeline. Each call prints
# "<label>: <value> " followed by a blank line: print() joins its arguments
# with single spaces, and the trailing "\n" argument plus print's own newline
# produce the empty line.
print("Original Tokens:", tokens, "\n")
print("POS Tags:", pos_tags, "\n")
print("Filtered Tokens (Stop Words Removed):", filtered_tokens, "\n")
print("Stemmed Tokens:", stemmed_tokens, "\n")
print("Snow-Stemmed Tokens:", snow_stemmed_tokens, "\n")
print("Lemmatized Tokens:", lemmatized_tokens, "\n")

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Three-document toy corpus for the TF-IDF example: two sentences about NLP
# and one about AI / machine learning in general.
documents = [
    (
        "Natural language processing (NLP) is a sub-field of artificial intelligence (AI) "
        "that focuses on the interaction between computers and human language."
    ),
    (
        "The ultimate goal of NLP is to enable computers to understand, interpret, "
        "and generate human language."
    ),
    "AI and machine learning are transforming the way we live and work.",
]

In [7]:
# TF-IDF model configured to drop scikit-learn's built-in English stop words.
vectorizer = TfidfVectorizer(stop_words="english")

# Learn the vocabulary from `documents` and compute their TF-IDF matrix in one call.
tfidf_matrix = vectorizer.fit_transform(documents)

# Vocabulary terms, in the column order of `tfidf_matrix`.
terms = vectorizer.get_feature_names_out()

In [8]:
# Convert the TF-IDF matrix to a dense array: one row per document, one column
# per vocabulary term.
tfidf_array = tfidf_matrix.toarray()

print("Terms:", terms)

# Loop names are specific to this cell so that running it does not overwrite
# same-named globals created by other cells (top-level loop variables persist
# in the kernel namespace after the loop ends).
for doc_number, tfidf_row in enumerate(tfidf_array, start=1):
    print(f"\nDocument {doc_number} TF-IDF values:")
    # Pair each term with its weight by column position; only terms with a
    # positive weight (i.e. present in the document) are listed.
    for term, weight in zip(terms, tfidf_row):
        if weight > 0:
            print(f"{term}: {weight:.4f}")

Terms: ['ai' 'artificial' 'computers' 'enable' 'field' 'focuses' 'generate'
 'goal' 'human' 'intelligence' 'interaction' 'interpret' 'language'
 'learning' 'live' 'machine' 'natural' 'nlp' 'processing' 'sub'
 'transforming' 'ultimate' 'understand' 'way' 'work']

Document 1 TF-IDF values:
ai: 0.2140
artificial: 0.2814
computers: 0.2140
field: 0.2814
focuses: 0.2814
human: 0.2140
intelligence: 0.2814
interaction: 0.2814
language: 0.4280
natural: 0.2814
nlp: 0.2140
processing: 0.2814
sub: 0.2814

Document 2 TF-IDF values:
computers: 0.2638
enable: 0.3468
generate: 0.3468
goal: 0.3468
human: 0.2638
interpret: 0.3468
language: 0.2638
nlp: 0.2638
ultimate: 0.3468
understand: 0.3468

Document 3 TF-IDF values:
ai: 0.2965
learning: 0.3899
live: 0.3899
machine: 0.3899
transforming: 0.3899
way: 0.3899
work: 0.3899
