In [1]:
# Word Tokenization from Scratch
def word_tokenize(text):
    """Split ``text`` into word tokens and single-character symbol tokens.

    Consecutive alphanumeric characters (as judged by ``str.isalnum``) are
    merged into one token. Any other character ends the word in progress and
    is emitted as a token of its own, unless it is whitespace, which is
    dropped.

    Example: ``"It's"`` becomes ``['It', "'", 's']``.
    """
    tokens = []
    pending = []  # characters of the word currently being assembled

    for symbol in text:
        if symbol.isalnum():
            pending.append(symbol)
            continue

        # A non-alphanumeric character closes whatever word was in progress.
        if pending:
            tokens.append("".join(pending))
            pending = []
        # strip() leaves an empty string for whitespace, so only visible
        # symbols (punctuation etc.) become tokens.
        if symbol.strip():
            tokens.append(symbol)

    # Flush a word that runs up to the very end of the input.
    if pending:
        tokens.append("".join(pending))
    return tokens

# Input text: two short sentences, including a contraction ("It's") so the
# way word_tokenize treats the apostrophe is visible in the result.
text = "I love Python. It's a powerful programming language!"

# Tokenize into words and punctuation marks, then display the token list.
tokens = word_tokenize(text)
print("Word Tokens:", tokens)


Word Tokens: ['I', 'love', 'Python', '.', 'It', "'", 's', 'a', 'powerful', 'programming', 'language', '!']


In [19]:
# Sentence Tokenization from Scratch
def sentence_tokenize(text):
    """Split ``text`` into sentences at ``.``, ``!`` and ``?``.

    A sentence ends at the last character of a run of terminators, so
    ``"Wait... What?!"`` yields ``['Wait...', 'What?!']`` rather than
    emitting every extra terminator as a sentence of its own. Surrounding
    whitespace is stripped from each sentence, and pieces that are empty
    after stripping are discarded, so trailing whitespace in ``text`` never
    produces an ``''`` entry.

    Limitation: this is a purely character-based splitter, so a period
    inside an abbreviation or a decimal number (``"Dr."``, ``"3.14"``) still
    ends a sentence.

    Args:
        text: The string to split.

    Returns:
        A list of sentence strings, each keeping its closing punctuation.
    """
    terminators = ".!?"
    sentences = []
    current = []  # characters of the sentence being built

    def flush():
        # Strip first, then test: a whitespace-only remainder must not be
        # recorded as an empty sentence.
        sentence = "".join(current).strip()
        if sentence:
            sentences.append(sentence)
        current.clear()

    for index, char in enumerate(text):
        current.append(char)
        if char not in terminators:
            continue
        # Keep consuming while the next character is also a terminator so
        # that "..." or "?!" stays attached to its sentence. The explicit
        # emptiness check matters because `"" in terminators` is True.
        following = text[index + 1:index + 2]
        if following and following in terminators:
            continue
        flush()

    # Whatever is left has no closing punctuation; keep it if non-blank.
    flush()
    return sentences

# Input text using all three sentence terminators (".", "!" and "?").
# Note that the "!" right after "I love" ends a sentence as well.
text = "I love! Python. It's a powerful programming language! Do you like coding?"

# Tokenize into sentences and display the resulting list.
sentences = sentence_tokenize(text)
print("Sentence Tokens:", sentences)


Sentence Tokens: ['I love!', 'Python.', "It's a powerful programming language!", 'Do you like coding?']


In [3]:
# Character Tokenization from Scratch
def character_tokenize(text):
    """Return the characters of ``text`` as a list, in their original order."""
    return list(text)

# Input text: a single word, so every token is one of its letters.
text = "Python"

# Tokenize into characters and display the resulting list.
characters = character_tokenize(text)
print("Character Tokens:", characters)


Character Tokens: ['P', 'y', 't', 'h', 'o', 'n']


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample documents: three short texts built from the words bat, cat and dog.
documents = ["dog cat dog", "cat dog bat", "dog bat bat"]

# Initialize CountVectorizer with its default settings.
vectorizer = CountVectorizer()

# Fit and transform the documents in one step: learn the vocabulary and
# build the matrix of per-document term counts.
X = vectorizer.fit_transform(documents)

# Convert the sparse matrix to a dense array so the counts print as a grid
# (one row per document).
print(X.toarray())

# Show the vocabulary. In the recorded output the terms are listed as
# 'bat' 'cat' 'dog', which matches the column order of the matrix above.
print(vectorizer.get_feature_names_out())


[[0 1 2]
 [1 1 1]
 [2 0 1]]
['bat' 'cat' 'dog']


In [5]:
# A tiny corpus: one plain string per document.
corpus = [
    "Natural Language Processing is a field of artificial intelligence.",
    "Text classification and sentiment analysis are popular NLP tasks.",
    "Word embeddings are used to represent words in a continuous vector space."
]

# Whitespace tokenization of every document. Punctuation is not separated,
# so the last word of each sentence keeps its trailing period.
tokens = list(map(str.split, corpus))
print(tokens)


[['Natural', 'Language', 'Processing', 'is', 'a', 'field', 'of', 'artificial', 'intelligence.'], ['Text', 'classification', 'and', 'sentiment', 'analysis', 'are', 'popular', 'NLP', 'tasks.'], ['Word', 'embeddings', 'are', 'used', 'to', 'represent', 'words', 'in', 'a', 'continuous', 'vector', 'space.']]


In [6]:
import nltk
from nltk.stem import PorterStemmer

# Initialize the PorterStemmer
stemmer = PorterStemmer()

# Example words
words = ["running", "runner", "easily", "faster"]

# Apply stemming to each word. A stem is not necessarily a dictionary word:
# in the recorded output "easily" becomes "easili", while "runner" and
# "faster" come back unchanged.
stemmed_words = [stemmer.stem(word) for word in words]
print(stemmed_words)


['run', 'runner', 'easili', 'faster']


In [7]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Example words
words = ["running", "better", "cats"]

# Apply lemmatization, treating every word as a verb (pos=wordnet.VERB).
# In the recorded output "better" comes back unchanged.
# NOTE(review): presumably an adjective tag would be needed to map it to
# "good" - confirm against the NLTK documentation.
lemmatized_words = [lemmatizer.lemmatize(word, pos=wordnet.VERB) for word in words]
print(lemmatized_words)


['run', 'better', 'cat']


In [8]:
import spacy

# Load the pre-trained model.
# NOTE(review): "en_core_web_sm" presumably has to be installed in the
# environment before this call works - confirm the setup steps.
nlp = spacy.load("en_core_web_sm")

# Example text
text = "Apple is looking to buy a startup in the UK."

# Process the text with spaCy; the returned object exposes the entities
# that are read below through `doc.ents`.
doc = nlp(text)

# Extract named entities, printing each entity's text and its label
# (the recorded output shows "Apple - ORG" and "UK - GPE").
for ent in doc.ents:
    print(f"{ent.text} - {ent.label_}")


Apple - ORG
UK - GPE


In [22]:
import nltk
from nltk.util import ngrams

# Example text
text = "I love natural language processing"

# Tokenize the text on whitespace
tokens = text.split()

# Generate bigrams (n=2). The result of ngrams() is wrapped in list() so
# it can be printed as a list of tuples.
bigrams = list(ngrams(tokens, 2))

# Each bigram is a tuple of two neighbouring tokens.
print(bigrams)


[('I', 'love'), ('love', 'natural'), ('natural', 'language'), ('language', 'processing')]


In [10]:
# Example corpus: three short sentences, each ending with a period, used as
# input for tokenize_corpus below.
corpus = [
    "Natural Language Processing is amazing.",
    "Text classification is part of NLP.",
    "Word embeddings are useful in NLP tasks."
]

# Tokenization from scratch
def tokenize_corpus(corpus):
    """Lowercase each sentence, remove punctuation and split on whitespace.

    All ASCII punctuation characters (``string.punctuation``) are deleted,
    not just ``.`` and ``,``, so tokens such as ``"amazing!"`` or ``"nlp?"``
    no longer keep a trailing symbol. Punctuation is removed rather than
    replaced by a space, so ``"it's"`` becomes ``"its"``.

    Args:
        corpus: An iterable of sentence strings.

    Returns:
        A list with one list of lowercase tokens per input sentence.
    """
    # Imported here so the function does not depend on another cell's imports.
    import string

    # Translation table that maps every punctuation character to "deleted".
    strip_punctuation = str.maketrans("", "", string.punctuation)
    return [
        sentence.lower().translate(strip_punctuation).split()
        for sentence in corpus
    ]

# Tokenize the example corpus and display one token list per sentence.
tokenized_corpus = tokenize_corpus(corpus)
print(tokenized_corpus)


[['natural', 'language', 'processing', 'is', 'amazing'], ['text', 'classification', 'is', 'part', 'of', 'nlp'], ['word', 'embeddings', 'are', 'useful', 'in', 'nlp', 'tasks']]


In [11]:
# Basic stemmer
def simple_stemmer(word, min_stem_length=2):
    """Strip one common English suffix from ``word``.

    The suffixes ``ing``, ``ed``, ``ly``, ``es`` and ``s`` are tried in that
    order and only the first match is considered. The suffix is removed only
    if at least ``min_stem_length`` characters remain; otherwise the word is
    returned unchanged. This prevents results such as ``''`` for ``"s"`` or
    ``"i"`` for ``"is"``.

    This is a naive rule-based stemmer: it does not undo consonant doubling,
    so ``"running"`` becomes ``"runn"``.

    Args:
        word: The word to stem.
        min_stem_length: Minimum number of characters that must remain after
            the suffix is removed. Pass 0 to strip unconditionally.

    Returns:
        The stemmed word, or ``word`` itself if no suffix applies.
    """
    # "es" has to be tried before "s" so that "watches" loses both letters.
    suffixes = ("ing", "ed", "ly", "es", "s")
    for suffix in suffixes:
        if word.endswith(suffix):
            stem = word[:-len(suffix)]
            # Refuse to reduce a word to (almost) nothing.
            return stem if len(stem) >= min_stem_length else word
    return word

# Example words: one for each suffix simple_stemmer knows about
# ("ing", "ed", "ly", "s" and "es").
words = ["running", "played", "quickly", "cats", "watches"]

# Apply stemming to each word and display the stems.
stemmed_words = [simple_stemmer(word) for word in words]
print(stemmed_words)


['runn', 'play', 'quick', 'cat', 'watch']


In [12]:
# Simple lemmatizer with a dictionary of base forms
def simple_lemmatizer(word):
    """Look up the base form of ``word`` in a small hard-coded table.

    Words without an entry are returned unchanged. The lookup is an exact,
    case-sensitive match on the given string.
    """
    base_forms = {
        "running": "run",
        "better": "good",
        "cats": "cat",
        "played": "play",
        "faster": "fast",
    }
    # Fall back to the input itself when the table has no entry for it.
    return base_forms[word] if word in base_forms else word

# Example words: each of these has an entry in simple_lemmatizer's table.
words = ["running", "better", "cats", "played"]

# Apply lemmatization to each word and display the base forms.
lemmatized_words = [simple_lemmatizer(word) for word in words]
print(lemmatized_words)


['run', 'good', 'cat', 'play']


In [13]:
# Function to generate n-grams
def generate_ngrams(tokens, n):
    """Return every run of ``n`` consecutive tokens as a tuple.

    Args:
        tokens: An iterable of tokens. It is materialised into a list first,
            so generators and other non-sliceable iterables are accepted.
        n: Size of each n-gram; must be at least 1.

    Returns:
        A list of ``n``-tuples in order of appearance. The list is empty
        when there are fewer than ``n`` tokens.

    Raises:
        ValueError: If ``n`` is smaller than 1. Without this check n=0 would
            silently produce a list of empty tuples.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    items = list(tokens)
    # The last valid window starts at len(items) - n; range() is empty when
    # the input is shorter than n.
    return [tuple(items[start:start + n]) for start in range(len(items) - n + 1)]

# Example text
text = "I love natural language processing and machine learning"

# Tokenize the text on whitespace
tokens = text.split()

# Generate bigrams (n=2)
bigrams = generate_ngrams(tokens, 2)
print("Bigrams:", bigrams)

# Generate trigrams (n=3)
trigrams = generate_ngrams(tokens, 3)
print("Trigrams:", trigrams)

# Generate unigrams (n=1). They get their own variable so that `bigrams`
# above is not overwritten with one-word tuples.
unigrams = generate_ngrams(tokens, 1)
print("unigrams:", unigrams)

Bigrams: [('I', 'love'), ('love', 'natural'), ('natural', 'language'), ('language', 'processing'), ('processing', 'and'), ('and', 'machine'), ('machine', 'learning')]
Trigrams: [('I', 'love', 'natural'), ('love', 'natural', 'language'), ('natural', 'language', 'processing'), ('language', 'processing', 'and'), ('processing', 'and', 'machine'), ('and', 'machine', 'learning')]
unigrams: [('I',), ('love',), ('natural',), ('language',), ('processing',), ('and',), ('machine',), ('learning',)]


In [18]:
text = "Hai! freiends, 3 how are you?"
print(text.split())

res = []     # output tokens, in order of appearance
tokenn = ""  # alphanumeric run currently being collected

# Walk the text one whitespace-separated chunk at a time, so whitespace
# itself never reaches the character loop.
for words in text.split():
    for char in words:
        if char.isalnum():
            tokenn = tokenn + char
            continue
        # Any other character ends the pending run and is a token itself.
        if tokenn:
            res.append(tokenn)
            tokenn = ""
        res.append(char)
    # End of the chunk: emit a run that was not closed by a symbol, and
    # start the next chunk with an empty buffer.
    if tokenn:
        res.append(tokenn)
        tokenn = ""

print(res)


['Hai!', 'freiends,', '3', 'how', 'are', 'you?']
['Hai', '!', 'freiends', ',', '3', 'how', 'are', 'you', '?']
