<a href="https://colab.research.google.com/github/abhinavbammidi1401/ADA/blob/main/Text_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
text = ''' Hey there! 😃 I can't believe it's already 2024. Did you see John's new blog post? Check it out at https://example.com/blog! Also, email me at john.doe@example.com. He mentioned something about "stemming" and "lemmatization"—interesting stuff. BTW, I'll be attending the AI conference in N.Y.C. next month!! #Excited #AI 😊 Let's catch up soon. Cheers, John'''

In [11]:
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [12]:
nlp = spacy.load("en_core_web_sm")

In [13]:
def preprocess_text(text):
  text = text.lower()

  text = re.sub(r'\S+@\S+', '', text)

  text = re.sub(r'https\S+|www\S+http\S+', '', text)

  text = re.sub(r'[^\w\s]', '', text)

  tokens = word_tokenize(text)

  stop_words = set(stopwords.words('english'))
  tokens = [word for word in tokens if word not in stop_words]

  stemmer = PorterStemmer()
  tokens_stemmed = [stemmer.stem(word) for word in tokens]

  lemmatizer = WordNetLemmatizer()
  tokens_lemmatized = [lemmatizer.lemmatize(word) for word in tokens]

  doc = nlp(' '.join(tokens))
  tokens_spacy_lemmatized = [token.lemma_ for token in doc]

  return{
      'original': text,
      'tokens': tokens,
      'stemmed': tokens_stemmed,
      'lemmatized': tokens_lemmatized,
      'spacy_lemmatized': tokens_spacy_lemmatized
  }

In [14]:
preprocessed_text = preprocess_text(text)

In [15]:
print("Original Text:")
print(preprocessed_text['original'])
print("\nTokens:")
print(preprocessed_text['tokens'])
print("\nStemmed Tokens:")
print(preprocessed_text['stemmed'])
print("\nLemmatized Tokens:")
print(preprocessed_text['lemmatized'])
print("\nSpacy Lemmatized Tokens:")
print(preprocessed_text['spacy_lemmatized'])

Original Text:
 hey there  i cant believe its already 2024 did you see johns new blog post check it out at  also email me at  he mentioned something about stemming and lemmatizationinteresting stuff btw ill be attending the ai conference in nyc next month excited ai  lets catch up soon cheers john

Tokens:
['hey', 'cant', 'believe', 'already', '2024', 'see', 'johns', 'new', 'blog', 'post', 'check', 'also', 'email', 'mentioned', 'something', 'stemming', 'lemmatizationinteresting', 'stuff', 'btw', 'ill', 'attending', 'ai', 'conference', 'nyc', 'next', 'month', 'excited', 'ai', 'lets', 'catch', 'soon', 'cheers', 'john']

Stemmed Tokens:
['hey', 'cant', 'believ', 'alreadi', '2024', 'see', 'john', 'new', 'blog', 'post', 'check', 'also', 'email', 'mention', 'someth', 'stem', 'lemmatizationinterest', 'stuff', 'btw', 'ill', 'attend', 'ai', 'confer', 'nyc', 'next', 'month', 'excit', 'ai', 'let', 'catch', 'soon', 'cheer', 'john']

Lemmatized Tokens:
['hey', 'cant', 'believe', 'already', '2024', 