# stopwords

In [10]:
tweet = """I’m amazed how often in practice, not only does a @huggingface NLP model solve your problem, but one of their public finetuned checkpoints, is good enough for the job.

Both impressed, and a little disappointed how rarely I get to actually train a model that matters :("""

In [2]:
from nltk.corpus import stopwords

stopwords=stopwords.words('english')
stopwords[:5]

['i', 'me', 'my', 'myself', 'we']

In [4]:
len(stopwords)

179

# Now we have a list of stopwords. When we process our text data we will iterate through each word, if it is present in stop_words it will be removed. To optimize the speed of the stopword lookup we can convert stop_words to a set object.

# first we need  to lowercase  our text . then we use our input  text into list of tokens( each token separated is a word   separated by space)

In [7]:
stop_words=set(stopwords)

In [6]:
tweet=tweet.lower().split()
tweet

['i’m',
 'amazed',
 'how',
 'often',
 'in',
 'practice,',
 'not',
 'only',
 'does',
 'a',
 '@huggingface',
 'nlp',
 'model',
 'solve',
 'your',
 'problem,',
 'but',
 'one',
 'of',
 'their',
 'public',
 'finetuned',
 'checkpoints,',
 'is',
 'good',
 'enough',
 'for',
 'the',
 'job.',
 'both',
 'impressed,',
 'and',
 'a',
 'little',
 'disappointed',
 'how',
 'rarely',
 'i',
 'get',
 'to',
 'actually',
 'train',
 'a',
 'model',
 'that',
 'matters',
 ':(']

# Now we have a list of stopwords. When we process our text data we will iterate through each word, if it is present in stop_words it will be removed. To optimize the speed of the stopword lookup we can convert stop_words to a set object.

In [9]:
tweet_no_stopwords=[word for word in tweet if word not in stop_words]
print("with stopwords\n ",''. join(tweet))
print("without \n :",''.join(tweet_no_stopwords))

with stopwords
  i’mamazedhowofteninpractice,notonlydoesa@huggingfacenlpmodelsolveyourproblem,butoneoftheirpublicfinetunedcheckpoints,isgoodenoughforthejob.bothimpressed,andalittledisappointedhowrarelyigettoactuallytrainamodelthatmatters:(
without 
 : i’mamazedoftenpractice,@huggingfacenlpmodelsolveproblem,onepublicfinetunedcheckpoints,goodenoughjob.impressed,littledisappointedrarelygetactuallytrainmodelmatters:(


# tokens

In [1]:
tweet = """I’m amazed how often in practice, not only does a @huggingface NLP model solve your problem, but one of their public finetuned checkpoints, is good enough for the job.

Both impressed, and a little disappointed how rarely I get to actually train a model that matters :("""

tweet.split()

['I’m',
 'amazed',
 'how',
 'often',
 'in',
 'practice,',
 'not',
 'only',
 'does',
 'a',
 '@huggingface',
 'NLP',
 'model',
 'solve',
 'your',
 'problem,',
 'but',
 'one',
 'of',
 'their',
 'public',
 'finetuned',
 'checkpoints,',
 'is',
 'good',
 'enough',
 'for',
 'the',
 'job.',
 'Both',
 'impressed,',
 'and',
 'a',
 'little',
 'disappointed',
 'how',
 'rarely',
 'I',
 'get',
 'to',
 'actually',
 'train',
 'a',
 'model',
 'that',
 'matters',
 ':(']

In [2]:
[char for char  in tweet][:30]

['I',
 '’',
 'm',
 ' ',
 'a',
 'm',
 'a',
 'z',
 'e',
 'd',
 ' ',
 'h',
 'o',
 'w',
 ' ',
 'o',
 'f',
 't',
 'e',
 'n',
 ' ',
 'i',
 'n',
 ' ',
 'p',
 'r',
 'a',
 'c',
 't',
 'i']

# advantage of character leve tokens over word  tokens 

# disadvantage of charcater lvel tkens over word tokens 
# It's not all good news for character-level embeddings though. Words carry a significant level of semantic meaning,
# and when we use character-level embedding this is mostly lost. 
# At a high-level we can view character-level embedding as being good for syntax, 
# and word-level embedding as being better for semantics. Although, in-reality, word-level embeddings 
# almost always outpeform character-level embeddings.

# Back to word-level embeddings, we will often find with the latest transformer models that text can be split into part-word tokens. So for example, we may find that the word 'being' is split into the tokens ["be", "-ing"], or 'amazingly' to ["amaz", "-ing", "-ly"].

# In addition to this, we typically seperate punctuation too, so in our previous example the tokens '@huggingface' and 'impressed,' would become ["@", "huggingface"] and ["impressed", ","] respectively.

# In our tweet we might want to find any token that begins with @ and convert that token to , a unique token that we have specified to identify usernames in our tweets. This rule is logical as there are potentially millions of added tokens in our model if we include Twitter usernames, but the username doesn't tell our model anything about the meaning in the language of the text, for example:

# @elonmusk thinks that the NLP models that @joebloggs made are super cool

# Has no real meaningful difference to our model as with:

# @joebloggs thinks that the NLP models that @huggingface made are super cool

# The meaning and subsequent classification of both tweets should really be identical in our model. So, it is logical to replace usernames with a single shared token. This approach is something that is commonly used for many different things such as:

# emails
# names/usernames
# URLs
# monetary values
# or any other numbers
# But ofcourse we don't always want to do this for everything, this is simply a rough guide as to what we may want to tokenize.

# STEMMING 

In [3]:
txt = "I am amazed by how amazingly amazing you are"

# Difference between porterstemmer vs LancasterStemmer

In [4]:
words_to_stem = ['happy', 'happiest', 'happier', 'cactus', 'cactii', 'elephant', 'elephants', 'amazed', 'amazing', 'amazingly', 'cement', 'owed', 'maximum']

In [5]:
from nltk.stem import PorterStemmer, LancasterStemmer

porter = PorterStemmer()
lancaster = LancasterStemmer()

In [6]:
porter_stemmed = [porter.stem(word) for word in words_to_stem]

In [7]:
lancaster_stemmed=[lancaster.stem(word) for word in words_to_stem]

In [8]:
porter_stemmed

['happi',
 'happiest',
 'happier',
 'cactu',
 'cactii',
 'eleph',
 'eleph',
 'amaz',
 'amaz',
 'amazingli',
 'cement',
 'owe',
 'maximum']

In [9]:
lancaster_stemmed

['happy',
 'happiest',
 'happy',
 'cact',
 'cacti',
 'eleph',
 'eleph',
 'amaz',
 'amaz',
 'amaz',
 'cem',
 'ow',
 'maxim']

# Lemmatization¶

In [10]:
words = ['amaze', 'amazed', 'amazing']

In [11]:
import nltk 
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer


lemmatizer=WordNetLemmatizer()

[lemmatizer.lemmatize(word) for word in  words]

[nltk_data] Downloading package wordnet to C:\Users\Asmita
[nltk_data]     Chatterjee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['amaze', 'amazed', 'amazing']

# Clearly nothing has happened, and that is because lemmatization requires that we also provide the parts-of-speech (POS) tag, which is the category of a word based on syntax. For example noun, adjective, or verb. In our case we could place each word as a verb, which we can then implement like so:

In [12]:
from nltk.corpus import wordnet 
    
    
[lemmatizer.lemmatize(word,wordnet.VERB) for word in  words]    

['amaze', 'amaze', 'amaze']