# lexicon, or lexical resource

is a collection of words and/or phrases along with associated information, such as part-of-speech and sense definitions. Lexical resources are secondary to texts, and are usually created and enriched with the help of texts

text-->my_text

vocabulary of my_text-->vocab = sorted(set(my_text))

counts the frequency of each word in the text-->word_freq = FreqDist(my_text)

# Lexicon terminology

![image.png](attachment:image.png)

# lexical resources included with NLTK

In [7]:
import numpy as np 
import pandas as pd 
import nltk
nltk.download('gutenberg')
nltk.download('words')
nltk.download('nps_chat')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\91890\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\91890\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package nps_chat to
[nltk_data]     C:\Users\91890\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\nps_chat.zip.


True

# find unusual or misspelled words in a text corpus

In [5]:
#computes the vocabulary of a text, 
#then removes all items that occur in an existing wordlist, 
#leaving just the uncommon or misspelled words

def unusual_words(text):
        text_vocab = set(w.lower() for w in text if w.isalpha())
        english_vocab = set(w.lower() for w in nltk.corpus.words.words()) 
        unusual = text_vocab.difference(english_vocab) 
        return sorted(unusual)
    
unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt'))[:5]


['abbeyland', 'abhorred', 'abilities', 'abounded', 'abridgement']

In [9]:
unusual_words(nltk.corpus.nps_chat.words())[:5]

['aaaaaaaaaaaaaaaaa', 'aaahhhh', 'abortions', 'abou', 'abourted']

# stopwords

In [52]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#defining the stop words
sw_nltk = stopwords.words('english')  

#text from which we need to remove stop words
text="When I first met her she was very quiet. She remained quiet during the entire two hour long journey from Stony Brook to New York."

**method 1 to remove stop words from text**

In [75]:
#funtion to remove stop words
def stopwords_remover1(text):
    words = [word for word in text.split() if word.lower() not in sw_nltk]   
    new_text = " ".join(words)
    return new_text

**method 2 to remove stop words from text**

In [90]:
#funtion to remove stop words
def stopwords_remover2(text):
    word_tokens = word_tokenize(text) 

    filtered_sentence = [] 

    for w in word_tokens: 
        if w not in sw_nltk: 
            filtered_sentence.append(w) 
            
    a=(" ".join(filtered_sentence))
    return a

In [76]:
stopwords_remover1(text)

'first met quiet. remained quiet entire two hour long journey Stony Brook New York.'

In [68]:
stopwords_remover2(text)

'When I first met quiet . She remained quiet entire two hour long journey Stony Brook New York .'

**removing stop words from nlkt corpus**

In [88]:
#removing stop words from nlkt corpus
nltk.download('gutenberg')
def content_fraction(text):
            sw_nltk = nltk.corpus.stopwords.words('english')
            content = [w for w in text if w.lower() not in sw_nltk] 
            return content
            
content_fraction(nltk.corpus.gutenberg.words())[:5]

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\91890\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


['[', 'Emma', 'Jane', 'Austen', '1816']

**method to find fraction of stopwords in the text**


In [80]:
nltk.download('reuters')

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\91890\AppData\Roaming\nltk_data...


True

In [81]:
#method to find fraction of stop words in nlkt corpus
def content_fraction(text):
            sw_nltk = nltk.corpus.stopwords.words('english')
            content = [w for w in text if w.lower() not in sw_nltk] 
            return len(content) / len(text)
            
content_fraction(nltk.corpus.reuters.words())

0.735240435097661

In [92]:
#method to find fraction for normal text
def content_fraction(text):
            sw_nltk = nltk.corpus.stopwords.words('english')
            content = [w for w in text if w.lower() not in sw_nltk] 
            return len(content) / len(text)
            
content_fraction(text)

0.6744186046511628