# Counter
In this activity, you will create functions that preprocess the text in a corpus and output the most common words it contains.

In [19]:
from nltk.corpus import reuters, stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd
from collections import Counter

# Code to download corpora
# Each call fetches one NLTK data package used further down in this notebook.
import nltk
# Reuters news corpus, read through nltk.corpus.reuters.
nltk.download('reuters')
# Stopword lists, read through nltk.corpus.stopwords.
nltk.download('stopwords')
# Punkt tokenizer models; presumably what word_tokenize needs here —
# confirm against the installed NLTK version.
nltk.download('punkt')
# WordNet data backing WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\nospm\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nospm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nospm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nospm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [29]:
# Notebook-level lemmatizer instance; process_text reads it as a global,
# so this cell must run before process_text is called.
lemmatizer = WordNetLemmatizer()

In [43]:
# Corpus - list of articles about grains
# File ids of every Reuters document filed under the 'grain' category.
ids = reuters.fileids(categories='grain')
# Raw text of each selected document; corpus[k] is the text for ids[k].
corpus = [reuters.raw(i) for i in ids]

'EGYPT AUTHORIZED TO BUY PL-480 WHEAT - USDA\n  Egypt has been authorized to purchase\n  about 200,000 tonnes of U.S. wheat under an existing PL-480\n  agreement, the U.S. Agriculture Department said.\n      It may buy the wheat, valued at 22.0 mln dlrs, between\n  April 15 and August 31, 1987, and ship it from U.S. ports by\n  September 30, the department said.\n  \n\n'

In [26]:
# Define preprocess function
def process_text(doc):
    """Clean one document and return its lowercase, lemmatized tokens.

    Pipeline: remove every character that is not an ASCII letter or
    whitespace, tokenize, lowercase, drop English stopwords, lemmatize with
    the notebook-level ``lemmatizer``, then drop any lemma that is itself a
    stopword.

    Args:
        doc (str): Raw text of a single document.

    Returns:
        list[str]: Lowercase lemmas in their original document order.
    """
    sw = set(stopwords.words('english'))
    # Keep all whitespace (not just the space character) so that two words
    # separated only by a newline or tab are not fused into one token when
    # the other characters are stripped.
    regex = re.compile(r"[^a-zA-Z\s]")
    re_clean = regex.sub('', doc)
    # Lowercase BEFORE lemmatizing: the WordNet lemmatizer matches lowercase
    # forms, so upper-case headline tokens such as "STOCKS" would otherwise
    # pass through unlemmatized and be counted separately from "stock".
    words = [word.lower() for word in word_tokenize(re_clean)]
    # Filter stopwords before lemmatizing as well: the default (noun)
    # lemmatization can turn a stopword into a non-word (e.g. 'was' -> 'wa')
    # that a filter applied only afterwards no longer recognises.
    content_words = [word for word in words if word not in sw]
    lem = [lemmatizer.lemmatize(word) for word in content_words]
    # Keep the original post-lemmatization filter for lemmas that collapse
    # onto a stopword.
    output = [word for word in lem if word not in sw]
    return output

In [35]:
# Preprocess the first article of the corpus, then preview its first five tokens.
processed_list = process_text(corpus[0])
processed_list[:5]

['china', 'daily', 'say', 'vermin', 'eat']

In [39]:
# Define the word_counter function
def word_counter(list, top_n=20):
    """Return the most frequent items of ``list`` together with their counts.

    Args:
        list: Iterable of hashable items (e.g. preprocessed tokens). The
            name shadows the builtin ``list`` inside this function; it is
            kept unchanged so existing keyword callers keep working.
        top_n (int | None): How many of the most common items to return.
            Defaults to 20, the previously hard-coded value. ``None``
            returns every distinct item.

    Returns:
        dict: Maps each item to its count, inserted from most to least
        frequent.
    """
    word_counts = Counter(list)
    return dict(word_counts.most_common(top_n))

In [40]:
# Count the most common tokens in processed_list and display the result.
word_counter_results = word_counter(processed_list)
word_counter_results

{'china': 4,
 'pct': 4,
 'said': 3,
 'daily': 2,
 'vermin': 2,
 'grain': 2,
 'stock': 2,
 'seven': 2,
 'mln': 2,
 'tonne': 2,
 'paper': 2,
 'waste': 2,
 'storage': 2,
 'preservation': 2,
 'say': 1,
 'eat': 1,
 'survey': 1,
 'province': 1,
 'city': 1,
 'showed': 1}

In [None]:
# Define the bigram counter function
def bigram_counter(corpus): 
    bigram_counts = Counter(ngrams(processed, n=2))