# Word counts
This notebook shows the 20 most common words (after removing English stopwords) in each document in your corpus. It is not optimized for speed, but it will get you the results.

## Set corpus folder name
`corpus` is the name of the folder that contains your files (it must be a string).

In [None]:
corpus = "corpus"

## The rest of the code

In [None]:
import os, re, nltk, collections

def clean(text):
    """Strip markup tags and collapse whitespace in a document string.

    Args:
        text: The raw document contents as a string.

    Returns:
        The text with every ``<...>`` span removed and each run of
        whitespace replaced by a single space.
    """
    # Non-greedy so each tag is removed on its own. "." does not match a
    # newline here, so a tag that spans several lines is left in place.
    text = re.sub(r"<.+?>", "", text)
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text)
    return text
    
def tokenize(text):
    """Split a string into lowercase, purely alphanumeric tokens.

    The string is tokenized with ``nltk.word_tokenize``. Tokens for which
    ``str.isalnum()`` is false are dropped, and the remaining tokens are
    lowercased.

    Args:
        text: The document contents as a string.

    Returns:
        A list of lowercase token strings.
    """
    tokens = []
    for token in nltk.word_tokenize(text):
        if token.isalnum():
            tokens.append(token.lower())
    return tokens

def counts(text):
    """Count how often each word occurs, most frequent first.

    Args:
        text: A list of word tokens.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count.
        Words with equal counts appear in order of first occurrence.
    """
    # Counter tallies every token in a single pass. most_common() sorts by
    # count and keeps first-seen order among ties, so the result is the
    # same on every run.
    return collections.Counter(text).most_common()

def removestop(text):
    """Return the tokens of *text* that are not English stopwords.

    Args:
        text: A list of word tokens. They are compared to the stopword
            list as-is, with no case folding done here.

    Returns:
        A new list with the same tokens in the same order, minus any that
        appear in NLTK's English stopword list.
    """
    # Fetch the stopword list once per call and keep it in a set, so it is
    # not re-read for every token and each membership test is O(1).
    # NOTE(review): presumably requires the NLTK "stopwords" data to be
    # installed (e.g. nltk.download('stopwords')) - confirm for your setup.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    return [word for word in text if word not in stopwords]

texts = []   # one list of tokens per file
labels = []  # file names, in the same order as texts

# Read, clean and tokenize every file found anywhere under the corpus folder.
for root, dirs, files in os.walk(corpus):
    for fname in files:
        # errors='ignore' drops bytes that are not valid UTF-8 instead of
        # raising, so one badly encoded file does not stop the run.
        with open(os.path.join(root, fname), 'r', encoding='utf8', errors='ignore') as rf:
            texts.append(tokenize(clean(rf.read())))
            # Only the bare file name is kept, so files with the same name
            # in different subfolders share a label.
            labels.append(fname)

# Report the 20 most frequent non-stopword tokens of each document.
for text, label in zip(texts, labels):
    text = removestop(text)
    frequency = counts(text)
    # Print the file name first so each list can be matched to its document.
    print(label)
    print(frequency[:20])
    