# In this chapter, we will cover the following recipes:
1. Setting up a custom corpus
2. Creating a wordlist corpus
3. Creating a part-of-speech tagged word corpus 
4. Creating a chunked phrase corpus
5. Creating a categorized text corpus
6. Creating a categorized chunk corpus reader 
7. Lazy corpus loading
8. Creating a custom corpus view
9. Creating a MongoDB-backed corpus reader
10. Corpus editing with file locking

# Introduction
In this chapter, we'll cover how to use corpus readers and create custom corpora. If you want to train your own model, such as a part-of-speech tagger or text classifier, you will need to create a custom corpus to train on.

# 1. Setting up a custom corpus

A **corpus** is a collection of text documents, and **corpora** is the plural of corpus. This comes from the Latin word for body; in this case, a body of text. So a **custom corpus** is really just a bunch of text files in a directory, often alongside many other directories of text files.

In [4]:
import os
import os.path

# Use '~' so expanduser() resolves the current user's home directory instead
# of relying on one hard-coded, machine-specific absolute path.
path = os.path.expanduser('~/nltk_data')
# makedirs(..., exist_ok=True) creates any missing parents and is a no-op when
# the directory already exists, so no separate existence check is needed.
os.makedirs(path, exist_ok=True)
# Final expression of the cell: displays True once the directory is in place.
os.path.exists(path)

True

In [9]:
import nltk.data
# Membership test: is the directory held in `path` one of the entries of
# nltk.data.path? (The recorded result for this cell is True.)
path in nltk.data.path

True

### Load data to `nltk.data`

In [1]:
import nltk.data
# Load mywords.txt with format='raw'; the recorded result is the bytes
# object b'nltk'. The path is absolute and specific to the author's machine,
# so it has to be adjusted before re-running elsewhere.
nltk.data.load('/Users/Bya/git/predictEPL/NLTK/CookBook NLTK 3.0/mywords.txt',
              format='raw')

b'nltk'

### Loading a YAML file

In [10]:
import nltk
# No resource identifier is passed here.
# NOTE(review): presumably this opens NLTK's interactive downloader rather
# than fetching a specific package - confirm before running unattended.
nltk.download()

In [11]:
import nltk.data
# The YAML load below is left disabled in this notebook.
# NOTE(review): presumably synonyms.yaml was not available on the data
# path when the notebook was run - confirm before re-enabling.
# nltk.data.load('synonyms.yaml')

# 2. Creating a wordlist corpus

### read wordlist

In [8]:
from nltk.corpus.reader import WordListCorpusReader
# Corpus root is the current directory; the corpus is the single file
# wordlist.csv.
reader = WordListCorpusReader('.', ['wordlist.csv'])
# List every word in the corpus.
reader.words()

['nltk', 'corpus', 'corpora', 'wordnet']

In [9]:
# Show which files make up the corpus (here only wordlist.csv).
reader.fileids()

['wordlist.csv']

In [10]:
# Return the unprocessed file contents as one string, newlines included.
reader.raw()

'nltk\ncorpus\ncorpora\nwordnet\n'

In [12]:
from nltk.tokenize import line_tokenize
# Splitting the raw text into lines gives the same list that
# reader.words() returned above.
line_tokenize(reader.raw())

['nltk', 'corpus', 'corpora', 'wordnet']

### Names wordlist corpus

In [13]:
from nltk.corpus import names
# The names corpus is split into two files: female.txt and male.txt.
names.fileids()

['female.txt', 'male.txt']

In [14]:
# Number of entries in female.txt.
len(names.words('female.txt'))

5001

In [15]:
# Number of entries in male.txt.
len(names.words('male.txt'))

2943

### English words corpus

In [18]:
from nltk.corpus import words
# The English words corpus exposes two file ids: 'en' and 'en-basic'.
words.fileids()

['en', 'en-basic']

In [20]:
# Number of entries in the 'en-basic' word list.
len(words.words('en-basic'))

850

In [21]:
# Number of entries in the full 'en' word list.
len(words.words('en'))

235886

# 3. Creating a part-of-speech tagged word corpus

In [22]:
from nltk.corpus.reader import TaggedCorpusReader

In [25]:
# Build a tagged-corpus reader rooted at the current directory; the second
# argument is a regular expression selecting every file ending in ".pos".
reader = TaggedCorpusReader('.', r'.*\.pos')

* words()
* sents()
* paras()
* tagged_words()
* tagged_sents()
* tagged_paras()

In [26]:
# Plain words, without their part-of-speech tags.
reader.words()

['The', 'expense', 'and', 'time', 'involved', 'are', ...]

In [27]:
# Words paired with their tags as (word, tag) tuples.
reader.tagged_words()

[('The', 'AT-TL'), ('expense', 'NN'), ('and', 'CC'), ...]

In [28]:
# Sentences, each one a list of plain words.
reader.sents()

[['The', 'expense', 'and', 'time', 'involved', 'are', 'astronomical', '.']]

In [29]:
# Sentences, each one a list of (word, tag) tuples.
reader.tagged_sents()

[[('The', 'AT-TL'), ('expense', 'NN'), ('and', 'CC'), ('time', 'NN'), ('involved', 'VBN'), ('are', 'BER'), ('astronomical', 'JJ'), ('.', '.')]]

In [30]:
# Paragraphs: one nesting level deeper than sents() - a list of paragraphs,
# each a list of sentences, each a list of words.
reader.paras()

[[['The', 'expense', 'and', 'time', 'involved', 'are', 'astronomical', '.']]]

In [32]:
# Paragraphs of sentences of (word, tag) tuples.
reader.tagged_paras()

[[[('The', 'AT-TL'), ('expense', 'NN'), ('and', 'CC'), ('time', 'NN'), ('involved', 'VBN'), ('are', 'BER'), ('astronomical', 'JJ'), ('.', '.')]]]

### Customizing the word tokenizer

In [33]:
from nltk.tokenize import SpaceTokenizer
# Customize how each sentence is split into words: pass the tokenizer as
# word_tokenizer. (Passing it as sent_tokenizer would instead change how the
# text is split into sentences, which is not what this section demonstrates.)
reader = TaggedCorpusReader('.', r'.*\.pos', word_tokenizer=SpaceTokenizer())
# The word list is unchanged for this corpus, since its tokens are
# separated by single spaces.
reader.words()

['The', 'expense', 'and', 'time', 'involved', 'are', ...]

### `tagset='en-brown'`

In [35]:
# Declare the corpus's own tagset as 'en-brown' when building the reader.
reader = TaggedCorpusReader('.', r'.*\.pos', tagset='en-brown')
# Request the tags converted to the 'universal' tagset (e.g. DET, NOUN, CONJ
# in the recorded output).
reader.tagged_words(tagset='universal')

[('The', 'DET'), ('expense', 'NOUN'), ('and', 'CONJ'), ...]

In [37]:
from nltk.corpus import treebank
# Tagged words of the treebank corpus with its native tags.
treebank.tagged_words()

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ...]

In [38]:
# Requesting tagset='brown' yields 'UNK' for every tag shown in the recorded
# output, i.e. no usable mapping was applied for this target tagset.
treebank.tagged_words(tagset='brown')

[('Pierre', 'UNK'), ('Vinken', 'UNK'), (',', 'UNK'), ...]

# 4. Creating a chunked phrase corpus

A `chunk` is a short phrase within a sentence.

In [40]:
from nltk.corpus.reader import ChunkedCorpusReader
# Reader rooted at the current directory over every file ending in ".chunk".
reader = ChunkedCorpusReader('.', r'.*\.chunk')
# Display the corpus content at word level.
reader.chunked_words()

[('The', 'AT-TL'), ('expense', 'NN'), ('and', 'CC'), ...]

In [42]:
# Same data as the previous cell, shown through print().
print(reader.chunked_words())

[('The', 'AT-TL'), ('expense', 'NN'), ('and', 'CC'), ...]


In [43]:
# Each sentence is printed as a Tree rooted at 'S'.
print(reader.chunked_sents())

[Tree('S', [('The', 'AT-TL'), ('expense', 'NN'), ('and', 'CC'), ('time', 'NN'), ('involved', 'VBN'), ('are', 'BER'), ('astronomical', 'JJ'), ('.', '.')])]


In [44]:
# Paragraph level: a list of paragraphs, each a list of sentence Trees.
print(reader.chunked_paras())

[[Tree('S', [('The', 'AT-TL'), ('expense', 'NN'), ('and', 'CC'), ('time', 'NN'), ('involved', 'VBN'), ('are', 'BER'), ('astronomical', 'JJ'), ('.', '.')])]]


# 5. Creating a categorized text corpus

In [45]:
from nltk.corpus import brown
# List the category labels of the brown corpus.
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [47]:
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
# The movie_*.txt files are plain text, so read them with the plaintext
# reader rather than a bracket-parse (treebank-style) reader.
# cat_pattern is a regular expression whose captured group becomes the
# category: movie_pos.txt -> 'pos', movie_neg.txt -> 'neg'.
reader = CategorizedPlaintextCorpusReader(
    '.', r'movie_.*\.txt', cat_pattern=r'movie_(\w+)\.txt')

In [48]:
# Categories derived from the file names via cat_pattern.
reader.categories()

['neg', 'pos']

In [49]:
# Files that belong to the 'neg' category.
reader.fileids(categories=['neg'])

['movie_neg.txt']

In [50]:
# Files that belong to the 'pos' category.
reader.fileids(categories=['pos'])

['movie_pos.txt']

Alternatively, map each file to its categories explicitly with `cat_map`:

In [52]:
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
# Instead of deriving categories from file names, cat_map lists them
# explicitly: a dict from file id to that file's list of categories.
reader = CategorizedPlaintextCorpusReader('.', r'movie_.*\.txt',
   cat_map={'movie_pos.txt': ['pos'], 'movie_neg.txt': ['neg']})

In [53]:
# Categories collected from the cat_map values.
reader.categories()

['neg', 'pos']