# NLTK corpus reader

In [1]:
# Imports
import os
from   nltk.corpus.reader import PlaintextCorpusReader

# Where the corpus texts live, relative to this notebook
text_dir = os.path.join('..', 'data', 'texts')

# Regex to identify text files.
# Raw string: '\.' is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape warning on current Python versions.
text_pattern = r'.+\.txt'

# Build a plain-text reader over every file in text_dir that matches text_pattern
plain_corpus = PlaintextCorpusReader(root=text_dir, fileids=text_pattern)

In [2]:
# List every fileid the reader matched with text_pattern under text_dir
plain_corpus.fileids()

['prompt000_000_4045.txt',
 'prompt000_001_9856.txt',
 'prompt000_002_228.txt',
 'prompt000_003_126.txt',
 'prompt000_004_101.txt',
 'prompt000_005_23.txt',
 'prompt000_006_21.txt',
 'prompt000_007_25.txt',
 'prompt000_008_9.txt',
 'prompt000_009_10.txt',
 'prompt000_010_9.txt',
 'prompt000_011_7.txt',
 'prompt000_012_10.txt',
 'prompt000_013_5.txt',
 'prompt000_014_6.txt',
 'prompt000_015_4.txt',
 'prompt000_016_4.txt',
 'prompt000_017_3.txt',
 'prompt000_018_3.txt',
 'prompt000_019_3.txt',
 'prompt000_020_2.txt',
 'prompt000_021_2.txt',
 'prompt000_022_2.txt',
 'prompt000_023_2.txt',
 'prompt000_024_2.txt',
 'prompt000_025_3.txt',
 'prompt000_026_1.txt',
 'prompt001_000_5075.txt',
 'prompt001_001_268.txt',
 'prompt001_002_55.txt',
 'prompt001_003_34.txt',
 'prompt001_004_17.txt',
 'prompt001_005_6.txt',
 'prompt001_006_12.txt',
 'prompt001_007_2.txt',
 'prompt001_008_0.txt',
 'prompt002_000_4831.txt',
 'prompt002_001_700.txt',
 'prompt002_002_154.txt',
 'prompt002_003_47.txt',
 'prom

In [3]:
# Tab completion: type `plain_corpus.` and press Tab to browse its attributes.
# Without parentheses `fileids` is not called, so this cell displays the
# bound method object rather than the list of fileids.
plain_corpus.fileids

<bound method CorpusReader.fileids of <PlaintextCorpusReader in '/mnt/c/Users/Owner/bashDir/school/text_mining/project/data/texts'>>

In [19]:
# More imports, in addition to those above
from   glob import glob

# We're going to read just the file names to create the category map
file_paths = glob(os.path.join(text_dir, '*.txt')) # glob lets us use wildcards in paths
file_names = [os.path.basename(i) for i in file_paths] # split filenames from paths

category_map = {} # Dict to hold filename:[categories] mappings

for file in file_names:
    # Drop the extension with splitext. str.rstrip('.txt') removes any trailing
    # run of the characters '.', 't' and 'x', not the literal '.txt' suffix, so
    # it would corrupt any stem that happens to end in one of those characters.
    # Fields are separated by underscores, e.g. ['prompt000', '000', '4045'].
    parsed = os.path.splitext(file)[0].split('_')
    prompt_num = parsed[0][6:9] # the three characters following 'prompt'
    score = parsed[2]           # third underscore-separated field
    # Each file gets three labels: prompt number, score, and "prompt score"
    category_map[file] = [prompt_num, score, ' '.join([prompt_num, score])]

In [10]:
# Raw strings keep the regex backslashes literal; a non-raw '\.' is an invalid
# Python string escape and triggers a warning on current Python versions.
DOC_PATTERN = r'.+\.txt'        # Documents are just files that end in '.txt'
CAT_PATTERN = r'([a-z_\s]+)/.*' # We won't use this, but fall back to directory-based labels
                                # if no other labels are supplied

import codecs
import nltk.data
from   nltk.tokenize import *
from   nltk.corpus.reader.util import *
from   nltk.corpus.reader.api import *
from   nltk.corpus.reader import PlaintextCorpusReader

class TMNCorpusReader(CategorizedCorpusReader, PlaintextCorpusReader):
    """
    A corpus reader for categorized text documents to enable preprocessing.

    Combines ``CategorizedCorpusReader`` (category lookup) with
    ``PlaintextCorpusReader`` (raw/tokenized access), so every accessor can
    select documents either by ``fileids`` or by ``categories``.
    """

    def __init__(
        self,
        root,
        fileids=DOC_PATTERN,
        word_tokenizer=WordPunctTokenizer(),
        sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/english.pickle'),
        para_block_reader=read_blankline_block,
        encoding='utf8',
        **kwargs
    ):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining
        arguments are passed to the ``PlaintextCorpusReader`` constructor.

        :param root: directory containing the corpus files
        :param fileids: list of fileids or a regex selecting them
        :param word_tokenizer: tokenizer used by ``words``/``sents``/``paras``
        :param sent_tokenizer: sentence tokenizer used by ``sents``/``paras``
        :param para_block_reader: block reader used to split paragraphs
        :param encoding: encoding used to decode the corpus files
        :param kwargs: categorization arguments (keys starting with ``cat_``)
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN

        # CategorizedCorpusReader takes the kwargs dict itself (not **kwargs).
        CategorizedCorpusReader.__init__(self, kwargs)

        # Pass everything after ``fileids`` by keyword: the third positional
        # parameter of PlaintextCorpusReader is ``word_tokenizer``, so passing
        # ``encoding`` positionally would silently discard the caller's
        # encoding. The parent constructor also stores the tokenizers and the
        # paragraph block reader, so they need not be assigned again here.
        PlaintextCorpusReader.__init__(
            self,
            root,
            fileids,
            word_tokenizer=word_tokenizer,
            sent_tokenizer=sent_tokenizer,
            para_block_reader=para_block_reader,
            encoding=encoding,
        )

    def resolve(self, fileids, categories):
        """
        Returns a list of fileids or categories depending on what is passed
        to each internal corpus reader function. Implemented similarly to
        the NLTK ``CategorizedPlaintextCorpusReader``.

        :raises ValueError: if both ``fileids`` and ``categories`` are given
        :return: the fileids for ``categories`` when given, otherwise
            ``fileids`` unchanged (possibly ``None``)
        """
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")

        if categories is not None:
            return self.fileids(categories)
        return fileids

    def docs(self, fileids=None, categories=None):
        """
        Yields the complete text of each selected document, closing the
        document after we are done reading it so that only one document is
        held in memory at a time.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        # Create a generator, loading one document into memory at a time.
        for path, encoding in self.abspaths(fileids, include_encoding=True):
            with codecs.open(path, 'r', encoding=encoding) as f:
                yield f.read()

    def sizes(self, fileids=None, categories=None):
        """
        Yields the size on disk (as reported by ``os.path.getsize``) of each
        selected file, one value per file.
        This function is used to detect oddly large files in the corpus.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        # Create a generator, getting every path and computing filesize
        for path in self.abspaths(fileids):
            yield os.path.getsize(path)

    # Code below this line is extra, not (yet) covered in the textbook.
    # You can leave it as-is. It provides some standard corpus methods.
    # We're using PlaintextCorpusReader methods, but providing category resolution
    def raw(self, fileids=None, categories=None):
        """
        Returns raw text as a string.
        """
        return PlaintextCorpusReader.raw(self, self.resolve(fileids, categories))

    def words(self, fileids=None, categories=None):
        """
        Returns a list of words.
        """
        return PlaintextCorpusReader.words(self, self.resolve(fileids, categories))

    def sents(self, fileids=None, categories=None):
        """
        Returns a list of tokenized sentences.
        """
        return PlaintextCorpusReader.sents(self, self.resolve(fileids, categories))

    def paras(self, fileids=None, categories=None):
        """
        Returns a list of paragraphs, each a list of tokenized sentences.
        """
        return PlaintextCorpusReader.paras(self, self.resolve(fileids, categories))

In [20]:
# Build the categorized reader, labelling each file via category_map
corpus = TMNCorpusReader(text_dir, cat_map=category_map)

# Peek at the first few fileids to confirm the reader found the corpus
first_five = corpus.fileids()[:5]
print("\nThe first five fileids:\n", first_five)

# Token count across every document in the corpus
print("\nTotal words in the corpus:")
total_words = len(corpus.words())
print(total_words)


The first five fileids:
 ['prompt000_000_4045.txt', 'prompt000_001_9856.txt', 'prompt000_002_228.txt', 'prompt000_003_126.txt', 'prompt000_004_101.txt']

Total words in the corpus:
14577064
