# In [2]:
from sklearn.model_selection import KFold
import numpy as np

class CorpusLoader(object):
    """Serve a corpus reader's documents and labels as k-fold splits.

    The wrapped ``reader`` must provide ``fileids(categories=...)``,
    ``docs(fileids=...)`` and ``categories(fileids=...)``; those are the
    only reader methods this class calls.
    """

    def __init__(self, reader, folds=12, shuffle=True, categories=None):
        """Store the reader, build the fold splitter and collect file ids.

        Args:
            reader: Corpus reader that supplies file ids, documents and
                categories.
            folds: Number of splits, handed to ``KFold`` as ``n_splits``.
            shuffle: Passed to ``KFold`` unchanged.
            categories: Passed to ``reader.fileids`` unchanged.
        """
        self.reader = reader
        self.folds = KFold(n_splits=folds, shuffle=shuffle)
        corpus_ids = self.reader.fileids(categories=categories)
        # Kept as an ndarray so the index arrays produced by the splitter
        # can select several file ids in one indexing operation.
        self.files = np.asarray(corpus_ids)

    def fileids(self, idx=None):
        """Return every stored file id, or only those selected by ``idx``.

        Args:
            idx: Index (or array of indices) into the stored file ids.
                ``None`` returns the stored array itself.
        """
        return self.files if idx is None else self.files[idx]

    def documents(self, idx=None):
        """Lazily yield one list per selected file id.

        Each yielded item is ``reader.docs(fileids=[fileid])`` materialised
        into a list. Nothing is read until the generator is iterated.

        Args:
            idx: Selection forwarded to ``fileids``.
        """
        selected = self.fileids(idx)
        for name in selected:
            per_file = self.reader.docs(fileids=[name])
            yield list(per_file)

    def labels(self, idx=None):
        """Return a list with the first category of each selected file id.

        Args:
            idx: Selection forwarded to ``fileids``.
        """
        found = []
        for name in self.fileids(idx):
            file_categories = self.reader.categories(fileids=[name])
            # Only the first category the reader reports is used as label.
            found.append(file_categories[0])
        return found

    def __iter__(self):
        """Yield ``(X_train, X_test, y_train, y_test)`` for every fold.

        The ``X_*`` entries are the generators returned by ``documents``
        (single pass, evaluated lazily); the ``y_*`` entries are the lists
        returned by ``labels``.
        """
        for train_idx, test_idx in self.folds.split(self.files):
            train_docs = self.documents(train_idx)
            train_labels = self.labels(train_idx)

            test_docs = self.documents(test_idx)
            test_labels = self.labels(test_idx)

            yield train_docs, test_docs, train_labels, test_labels