In [1]:
import os
from glob import glob
import numpy as np

def texts_labels_from_folders(path, folders):
    """
    Load one text example per file from a folder-per-label layout.

    Used when data is laid out as <path>/<label>/<file>, e.g.
    data/neg/example1.txt and data/neg/example2.txt.

    path is the base path
    folders is a list of folder names, one per label; the folder name is
    used as the label of every file found inside it

    i.e. path = 'data'
         folders = ['pos', 'neg']

         This will read every file matching '*.*' inside data/pos/ and data/neg/

    Returns (texts, labels): two parallel lists where texts[i] is the full
    content of a file and labels[i] is the name of the folder it came from.
    Files are visited folder by folder, in sorted filename order within a folder.
    """
    texts, labels = [], []
    for label in folders:
        # glob returns names in arbitrary, OS-dependent order; sort so that
        # repeated runs produce the examples in the same order
        for fname in sorted(glob(os.path.join(path, label, '*.*'))):
            # '*.*' also matches sub-directories whose name contains a dot
            if not os.path.isfile(fname):
                continue
            # context manager closes the handle instead of leaking one per file
            with open(fname, 'r') as handle:
                texts.append(handle.read())
            labels.append(label)
    return texts, labels

In [2]:
# Example: load a folder-per-label dataset from data/pos and data/neg.
path = 'data'
folders = ['pos','neg']

# texts[i] is a file's content, labels[i] is the folder name it was read from
texts, labels = texts_labels_from_folders(path, folders)

In [3]:
import pandas as pd

def texts_labels_from_file(path, header=True):
    """
    Load texts and labels from a single delimited file.

    Used for when data is in a single file with one example per line and one
    label (TODO: multi-label). The first column is taken as the text and the
    second column as the label.

    path is the location of the file; it is parsed by pandas.read_csv with its
        default comma separator, so a tab-separated file is NOT split into
        columns as-is
    header is a bool which indicates if the file has a header row or not

    Returns (texts, labels) as two numpy arrays of equal length.
    """
    # header=0 treats the first row as column names; header=None keeps the
    # first row as data (without it the first example was silently consumed
    # as a header and the integer column lookup failed)
    data = pd.read_csv(path, header=0 if header else None)

    # positional access works regardless of what the columns are called
    texts = data.iloc[:, 0].values
    labels = data.iloc[:, 1].values
    return texts, labels

In [4]:
# Example: load the same kind of data from a single delimited file.
# NOTE: this rebinds `path`, `texts` and `labels` from the folder-based
# example; the cells that follow operate on these file-based values.
path = 'data/train.txt'

texts, labels = texts_labels_from_file(path)

In [5]:
# Inspect the loaded arrays. The labels in the saved output carry a leading
# space (' pos'), so the file presumably has a space after each comma —
# TODO confirm and strip before using the labels.
print(texts, labels)

['good is good' 'bad is bad' 'good is great' 'great is great'
 'bad is sucks' 'sucks'] [' pos' ' neg' ' pos' ' pos' ' neg' ' neg']


In [6]:
import spacy

def spacy_tokenize(texts):
    """
    Tokenize every string in `texts` with spaCy.

    Takes an iterable of strings and returns a list of lists: the i-th
    sub-list holds the token strings of the i-th input, in order.
    """
    # The pipeline is loaded on every call.
    # NOTE(review): the 'en' shortcut name is not accepted by spaCy v3+, which
    # expects a full package name — confirm the installed spaCy version.
    nlp = spacy.load('en')
    return [[token.text for token in nlp(text)] for text in texts]

In [7]:
print(spacy_tokenize(texts))

[['good', 'is', 'good'], ['bad', 'is', 'bad'], ['good', 'is', 'great'], ['great', 'is', 'great'], ['bad', 'is', 'sucks'], ['sucks']]


In [114]:
class Vocab:
    """
    Token <-> integer id vocabulary with optional frequency-based pruning.

    max_words: keep at most this many distinct words, ranked by frequency
        (ties keep the word that appeared first); None keeps all words.
        The pad/unknown tokens are not counted against this limit.
    min_count: drop words that appear fewer than this many times; None keeps all.
    unk_token: token that pruned or unseen words are mapped to (None disables it)
    pad_token: padding token (None disables it)

    When enabled, the pad token gets the first id and the unknown token the
    next one; regular words follow in order of first appearance.

    Attributes:
        word2idx   dict token -> id
        idx2word   dict id -> token
        word_count dict token -> number of occurrences (special tokens count as 1)
        size       number of entries in the vocabulary
    """
    def __init__(self, max_words=None, min_count=None, unk_token='<UNK>', pad_token='<PAD>'):

        self.max_words = max_words
        self.min_count = min_count
        self.unk_token = unk_token
        self.pad_token = pad_token

        self._reset()

    def _special_tokens(self):
        """Return the enabled special tokens, pad first, then unknown."""
        return [tok for tok in (self.pad_token, self.unk_token) if tok is not None]

    def _reset(self):
        """Clear all mappings and counts, then re-register the special tokens."""
        self.word2idx = {}
        self.idx2word = {}
        self.word_count = {}
        self.size = 0

        for token in self._special_tokens():
            self.add(token)

    def add(self, token):
        """Register `token` with the next free id if it is new; otherwise bump its count."""
        if token not in self.word2idx:
            self.word2idx[token] = self.size
            self.idx2word[self.size] = token
            self.word_count[token] = 1
            self.size += 1
        else:
            self.word_count[token] += 1

    def build(self, texts):
        """
        Takes a list of tokenized texts, i.e. a list of lists of strings,
        [['one','two','three'], ['four','five','six']]
        Builds the vocabulary from scratch, replacing any previous contents.

        Raises AssertionError if the pad or unknown token occurs in the data.
        """

        # start clean so that repeated calls do not accumulate counts
        self._reset()
        specials = set(self._special_tokens())

        #build the mapping and counts
        for text in texts:
            for token in text:
                assert token != self.pad_token, "Padding token in dataset?"
                assert token != self.unk_token, "Unknown token in dataset?"
                self.add(token)

        # Only real words take part in pruning. The special tokens are always
        # kept, so they must neither be ranked nor looked up for removal
        # (removing them from the prune set used to raise KeyError whenever
        # they had not been selected for pruning in the first place).
        counts = {word: count for word, count in self.word_count.items()
                  if word not in specials}

        #holds words we want to get rid of
        words_to_unk = set()

        #finds words that aren't in the top N words
        if self.max_words is not None:
            # sorted() is stable, so equally frequent words keep first-seen order
            most_common = sorted(counts.items(), key=lambda x: -x[1])
            words_to_unk.update(word for word, _ in most_common[self.max_words:])

        #finds words that appear less than N times
        if self.min_count is not None:
            words_to_unk.update(word for word, count in counts.items()
                                if count < self.min_count)

        #only bother doing this if we have unknowns to handle
        if words_to_unk:
            # ids were handed out in first-appearance order; replay that order
            # so surviving words get contiguous ids without re-reading texts
            first_seen_order = [self.idx2word[i] for i in range(self.size)]
            self._reset()
            for word in first_seen_order:
                if word in specials or word in words_to_unk:
                    continue
                self.add(word)
                self.word_count[word] = counts[word]

    def numericalize(self, texts, build=True):
        """
        Convert tokenized texts into lists of integer ids.

        texts is a list of lists of tokens.
        build: when True (the default) the vocabulary is first (re)built from
            `texts`; pass False to encode new data with the existing vocabulary.

        Tokens missing from the vocabulary map to the unknown token's id.
        Raises KeyError for such a token when unk_token is None.
        """

        if build:
            self.build(texts)

        # resolved once; stays None when the unknown token is disabled
        unk_idx = None
        if self.unk_token is not None:
            unk_idx = self.word2idx.get(self.unk_token)

        temp_texts = []

        for text in texts:
            ids = []
            for token in text:
                idx = self.word2idx.get(token, unk_idx)
                if idx is None:
                    raise KeyError("token %r is not in the vocabulary and no unk_token is set" % (token,))
                ids.append(idx)
            temp_texts.append(ids)

        return temp_texts

In [121]:
# Vocabulary in which tokens seen fewer than 3 times are treated as unknown.
vocab = Vocab(min_count=3)

In [122]:
# Tokenize the texts loaded from data/train.txt into lists of token strings.
tokenized_texts = spacy_tokenize(texts)

In [123]:
# Builds the vocabulary from tokenized_texts and returns the texts as id lists.
# NOTE(review): the saved output below starts with two printed sets that the
# Vocab code in this notebook does not print — presumably left over from an
# earlier version with debug prints; re-run the cell to refresh the output.
vocab.numericalize(tokenized_texts)

{'<PAD>', '<UNK>', 'sucks'}
{'sucks'}


[[2, 3, 2], [4, 3, 4], [2, 3, 5], [5, 3, 5], [4, 3, 1], [1]]

In [124]:
# Per-token occurrence counts after pruning ('sucks' no longer has an entry).
vocab.word_count

{'<PAD>': 1, '<UNK>': 1, 'bad': 3, 'good': 3, 'great': 3, 'is': 5}

In [125]:
# id -> token mapping; ids 0 and 1 are the pad and unknown tokens.
vocab.idx2word

{0: '<PAD>', 1: '<UNK>', 2: 'good', 3: 'is', 4: 'bad', 5: 'great'}

In [120]:
# NOTE(review): scratch cell (execution count is out of order with the cells
# above) — not used by the pipeline; candidate for removal.
my_set = set()

In [103]:
# Scratch check: an empty set does not compare equal to False (result is
# False); use `not my_set` / `len(my_set) == 0` to test for emptiness.
my_set == False

False