# Several implementations of tokenizers, text processing and indexing

In [1]:
import nltk
import pattern.en
import pattern.it
import langdetect as ld
import string
from collections import defaultdict, Counter
from nltk.stem.snowball import SnowballStemmer
import numpy as np

In [None]:
class Tokenizer(object):
    """Collection of tokenization, stemming and parsing helpers.

    Wraps NLTK's ``TweetTokenizer``, NLTK Snowball stemmers and the
    ``pattern`` parsers, choosing the stemmer/parser by detected language.
    """

    def __init__(self, preserve_case=True):
        """Create the tweet tokenizer and the per-language tool table.

        :param preserve_case: forwarded unchanged to ``TweetTokenizer``.
        """
        self.tweet = nltk.tokenize.TweetTokenizer(preserve_case=preserve_case)
        # lang code -> (stemmer, parsetree function); any language code that
        # is not registered below falls back to the English pair.
        self.lang_map = defaultdict(lambda: (SnowballStemmer('english'), pattern.en.parsetree))
        self.lang_map['en'] = (SnowballStemmer('english'), pattern.en.parsetree)
        self.lang_map['it'] = (SnowballStemmer('italian'), pattern.it.parsetree)

    @staticmethod
    def lang(doc):
        """Return the language code detected for ``doc``.

        Detection is best effort: if ``langdetect`` raises for any reason,
        ``'en'`` is returned instead.
        """
        try:
            lang = ld.detect(doc)
        except Exception:
            lang = 'en'
        return lang

    @staticmethod
    def remove_punctuation(tokens, special_chars=None):
        """Return ``tokens`` without the tokens that are punctuation marks.

        A token is dropped when it is exactly one character of
        ``string.punctuation`` or exactly one entry of ``special_chars``.

        :param tokens: iterable of string tokens.
        :param special_chars: optional iterable of extra strings to drop.
        :returns: a new list with the remaining tokens, order preserved.
        """
        # Use a set for exact-match membership. Testing against a
        # concatenated string would be a substring test and would also drop
        # tokens such as "()" or ones spanning the punctuation/special seam.
        removable = set(string.punctuation)
        if special_chars is not None:
            removable.update(special_chars)
        return [token for token in tokens if token not in removable]

    def stemming(self, tokens):
        """Stem every token with the Snowball stemmer of the detected language.

        The language is detected once on the space-joined tokens.

        :param tokens: list of string tokens.
        :returns: list of stems, aligned with ``tokens``.
        """
        lang = Tokenizer.lang(" ".join(tokens))
        stemmer = self.lang_map[lang][0]
        return [stemmer.stem(token) for token in tokens]

    def tweet_tokenizer(self, doc):
        """Tokenize ``doc`` with the NLTK ``TweetTokenizer`` built in ``__init__``."""
        return self.tweet.tokenize(doc)

    def pattern_processing(self, doc, lemmata=False):
        """Parse ``doc`` with the ``pattern`` parser of its detected language.

        :param doc: text to parse.
        :param lemmata: forwarded to the parser's ``lemmata`` option.
        :returns: ``(tokens, lemmas)`` -- two aligned lists holding each
            word's ``string`` and ``lemma`` attribute.
        """
        parse = self.lang_map[Tokenizer.lang(doc)][1]
        tree = parse(doc, lemmata=lemmata)
        # NOTE(review): word.lemma is presumably None when lemmata is False --
        # confirm against the pattern documentation.
        tokens, lemmas = [], []
        for sentence in tree:
            for word in sentence.words:
                tokens.append(word.string)
                lemmas.append(word.lemma)
        return tokens, lemmas

class MIndex(defaultdict):
    """Inverted index: maps each token to the list of doc ids containing it.

    Missing tokens default to an empty posting list. ``docs`` holds the set
    of every doc id that has been indexed.
    """

    def __init__(self):
        super(MIndex, self).__init__(list)
        self.docs = set()

    def boolean(self, doc_id, tokens):
        """Index a document for boolean retrieval.

        Registers ``doc_id`` and appends it once to the posting list of every
        distinct token in ``tokens``.
        """
        self.docs.add(doc_id)
        for token in set(tokens):
            self[token].append(doc_id)

    def boolean_to_matrix(self):
        """Build the boolean document-term matrix of the index.

        :returns: ``(M, features, docs)`` where ``M`` is a boolean array of
            shape ``(len(docs), len(features))`` with ``M[i, j]`` true when
            ``docs[i]`` appears in the posting list of ``features[j]``.
        """
        features = list(self.keys())
        docs = list(self.docs)
        # Precompute positions once; calling list.index() inside the loops
        # would make the construction quadratic.
        column_of = dict((token, col) for col, token in enumerate(features))
        row_of = dict((doc, row) for row, doc in enumerate(docs))
        M = np.zeros((len(docs), len(features)), dtype=bool)
        for token, posting in self.items():
            col = column_of[token]
            for doc in posting:
                M[row_of[doc], col] = True
        return M, features, docs