# Naive Bayes classifier (NBC) training

#### Reading the training files

In [1]:
import csv
from collections import Counter
from glob import glob

In [2]:
# `data` accumulates the labelled CSV rows; `values` is the set of class
# labels a row must carry in its first column to be kept.
data, values = [], {'neg', 'pos'}

In [3]:
# Collect every labelled row from the training CSV files into `data`.
# Files are visited in sorted order so that repeated runs fill `data` in the
# same order (glob itself makes no ordering promise).
filenames = sorted(glob('/training_data/*.csv'))
for filename in filenames:
    # newline='' lets the csv module do its own line-ending handling, which
    # is required for quoted fields that contain line breaks.
    with open(filename, newline='') as f:
        file_reader = csv.reader(f)
        # Skip the first row of each file (presumably a header row).
        next(file_reader, None)
        for row in file_reader:
            # csv.reader yields [] for blank lines, so row[0] alone would
            # raise IndexError; a usable row needs both a label (column 0)
            # and a text (column 1).
            if len(row) >= 2 and row[0] in values:
                data.append(row)

In [4]:
# Number of labelled rows collected from the CSV files.
len(data)

16194

### Building the vocabulary

In [5]:
import re
from string import punctuation
from itertools import chain
from nltk.corpus import stopwords
from nltk.stem import (LancasterStemmer, PorterStemmer, SnowballStemmer,
                       WordNetLemmatizer)

##### Set stemming algorithm

In [7]:
stemmer_name = 'snowball'

# Zero-argument constructors for the supported stemmers, keyed by name.
stemmer_factories = {
    'porter': PorterStemmer,
    'lancaster': LancasterStemmer,
    'snowball': lambda: SnowballStemmer('english'),
    'lemmatizer': WordNetLemmatizer,
}

if stemmer_name in stemmer_factories:
    stemmer = stemmer_factories[stemmer_name]()
    if stemmer_name == 'lemmatizer':
        # Lemmatize every word as a verb (pos='v').
        stem = lambda w: stemmer.lemmatize(w, pos='v')
    else:
        stem = stemmer.stem
else:
    # Unrecognised name: `stem` leaves words unchanged.
    stem = lambda x: x

##### Set of words to exclude

In [8]:
# Tokens that are never counted as words: English stop words, the individual
# punctuation characters, and the "__mention__" / "__link__" placeholders.
noisywords = {
    *stopwords.words("english"),
    *punctuation,
    "__mention__",
    "__link__",
}

##### Map of data by value

In [9]:
# Group the document texts (second column of each row) by label (first column).
training_data = {
    vj: [row[1] for row in data if row[0] == vj]
    for vj in values
}

# Every training text, flattened into a single list.
all_training_data = [
    text
    for texts in training_data.values()
    for text in texts
]

##### Vocabulary

In [27]:
def get_document_words(*documents, unique=True):
    """Turn one or more texts into a list of stemmed words.

    The texts are joined with spaces and lower-cased. URLs (``www.…`` or
    ``http(s)://…``) are replaced by ``__link__``, ``@…`` runs by
    ``__mention__``, and a leading ``#`` is stripped from hashtags. Words are
    runs of at least two word characters that are not digits; those found in
    ``noisywords`` are dropped and the rest are passed through ``stem``.

    Args:
        *documents: the texts to process.
        unique: when true (the default) each stemmed word appears once, in
            no particular order; otherwise every occurrence is kept in text
            order.

    Returns:
        A list of stemmed words.
    """
    text = ' '.join(documents).lower()

    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        (r'((www\.[^\s]+)|(https?://[^\s]+))', '__link__'),
        (r'@[^\s]+', '__mention__'),
        (r'#([^\s]+)', r'\1'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    stems = [
        stem(token)
        for token in re.findall(r'[^\d\W]{2,}', text)
        if token not in noisywords
    ]
    if unique:
        return list(set(stems))
    return stems

vocabulary = get_document_words(*all_training_data)

In [29]:
# Quick manual check of the selected stemmer (the output shown below was
# produced with stemmer_name = 'snowball').
stem('friendly')

'friend'

## Training algorithm

In [31]:
from math import sqrt

In [15]:
# Probability table. The special key '_' gets one slot per class (None until
# the training loop fills it); every vocabulary word starts with an empty
# per-class dict.
P = {'_': {vj: None for vj in values}}
P.update({w: {} for w in vocabulary})

In [32]:
# Fill the probability table for every class vj:
#   P['_'][vj]  share of the training documents labelled vj
#   P[w][vj]    square root of the add-one smoothed relative frequency of
#               word w among all (non-unique) words of class vj
vocabulary_size = len(vocabulary)
for vj in values:
    docs_j = training_data[vj]
    P['_'][vj] = len(docs_j) / len(all_training_data)
    all_text_j = ' '.join(docs_j)
    words_j = get_document_words(all_text_j, unique=False)
    n = len(words_j)
    # Count all words in one pass. Calling words_j.count(w) for each
    # vocabulary entry would rescan the whole list every time.
    occurrences_j = Counter(words_j)
    denominator = n + vocabulary_size
    for w in vocabulary:
        # Counter returns 0 for words that never occur in this class.
        P[w][vj] = sqrt((occurrences_j[w] + 1) / denominator)

## Classification logic
Classification run directly against the in-memory probability table `P`.

In [43]:
in_vocabulary = lambda w: w in vocabulary

def classify_nbc_text(doc):
    """Classify a text with the in-memory probability table ``P``.

    Each class score starts at ``P['_'][vj]`` and is multiplied by
    ``P[w][vj]`` for every distinct processed word of ``doc`` that belongs
    to ``vocabulary``; words outside the vocabulary are ignored.

    Args:
        doc: the text to classify; must be non-empty.

    Returns:
        A tuple ``(label, scores)``: ``label`` is the class with the highest
        score and ``scores`` maps every class to its (unnormalised) score.

    Raises:
        ValueError: if ``doc`` is empty or ``None``.
    """
    if not doc:
        # Raising a bare string is itself a TypeError in Python 3; raise a
        # real exception carrying the intended message instead.
        raise ValueError("Error: The document is not valid.")

    # Copy so the stored values in P['_'] are not modified.
    p = P['_'].copy()
    for w in filter(in_vocabulary, get_document_words(doc)):
        for vj in values:
            p[vj] *= P[w][vj]

    return max(p, key=p.get), p

### Testing

In [74]:
# A word and its derived form should stem to the same token and therefore be
# classified identically. 'success' was previously misspelled with three c's,
# which most likely made that check compare two out-of-vocabulary words.
# NOTE(review): these pairs were chosen for the 'snowball' stemmer; other
# stemmers may not map both forms to the same token.
for base_form, derived_form in [
    ('friend', 'friendly'),
    ('success', 'successful'),
    ('danger', 'dangerous'),
]:
    assert classify_nbc_text(base_form) == classify_nbc_text(derived_form), \
        (base_form, derived_form)

## Elasticsearch

In [6]:
from elasticsearch import Elasticsearch

In [8]:
# Elasticsearch client for the node addressed as 'es' (presumably a
# service/container hostname; confirm for your environment).
es = Elasticsearch(hosts='es')

### Storage/Indexing

In [94]:
# Fields shared by every indexed document.
base_doc = {'stemmer_name': stemmer_name}

# Index one document per entry of P. P also holds the '_' entry, so the
# class-level values are stored too, under the word '_'.
for word, scores in P.items():
    # Rescale the per-class scores of this entry so that they sum to 1.
    total_factor = sum(
        score for label, score in scores.items() if label in values
    )
    factors = {vj: scores[vj] / total_factor for vj in values}

    # The document id combines the stemmer name and the word.
    es.index(
        index='words',
        id=f'{stemmer_name}_{word}',
        body={**base_doc, 'value': word, 'p': factors},
    )

### Querying

In [17]:
# Search the 'words' index for documents whose `value.keyword` field
# matches 'macgraw'.
match_clause = {'value.keyword': 'macgraw'}
payload = {'query': {'match': match_clause}}
res = es.search(index='words', body=payload)

In [18]:
# Display the raw search response.
res

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1, 'relation': 'eq'},
  'max_score': 9.284446,
  'hits': [{'_index': 'words',
    '_type': '_doc',
    '_id': 'snowball_macgraw',
    '_score': 9.284446,
    '_source': {'stemmer_name': 'snowball',
     'value': 'macgraw',
     'p': {'pos': 0.40242084643626364, 'neg': 0.5975791535637365}}}]}}

## Classification
Using ES