# Small Example

In [81]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the classifications

In [82]:
# autoreload is already enabled by the first cell of the notebook; repeating
# the magics here only produced the "already loaded" notice.
from src.encoder import Encoder

# Encoder over the two class labels of the small example.
labeler = Encoder(['action', 'comedy'])
print(labeler)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Encoder(tokens=2, sample=['action', 'comedy'])


## Load the vocabulary

In [83]:
# Build the vocabulary Encoder from the small example's imdb.vocab file and
# print it to show how many tokens it holds, plus a sample of them.
vocabulary = Encoder.open('movie-review-small/aclImdb/imdb.vocab')
print(vocabulary)

Encoder(tokens=7, sample=['fun', 'shoot', 'fast', 'couple', 'love', 'fly', 'furious'])


## Load the train corpus

In [90]:
from src.structures.corpus import Corpus
# NOTE(review): Document is not referenced anywhere in the visible notebook — confirm this import is still needed.
from src.structures.document import Document

# Build the training corpus from the files matching the glob, using the
# vocabulary loaded earlier (presumably one document per .txt file — confirm in Corpus.open).
train_corpus = Corpus.open('movie-review-small/aclImdb/train/**/*.txt', vocabulary=vocabulary, verbose=True)

# Print every document next to its label.
for document, label in zip(train_corpus, train_corpus.labels()):
    print(document, label)

{'fun': 1, 'couple': 1, 'love': 2} comedy
{'couple': 1, 'fly': 1, 'fast': 1, 'fun': 2} comedy
{'fast': 1, 'furious': 1, 'shoot': 1} action
{'fly': 1, 'fast': 1, 'shoot': 1, 'love': 1} action
{'furious': 1, 'shoot': 2, 'fun': 1} action


### Saving the corpus

The corpus is saved as the token frequencies of each document together with its label.

In [91]:
# Write the training corpus to disk; verbose=True requests progress messages while writing.
train_corpus.write('movie-review-small/aclImdb/train.NB', verbose=True)

## Build and fit the model

In [85]:
from src.model import Model

# Build the model from the vocabulary and label encoders, then fit it on the
# training documents and their labels.
# NOTE(review): log=True presumably selects log-space probabilities (the summary tables are labelled log_2) — confirm in Model.
model = Model(vocabulary, labeler, log=True)
model.fit(train_corpus, train_corpus.labels())
# Print the model tables. Per the saved outputs, the log_2 tables are still
# all None right after fit(); the summary printed after predict() shows
# values for the tokens of the test document.
model.summary()

c(C)   | value
-------+------
action |     3
comedy |     2
log_2(p(C)) | value
------------+------
action      |  None
comedy      |  None
c(t,C) | fun | shoot | fast | couple | love | fly | furious
-------+-----+-------+------+--------+------+-----+--------
action |   1 |     4 |    2 |      0 |    1 |   1 |       2
comedy |   3 |     0 |    1 |      2 |    2 |   1 |       0
log_2(P(t|C)) |  fun | shoot | fast | couple | love |  fly | furious
--------------+------+-------+------+--------+------+------+--------
       action | None |  None | None |   None | None | None |    None
       comedy | None |  None | None |   None | None | None |    None


## Load and save the test corpus

In [92]:
# Build the test corpus with the same vocabulary as the training corpus and
# write it to test.NB.
test_corpus = Corpus.open('movie-review-small/aclImdb/test/**/*.txt', vocabulary=vocabulary, verbose=True)
test_corpus.write('movie-review-small/aclImdb/test.NB', verbose=True)

# Print every test document next to its label.
for document, label in zip(test_corpus, test_corpus.labels()):
    print(document, label)

{'fast': 1, 'couple': 1, 'shoot': 1, 'fly': 1} action


## Predict

In [87]:
from src.metrics import Metrics

# Predict the encoded label of every test document. Judging by the saved
# output, debug=True also prints the per-class p(C|d) table.
predictions = model.predict(test_corpus, verbose=True, debug=True)

# Decode the predictions once and reuse the result, so the labels that are
# printed and the labels that are scored are guaranteed to be the same list.
predicted_labels = labeler.decode(predictions)
print(f'predict({test_corpus})', predictions, predicted_labels)

# Score the decoded predictions against the true labels of the test corpus.
score = Metrics.score(test_corpus.labels(), predicted_labels, labeler)
print('accuracy', score['accuracy'])
print(score['confusion'])

p(C|d) |              value
-------+-------------------
action | 1.2111111111111112
comedy |                0.9
predict(Corpus(documents=1, tokens=4, words=4))) [0] ['action']
accuracy 1.0
Confusion | action | comedy
----------+--------+-------
   action |      1 |      0
   comedy |      0 |      0


### Check the model again

In [88]:

# Print the model tables again, now that predict() has run, to compare them
# with the summary printed right after fit().
model.summary()

c(C)   | value
-------+------
action |     3
comedy |     2
log_2(p(C)) |               value
------------+--------------------
action      | -0.7369655941662062
comedy      | -1.3219280948873622
c(t,C) | fun | shoot | fast | couple | love | fly | furious
-------+-----+-------+------+--------+------+-----+--------
action |   1 |     4 |    2 |      0 |    1 |   1 |       2
comedy |   3 |     0 |    1 |      2 |    2 |   1 |       0
log_2(P(t|C)) |  fun |             shoot |               fast |             couple | love |                 fly | furious
--------------+------+-------------------+--------------------+--------------------+------+---------------------+--------
       action | None | -1.84799690655495 | -2.584962500721156 | -4.169925001442313 | None | -3.1699250014423126 |    None
       comedy | None |              -4.0 |               -3.0 | -2.415037499278844 | None |                -3.0 |    None


# Cleaning the text

In [None]:
from src.data import text_helpers

# text_helpers returns six values, unpacked here in this order; they are fed
# to BagOfWords.open in the next code cell.
# NOTE(review): at this point labeler is still the ['action', 'comedy'] encoder
# of the small example, and the saved bag-of-words output below contains
# 'comedy': 8 for a review that never uses that word — confirm this is the
# labeler intended for the IMDB text.
acronyms, smileys, positive_words, negative_words, negations, stopwords = text_helpers(labeler)

This is a raw review from the corpus:

```
Airwolf The Movie, A variation on the original 2 part pilot, Yet the movie although shorter, does contain extra footage Unseen in the 2 hour pilot The pilot is much more of a pilot than the movie Where as a pilot movie is normally the same (2 parter combined) But the movie is actually a different edit with extras here and cuts there.<br /><br />Worth a look, even if you have the season 1 DVD set, I'd still pick up a copy of the "movie" It's still in some shops like virgin, Woolworths and the likes of mixed media stores, although it generally needs ordering, But it saves needing to buy online (as many of us still don't do or trust online shopping) but if you look around airwolfs in stores<br /><br />Airwolf was truly 1 of the 80's most under rated shows.<br /><br />A full size Airwolf is currently being re-built for a Helicopter Museum :) Info and work in progress pictures are over at http://Airwolf.org Also with Airwolf Mods for Flashpoint and Flight Sim Games It seams she's finally here to stay :)
```

In [89]:
from src.structures.bag_of_words import BagOfWords

# Turn one raw IMDB review into a bag of words.
# The four substitution tables are merged into a single mapping; if a key
# appears in more than one table, the table unpacked last wins.
bow = BagOfWords.open(
    'movie-review-HW2/aclImdb/train/pos/6770_10.txt',
    expansions=acronyms,
    replacements={
        **smileys,
        **positive_words,
        **negative_words,
        **negations,
    },
    ignored=stopwords,
    verbose=False,
)
print(bow)

{'airwolf': 4, 'movie,': 1, 'a': 8, 'variation': 1, 'original': 1, 'part': 1, 'pilot,': 1, 'movie': 4, 'shorter,': 1, 'extra': 1, 'footage': 1, 'unseen': 1, 'hour': 1, 'pilot': 4, '(2': 1, 'parter': 1, 'combined)': 1, 'edit': 1, 'extras': 1, 'cuts': 1, 'there.': 1, 'comedy': 8, 'look,': 1, 'season': 1, '1': 2, 'dvd': 1, 'set,': 1, 'pick': 1, 'copy': 1, '"movie"': 1, 'shops': 1, 'virgin,': 1, 'woolworths': 1, 'mixed': 1, 'media': 1, 'stores,': 1, 'generally': 1, 'ordering,': 1, 'saves': 1, 'needing': 1, 'buy': 1, 'online': 2, '(as': 1, '||not||': 1, 'shopping)': 1, 'airwolfs': 1, 'stores': 1, "80's": 1, 'rated': 1, 'shows.': 1, 'full': 1, 'size': 1, 're-built': 1, 'helicopter': 1, 'museum': 1, 'info': 1, 'pictures': 1, '||url||': 1, 'mods': 1, 'flashpoint': 1, 'flight': 1, 'sim': 1, 'games': 1, 'seams': 1, 'finally': 1, 'stay': 1}


# IMDB examples

## Loading and saving training data

This may take a while, depending on the size of the data.

In [93]:
# Vocabulary of the full IMDB data set (replaces the small-example vocabulary).
vocabulary = Encoder.open('movie-review-HW2/aclImdb/imdb.vocab')
print(vocabulary)

# Training split: build the corpus from the raw text files, then save it.
train_corpus = Corpus.open(
    'movie-review-HW2/aclImdb/train/**/*.txt',
    vocabulary=vocabulary,
    verbose=True,
)
print(train_corpus)
train_corpus.write('movie-review-HW2/aclImdb/train-BOW.NB', verbose=True)

# Test split: same steps, written to its own file.
test_corpus = Corpus.open(
    'movie-review-HW2/aclImdb/test/**/*.txt',
    vocabulary=vocabulary,
    verbose=True,
)
print(test_corpus)
test_corpus.write('movie-review-HW2/aclImdb/test-BOW.NB', verbose=True)

Encoder(tokens=89527, sample=['worships', 'macnee', 'air-conditioning', 'defends', 'protocols', 'filicide'])
1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded
15000 documents loaded
16000 documents loaded
17000 documents loaded
18000 documents loaded
19000 documents loaded
20000 documents loaded
21000 documents loaded
22000 documents loaded
23000 documents loaded
24000 documents loaded
Corpus(documents=25000, tokens=87884, words=5871238))
1000 documents written
2000 documents written
3000 documents written
4000 documents written
5000 documents written
6000 documents written
7000 documents written
8000 documents written
9000 documents written
10000 documents written
11000 documents written
12000 documents written
13000 documents

## Loading the saved frequencies

In [96]:
# Label encoder and vocabulary for the IMDB data.
labeler = Encoder(['pos', 'neg'])
vocabulary = Encoder.open('movie-review-HW2/aclImdb/imdb.vocab')

# Reload both splits from the files written by the previous section;
# frequencies=True is passed because those files were produced by Corpus.write.
train_corpus = Corpus.open(
    'movie-review-HW2/aclImdb/train-BOW.NB',
    frequencies=True,
    verbose=True,
)
print(train_corpus)
test_corpus = Corpus.open(
    'movie-review-HW2/aclImdb/test-BOW.NB',
    frequencies=True,
    verbose=True,
)
print(test_corpus)

1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded
15000 documents loaded
16000 documents loaded
17000 documents loaded
18000 documents loaded
19000 documents loaded
20000 documents loaded
21000 documents loaded
22000 documents loaded
23000 documents loaded
24000 documents loaded
Corpus(documents=25000, tokens=87884, words=5871238))
1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded
15000 documents loaded
16000 documents loaded
17000 documents loaded
18000 documents loade

## Training the model

In [99]:
# Build a fresh model with the IMDB vocabulary and the ['pos', 'neg'] labeler,
# then fit it on the full training corpus; verbose=True prints fitting progress.
model = Model(vocabulary, labeler, log=True)
model.fit(train_corpus, train_corpus.labels(), verbose=True)
print(model)

1000 documents fitted
2000 documents fitted
3000 documents fitted
4000 documents fitted
5000 documents fitted
6000 documents fitted
7000 documents fitted
8000 documents fitted
9000 documents fitted
10000 documents fitted
11000 documents fitted
12000 documents fitted
13000 documents fitted
14000 documents fitted
15000 documents fitted
16000 documents fitted
17000 documents fitted
18000 documents fitted
19000 documents fitted
20000 documents fitted
21000 documents fitted
22000 documents fitted
23000 documents fitted
24000 documents fitted
Model(vocabulary=Encoder(tokens=89527, sample=['worships', 'macnee', 'air-conditioning', 'defends', 'protocols', 'filicide']), labeler=Encoder(tokens=2, sample=['pos', 'neg']))


## Predicting

In [100]:
# Predict the full test corpus (debug=False: no per-document score table) and
# score the decoded predictions against the true labels.
# Metrics is imported in the "Predict" cell of the small example above.
predictions = model.predict(test_corpus, verbose=True, debug=False)
score = Metrics.score(test_corpus.labels(), labeler.decode(predictions), labeler)
# The printed score holds the accuracy and the confusion matrix as a nested list.
print(score)

1000 documents predicted
2000 documents predicted
3000 documents predicted
4000 documents predicted
5000 documents predicted
6000 documents predicted
7000 documents predicted
8000 documents predicted
9000 documents predicted
10000 documents predicted
11000 documents predicted
12000 documents predicted
13000 documents predicted
14000 documents predicted
15000 documents predicted
16000 documents predicted
17000 documents predicted
18000 documents predicted
19000 documents predicted
20000 documents predicted
21000 documents predicted
22000 documents predicted
23000 documents predicted
24000 documents predicted
{'accuracy': 0.81464, 'confusion': [[9325, 3175], [1459, 11041]]}
