# Small Example

In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules
# before each cell runs, so edits to the project's `src` package are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Load the classifications

In [2]:
from src.encoder import Encoder

# Encoder over the two class labels used by the small example.
labeler = Encoder(['action', 'comedy'])
print(labeler)

Encoder(tokens=2, sample=['comedy', 'action'])


## Load the vocabulary

In [3]:
# Build the vocabulary encoder from the small dataset's vocabulary file.
vocabulary = Encoder.open('movie-review-small/aclImdb/imdb.vocab')
print(vocabulary)

Encoder(tokens=7, sample=['shoot', 'fun', 'fly', 'love', 'fast', 'couple', 'furious'])


## Load the train corpus

In [9]:
from src.structures.corpus import Corpus
from src.structures.document import Document

# Open the .txt files matched by the glob under train/ as a corpus, passing
# the vocabulary loaded above.
train_corpus = Corpus.open('movie-review-small/aclImdb/train/**/*.txt', vocabulary=vocabulary, verbose=True)
print(train_corpus)
# Show each document next to its label.
# NOTE: `document` and `label` remain bound in the kernel after this loop.
for document, label in zip(train_corpus, train_corpus.labels()):
    print(document, label)

Corpus(documents=5, tokens=7, words=20))
{'fun': 1, 'couple': 1, 'love': 2} comedy
{'couple': 1, 'fly': 1, 'fast': 1, 'fun': 2} comedy
{'fast': 1, 'furious': 1, 'shoot': 1} action
{'fly': 1, 'fast': 1, 'shoot': 1, 'love': 1} action
{'furious': 1, 'shoot': 2, 'fun': 1} action


In [11]:
# List each training document together with the path it was read from.
for document in train_corpus.documents():
    print(document.source, document)

movie-review-small/aclImdb/train/comedy/i.txt {'fun': 1, 'couple': 1, 'love': 2}
movie-review-small/aclImdb/train/comedy/iii.txt {'couple': 1, 'fly': 1, 'fast': 1, 'fun': 2}
movie-review-small/aclImdb/train/action/ii.txt {'fast': 1, 'furious': 1, 'shoot': 1}
movie-review-small/aclImdb/train/action/v.txt {'fly': 1, 'fast': 1, 'shoot': 1, 'love': 1}
movie-review-small/aclImdb/train/action/iv.txt {'furious': 1, 'shoot': 2, 'fun': 1}


### Saving the corpus

Each document is saved as its term frequencies together with its label.

In [12]:
# Persist the training corpus to train.NB so it can be reloaded later.
train_corpus.write('movie-review-small/aclImdb/train.NB', verbose=True)

## Build and fit the model

In [13]:
from src.model import Model

# Create a model over the vocabulary and label encoders, fit it on the
# training corpus with its labels, then print the model's tables.
# In the recorded output the probability tables are still None at this point.
model = Model(vocabulary, labeler)
model.fit(train_corpus, train_corpus.labels())
model.summary()

c(C)   | value
-------+------
comedy |     2
action |     3
p(C)   | value
-------+------
comedy |  None
action |  None
c(t,C) | shoot | fun | fly | love | fast | couple | furious
-------+-------+-----+-----+------+------+--------+--------
comedy |     0 |   3 |   1 |    2 |    1 |      2 |       0
action |     4 |   1 |   1 |    1 |    2 |      0 |       2
P(t|C) | shoot |  fun |  fly | love | fast | couple | furious
-------+-------+------+------+------+------+--------+--------
comedy |  None | None | None | None | None |   None |    None
action |  None | None | None | None | None |   None |    None


## Load and save the test corpus

In [14]:
# Open the test documents with the same vocabulary and persist them to test.NB.
test_corpus = Corpus.open('movie-review-small/aclImdb/test/**/*.txt', vocabulary=vocabulary, verbose=True)
test_corpus.write('movie-review-small/aclImdb/test.NB', verbose=True)

# Show each test document next to its label.
# NOTE: `document` and `label` remain bound in the kernel after this loop.
for document, label in zip(test_corpus, test_corpus.labels()):
    print(document, label)

{'fast': 1, 'couple': 1, 'shoot': 1, 'fly': 1} action


## Predict

In [16]:
# Classify the first document of the test corpus and print the decoded label
# next to that document's source path.
# `document` is bound explicitly here so that the printed source belongs to
# the document that was actually classified, rather than to whatever loop
# variable an earlier cell happened to leave behind in the kernel.
document = next(iter(test_corpus))
prediction = model.predict(document, debug=True)
print(labeler.decode(prediction), document.source)


p(C|d) |                  value
-------+-----------------------
comedy |  7.324218750000001e-05
action | 0.00017146776406035664
action movie-review-small/aclImdb/test/action/0.txt


In [17]:
from src.metrics import Metrics

# Predict every document in the test corpus, then print the raw predictions
# next to their decoded label names.
predictions = model.predict(test_corpus, verbose=True, debug=True)
print(f'predict({test_corpus})', predictions, labeler.decode(predictions))
# Compare the true labels with the decoded predictions; the result is indexed
# by 'accuracy' and 'confusion' below.
score = Metrics.score(test_corpus.labels(), labeler.decode(predictions), labeler)
print('accuracy', score['accuracy'])
print(score['confusion'])

p(C|d) |                  value
-------+-----------------------
comedy |  7.324218750000001e-05
action | 0.00017146776406035664
predict(Corpus(documents=1, tokens=4, words=4))) [1] ['action']
accuracy 1.0
true\predicted | comedy | action
---------------+--------+-------
        comedy |      0 |      0
        action |      0 |      1


### Check the model again

In [57]:

# Print the model tables again; in the recorded output some probabilities are
# now filled in, unlike in the summary printed right after fitting.
model.summary()

c(C)   | value
-------+------
action |     3
comedy |     2
p(C)   | value
-------+------
action |   0.6
comedy |   0.4
c(t,C) | couple | love | fast | shoot | furious | fly | fun
-------+--------+------+------+-------+---------+-----+----
action |      0 |    1 |    2 |     4 |       2 |   1 |   1
comedy |      2 |    2 |    1 |     0 |       0 |   1 |   3
P(t|C) |              couple | love |                fast |              shoot | furious |                fly |  fun
-------+---------------------+------+---------------------+--------------------+---------+--------------------+-----
action | 0.05555555555555555 | None | 0.16666666666666666 | 0.2777777777777778 |    None | 0.1111111111111111 | None
comedy |              0.1875 | None |               0.125 |             0.0625 |    None |              0.125 | None


# Cleaning the text

In [58]:
from src.data import text_helpers

# Load the helper collections used for text cleaning (passed to BagOfWords.open below).
# NOTE(review): in notebook order `labeler` is still the ['action', 'comedy']
# encoder from the small example, and the recorded bag-of-words output below
# contains a 'comedy' token. Confirm this is the intended labeler for IMDB reviews.
acronyms, smileys, positive_words, negative_words, negations, stopwords = text_helpers(labeler)

This is a raw review from the IMDB training corpus (`train/pos/6770_10.txt`), before any cleaning:

```
Airwolf The Movie, A variation on the original 2 part pilot, Yet the movie although shorter, does contain extra footage Unseen in the 2 hour pilot The pilot is much more of a pilot than the movie Where as a pilot movie is normally the same (2 parter combined) But the movie is actually a different edit with extras here and cuts there.<br /><br />Worth a look, even if you have the season 1 DVD set, I'd still pick up a copy of the "movie" It's still in some shops like virgin, Woolworths and the likes of mixed media stores, although it generally needs ordering, But it saves needing to buy online (as many of us still don't do or trust online shopping) but if you look around airwolfs in stores<br /><br />Airwolf was truly 1 of the 80's most under rated shows.<br /><br />A full size Airwolf is currently being re-built for a Helicopter Museum :) Info and work in progress pictures are over at http://Airwolf.org Also with Airwolf Mods for Flashpoint and Flight Sim Games It seams she's finally here to stay :)
```

In [89]:
from src.structures.bag_of_words import BagOfWords

# Build a bag of words from one positive training review, passing the helper
# collections: acronyms as expansions, the merged smiley/sentiment/negation
# mappings as replacements, and stopwords as ignored tokens.
# In the merged dict, a key present in several mappings takes the value of the
# right-most one.
bow = BagOfWords.open(
    'movie-review-HW2/aclImdb/train/pos/6770_10.txt',
    expansions=acronyms,
    replacements={ **smileys, **positive_words, **negative_words, **negations},
    ignored=stopwords,
    verbose=False)
print(bow)

{'airwolf': 4, 'movie,': 1, 'a': 8, 'variation': 1, 'original': 1, 'part': 1, 'pilot,': 1, 'movie': 4, 'shorter,': 1, 'extra': 1, 'footage': 1, 'unseen': 1, 'hour': 1, 'pilot': 4, '(2': 1, 'parter': 1, 'combined)': 1, 'edit': 1, 'extras': 1, 'cuts': 1, 'there.': 1, 'comedy': 8, 'look,': 1, 'season': 1, '1': 2, 'dvd': 1, 'set,': 1, 'pick': 1, 'copy': 1, '"movie"': 1, 'shops': 1, 'virgin,': 1, 'woolworths': 1, 'mixed': 1, 'media': 1, 'stores,': 1, 'generally': 1, 'ordering,': 1, 'saves': 1, 'needing': 1, 'buy': 1, 'online': 2, '(as': 1, '||not||': 1, 'shopping)': 1, 'airwolfs': 1, 'stores': 1, "80's": 1, 'rated': 1, 'shows.': 1, 'full': 1, 'size': 1, 're-built': 1, 'helicopter': 1, 'museum': 1, 'info': 1, 'pictures': 1, '||url||': 1, 'mods': 1, 'flashpoint': 1, 'flight': 1, 'sim': 1, 'games': 1, 'seams': 1, 'finally': 1, 'stay': 1}


# IMDB examples

## Loading and saving training data

This may take a while, depending on the size of the data.

In [18]:
# Rebind `vocabulary` to the full IMDB vocabulary (replaces the small-example one).
vocabulary = Encoder.open('movie-review-HW2/aclImdb/imdb.vocab')
print(vocabulary)

# Open the raw training reviews and persist them to train-BOW.NB.
train_corpus = Corpus.open('movie-review-HW2/aclImdb/train/**/*.txt',
    vocabulary=vocabulary,
    verbose=True
)
print(train_corpus)
train_corpus.write('movie-review-HW2/aclImdb/train-BOW.NB', verbose=True)

# Same for the test reviews, persisted to test-BOW.NB.
test_corpus = Corpus.open('movie-review-HW2/aclImdb/test/**/*.txt',
    vocabulary=vocabulary,
    verbose=True
)
print(test_corpus)
test_corpus.write('movie-review-HW2/aclImdb/test-BOW.NB', verbose=True)

Encoder(tokens=89527, sample=['virology', 'hi-jacking', 'holidays', 'sachs', 'loring', 'solarisation'])
1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded
15000 documents loaded
16000 documents loaded
17000 documents loaded
18000 documents loaded
19000 documents loaded
20000 documents loaded
21000 documents loaded
22000 documents loaded
23000 documents loaded
24000 documents loaded
Corpus(documents=25000, tokens=87884, words=5871238))
1000 documents written
2000 documents written
3000 documents written
4000 documents written
5000 documents written
6000 documents written
7000 documents written
8000 documents written
9000 documents written
10000 documents written
11000 documents written
12000 documents written
13000 documents writ

## Loading the saved frequencies

'movie-review-HW2/aclImdb/train/neg/1821_4.txt'

In [19]:
# Rebind `labeler` to the IMDB classes (replaces the action/comedy encoder).
labeler = Encoder(['pos', 'neg'])
print(labeler)
vocabulary = Encoder.open('movie-review-HW2/aclImdb/imdb.vocab')
print(vocabulary)

# Reload both corpora from the .NB files written earlier; frequencies=True is
# passed instead of a vocabulary when opening these saved files.
train_corpus = Corpus.open('movie-review-HW2/aclImdb/train-BOW.NB', frequencies=True, verbose=True)
print(train_corpus)
test_corpus = Corpus.open('movie-review-HW2/aclImdb/test-BOW.NB', frequencies=True, verbose=True) 
print(test_corpus)

Encoder(tokens=2, sample=['neg', 'pos'])
Encoder(tokens=89527, sample=['virology', 'hi-jacking', 'holidays', 'sachs', 'loring', 'solarisation'])
1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded
15000 documents loaded
16000 documents loaded
17000 documents loaded
18000 documents loaded
19000 documents loaded
20000 documents loaded
21000 documents loaded
22000 documents loaded
23000 documents loaded
24000 documents loaded
Corpus(documents=25000, tokens=87884, words=5871238))
1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 document

## Training the model

In [20]:
# Rebind `model` to a new model for the IMDB data and fit it on the reloaded
# training corpus.
# NOTE(review): log=True presumably switches the model to log-probabilities — confirm in src.model.
model = Model(vocabulary, labeler, log=True)
model.fit(train_corpus, train_corpus.labels(), verbose=True)
print(model)

1000 documents fitted
2000 documents fitted
3000 documents fitted
4000 documents fitted
5000 documents fitted
6000 documents fitted
7000 documents fitted
8000 documents fitted
9000 documents fitted
10000 documents fitted
11000 documents fitted
12000 documents fitted
13000 documents fitted
14000 documents fitted
15000 documents fitted
16000 documents fitted
17000 documents fitted
18000 documents fitted
19000 documents fitted
20000 documents fitted
21000 documents fitted
22000 documents fitted
23000 documents fitted
24000 documents fitted
Model(vocabulary=Encoder(tokens=89527, sample=['virology', 'hi-jacking', 'holidays', 'sachs', 'loring', 'solarisation']), labeler=Encoder(tokens=2, sample=['neg', 'pos']))


## Predicting

In [21]:
from src.metrics import Metrics

# Predict the whole test corpus and score the decoded predictions against the
# true labels; the printed score holds 'accuracy' and 'confusion' entries.
predictions = model.predict(test_corpus)
score = Metrics.score(test_corpus.labels(), labeler.decode(predictions), labeler)
print(score)

{'accuracy': 0.81464, 'confusion': [[11041, 1459], [3175, 9325]]}


In [29]:
# Print the confusion matrix as a table of true vs. predicted labels.
# NOTE(review): the recorded table below does not match the counts printed by
# the previous cell, and this cell's execution count is out of sequence —
# presumably the two outputs come from different runs. Re-run to confirm.
print(score['confusion'])

true\predicted |  pos |   neg
---------------+------+------
           pos | 9343 |  3157
           neg | 1531 | 10969


In [27]:
# Source paths of the test reviews labelled 'pos' that were predicted as 'neg'.
false_negatives = [
    review.source
    for review, expected, predicted in zip(
        test_corpus,
        test_corpus.labels(),
        labeler.decode(predictions),
    )
    if expected == 'pos' and predicted == 'neg'
]

In [28]:
# Display the source path of the first false negative.
false_negatives[0]

'movie-review-HW2/aclImdb/test/pos/3205_9.txt'

In [67]:
# Read the first 200 characters of one negative test review with every
# occurrence of 'joke' removed. The context manager closes the file handle,
# which the bare open(...).read() form left to the garbage collector.
with open('movie-review-HW2/aclImdb/test/neg/12179_2.txt') as review_file:
    text = review_file.read()[0:200].replace('joke', '')
# NOTE(review): `text` is not used below; the probe document is built from the
# literal 'staff'. Confirm which input was intended.
document = Document('staff', labeler, vocabulary)
document.label = 'pos'
# Predict once and display the decoded label as the cell's result (the
# original ran the prediction twice and discarded the first result).
model.labeler.decode(model.predict(document))

'neg'

In [29]:
# Display the source path of another false negative (index 100).
false_negatives[100]

'movie-review-HW2/aclImdb/test/pos/2823_10.txt'

In [38]:
# (label, source path) pairs for the test reviews labelled 'neg' that were
# predicted as 'pos'.
false_positives = [
    (review.label, review.source)
    for review, expected, predicted in zip(
        test_corpus,
        test_corpus.labels(),
        labeler.decode(predictions),
    )
    if expected == 'neg' and predicted == 'pos'
]

In [40]:
# Display the (label, source path) pair of the first false positive.
false_positives[0]

('neg', 'movie-review-HW2/aclImdb/test/neg/1821_4.txt')

In [70]:
# Pick one false positive, print its true label and path, then display the
# raw review text.
y_true, filename = false_positives[151]
print(y_true, filename)
# Read inside a context manager so the file handle is closed deterministically.
with open(filename) as review_file:
    review_text = review_file.read()
# A bare last expression makes the review text the cell's displayed result.
review_text

neg movie-review-HW2/aclImdb/test/neg/240_4.txt


'There must be an error. This movie belongs with "Plan 9", and a lot others as a quite entertaining, silly diversion. You\'ll never accept you like it, yet you will watch it whenever it comes out on TV. It\'s as simple as that.'

In [71]:
# Rebuild both corpora from the raw reviews with ngrams=2 and persist them to
# separate *.2ngrams.NB files. This rebinds `train_corpus` and `test_corpus`.
vocabulary = Encoder.open('movie-review-HW2/aclImdb/imdb.vocab')
print(vocabulary)

train_corpus = Corpus.open('movie-review-HW2/aclImdb/train/**/*.txt',
    ngrams=2,
    vocabulary=vocabulary,
    verbose=True,
)
print(train_corpus)
train_corpus.write('movie-review-HW2/aclImdb/train-BOW.2ngrams.NB', verbose=True)

test_corpus = Corpus.open('movie-review-HW2/aclImdb/test/**/*.txt',
    ngrams=2,
    vocabulary=vocabulary,
    verbose=True
)
print(test_corpus)
test_corpus.write('movie-review-HW2/aclImdb/test-BOW.2ngrams.NB', verbose=True)

Encoder(tokens=89527, sample=['virology', 'hi-jacking', 'holidays', 'sachs', 'loring', 'solarisation'])
1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded
15000 documents loaded
16000 documents loaded
17000 documents loaded
18000 documents loaded
19000 documents loaded
20000 documents loaded
21000 documents loaded
22000 documents loaded
23000 documents loaded
24000 documents loaded
Corpus(documents=25000, tokens=1420894, words=5896238))
1000 documents written
2000 documents written
3000 documents written
4000 documents written
5000 documents written
6000 documents written
7000 documents written
8000 documents written
9000 documents written
10000 documents written
11000 documents written
12000 documents written
13000 documents wr

In [75]:
# Print the summary of the training corpus built with ngrams=2.
print(train_corpus)

Corpus(documents=25000, tokens=1420894, words=5896238))


In [78]:
# Build a vocabulary encoder from the keys of the training corpus' frequency table.
bigram_vocabulary = Encoder(list(train_corpus.frequencies.keys()))

In [80]:
# Print the vocabulary encoder derived from the corpus frequencies.
print(bigram_vocabulary)

Encoder(tokens=1420894, sample=['growing sadly', 'overall horror', '84 and', 'older shows', 'was dirtying', 'old re-runs'])


In [85]:
from src.metrics import Metrics

# Predict the test corpus and score the decoded predictions against the true labels.
# NOTE(review): this cell's execution count (85) is higher than those of the
# reload and fit cells that appear after it (83, 84), so the recorded output
# presumably comes from the model fitted there. In top-to-bottom order `model`
# is still the one fitted earlier. Confirm, and consider moving this cell
# below the fit cell.
predictions = model.predict(test_corpus)
score = Metrics.score(test_corpus.labels(), labeler.decode(predictions), labeler)
print(score)

{'accuracy': 0.87928, 'confusion': [[11532, 968], [2050, 10450]]}


In [82]:
# Display the current label encoder.
labeler

{'neg', 'pos'}

In [83]:
# Recreate the label and vocabulary encoders, then reload both corpora from
# the *.2ngrams.NB files written earlier (frequencies=True is passed instead
# of a vocabulary).
labeler = Encoder(['pos', 'neg'])
print(labeler)
vocabulary = Encoder.open('movie-review-HW2/aclImdb/imdb.vocab')
print(vocabulary)

train_corpus = Corpus.open('movie-review-HW2/aclImdb/train-BOW.2ngrams.NB', frequencies=True, verbose=True)
print(train_corpus)
test_corpus = Corpus.open('movie-review-HW2/aclImdb/test-BOW.2ngrams.NB', frequencies=True, verbose=True) 
print(test_corpus)

Encoder(tokens=2, sample=['neg', 'pos'])
Encoder(tokens=89527, sample=['virology', 'hi-jacking', 'holidays', 'sachs', 'loring', 'solarisation'])
1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded
15000 documents loaded
16000 documents loaded
17000 documents loaded
18000 documents loaded
19000 documents loaded
20000 documents loaded
21000 documents loaded
22000 documents loaded
23000 documents loaded
24000 documents loaded
Corpus(documents=25000, tokens=1420894, words=5896238))
1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 docume

In [84]:
# Rebind `vocabulary` to an encoder built from the keys of the reloaded
# corpus' frequency table; this replaces the vocabulary loaded from imdb.vocab.
vocabulary = Encoder(list(train_corpus.frequencies.keys()))
# Fit a fresh model on the reloaded training corpus with that vocabulary.
model = Model(vocabulary, labeler, log=True)
model.fit(train_corpus, train_corpus.labels(), verbose=True)
print(model)

1000 documents fitted
2000 documents fitted
3000 documents fitted
4000 documents fitted
5000 documents fitted
6000 documents fitted
7000 documents fitted
8000 documents fitted
9000 documents fitted
10000 documents fitted
11000 documents fitted
12000 documents fitted
13000 documents fitted
14000 documents fitted
15000 documents fitted
16000 documents fitted
17000 documents fitted
18000 documents fitted
19000 documents fitted
20000 documents fitted
21000 documents fitted
22000 documents fitted
23000 documents fitted
24000 documents fitted
Model(vocabulary=Encoder(tokens=1420894, sample=['growing sadly', 'overall horror', '84 and', 'older shows', 'was dirtying', 'old re-runs']), labeler=Encoder(tokens=2, sample=['neg', 'pos']))
