In [6]:
%load_ext autoreload
%autoreload 2

import importlib
from src.encoder import Encoder

labeler = Encoder(['action', 'comedy'])
print(labeler)
vocabulary = Encoder.open('movie-review-small/aclImdb/imdb.vocab')
print(vocabulary)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Encoder(tokens=2, sample=['action', 'comedy'])
Encoder(tokens=7, sample=['furious', 'love', 'fast', 'fly', 'fun', 'shoot', 'couple'])


In [7]:
import importlib
from src.structures.corpus import Corpus
from src.structures.document import Document

train_corpus = Corpus.open('movie-review-small/aclImdb/train/**/*.txt', vocabulary=vocabulary, verbose=True)
train_corpus.write('movie-review-small/aclImdb/train.NB', verbose=True)
print(train_corpus, train_corpus.labels())

Corpus(documents=5, tokens=7, words=20)) ['comedy', 'comedy', 'action', 'action', 'action']


In [8]:
import importlib
from src.model import Model

model = Model(vocabulary, labeler)
model.fit(train_corpus, train_corpus.labels())
model.summary()

c(C)   | value
-------+------
action |     3
comedy |     2
p(C)   | value
-------+------
action |  None
comedy |  None
c(t,C) | furious | love | fast | fly | fun | shoot | couple
-------+---------+------+------+-----+-----+-------+-------
action |       2 |    1 |    2 |   1 |   1 |     4 |      0
comedy |       0 |    2 |    1 |   1 |   3 |     0 |      2
P(t|C) | furious | love | fast |  fly |  fun | shoot | couple
-------+---------+------+------+------+------+-------+-------
action |    None | None | None | None | None |  None |   None
comedy |    None | None | None | None | None |  None |   None


In [9]:
test_corpus = Corpus.open('movie-review-small/aclImdb/test/**/*.txt', vocabulary=vocabulary, verbose=True)
test_corpus.write('movie-review-small/aclImdb/test.NB', verbose=True)
print(test_corpus, test_corpus.labels())

Corpus(documents=1, tokens=4, words=4)) ['action']


In [12]:
from src.metrics import Metrics

predictions = model.predict(test_corpus, verbose=True)
print(f'predict({test_corpus})', predictions, labeler.decode(predictions))
Metrics.score(test_corpus.labels(), labeler.decode(predictions))

model.summary()

predict(Corpus(documents=1, tokens=4, words=4))) [0] ['action']
c(C)   | value
-------+------
action |     3
comedy |     2
p(C)   | value
-------+------
action |   0.6
comedy |   0.4
c(t,C) | furious | love | fast | fly | fun | shoot | couple
-------+---------+------+------+-----+-----+-------+-------
action |       2 |    1 |    2 |   1 |   1 |     4 |      0
comedy |       0 |    2 |    1 |   1 |   3 |     0 |      2
P(t|C) | furious | love |                fast |                fly |  fun |              shoot |              couple
-------+---------+------+---------------------+--------------------+------+--------------------+--------------------
action |    None | None | 0.16666666666666666 | 0.1111111111111111 | None | 0.2777777777777778 | 0.05555555555555555
comedy |    None | None |               0.125 |              0.125 | None |             0.0625 |              0.1875


Airwolf The Movie, A variation on the original 2 part pilot, Yet the movie although shorter, does contain extra footage Unseen in the 2 hour pilot The pilot is much more of a pilot than the movie Where as a pilot movie is normally the same (2 parter combined) But the movie is actually a different edit with extras here and cuts there.<br /><br />Worth a look, even if you have the season 1 DVD set, I'd still pick up a copy of the "movie" It's still in some shops like virgin, Woolworths and the likes of mixed media stores, although it generally needs ordering, But it saves needing to buy online (as many of us still don't do or trust online shopping) but if you look around airwolfs in stores<br /><br />Airwolf was truly 1 of the 80's most under rated shows.<br /><br />A full size Airwolf is currently being re-built for a Helicopter Museum :) Info and work in progress pictures are over at http://Airwolf.org Also with Airwolf Mods for Flashpoint and Flight Sim Games It seams she's finally here to stay :)

In [13]:
import csv
from src.structures.bag_of_words import BagOfWords

with open('src/data/acronyms.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)
    acronyms = { acronym: meaning for acronym, meaning in reader }

with open('src/data/smileys.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)
    smileys = { smiley: f'{labeler.decode(int(bias))}' for smiley, bias in reader }

with open('src/data/positive-words.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    positive_words = { positive_word: f'{labeler.decode(int(bias))}' for positive_word, bias in reader }

with open('src/data/negative-words.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    negative_words = { negative_word: f'{labeler.decode(int(bias))}' for negative_word, bias in reader }

with open('src/data/negation.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)
    negations = { negation: token for negation, token in reader }

with open('src/data/stopwords.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)
    stopwords = { word for [word] in reader }

bow = BagOfWords.open('movie-review-HW2/aclImdb/train/pos/6770_10.txt', expansions=acronyms, replacements={ **smileys, **positive_words, **negative_words, **negations}, ignored=stopwords, verbose=False)
print(bow)

{'airwolf': 4, 'movie,': 1, 'a': 8, 'variation': 1, 'original': 1, 'part': 1, 'pilot,': 1, 'movie': 4, 'shorter,': 1, 'extra': 1, 'footage': 1, 'unseen': 1, 'hour': 1, 'pilot': 4, '(2': 1, 'parter': 1, 'combined)': 1, 'edit': 1, 'extras': 1, 'cuts': 1, 'there.': 1, 'comedy': 8, 'look,': 1, 'season': 1, '1': 2, 'dvd': 1, 'set,': 1, 'pick': 1, 'copy': 1, '"movie"': 1, 'shops': 1, 'virgin,': 1, 'woolworths': 1, 'mixed': 1, 'media': 1, 'stores,': 1, 'generally': 1, 'ordering,': 1, 'saves': 1, 'needing': 1, 'buy': 1, 'online': 2, '(as': 1, '||not||': 1, 'shopping)': 1, 'airwolfs': 1, 'stores': 1, "80's": 1, 'rated': 1, 'shows.': 1, 'full': 1, 'size': 1, 're-built': 1, 'helicopter': 1, 'museum': 1, 'info': 1, 'pictures': 1, '||url||': 1, 'mods': 1, 'flashpoint': 1, 'flight': 1, 'sim': 1, 'games': 1, 'seams': 1, 'finally': 1, 'stay': 1}


In [26]:
vocabulary = Encoder.open('movie-review-HW2/aclImdb/imdb.vocab') #, include=['||pos||', '||neg||', '||url||', '||email||', '||not||'])
print(vocabulary)

train_corpus = Corpus.open('movie-review-HW2/aclImdb/train/**/*.txt',
    vocabulary=vocabulary,
    # replacements= {**smileys, **negative_words, **positive_words, **negations },
    # expansions=acronyms,
    # ignored=stopwords,
    verbose=True
)
print(train_corpus)
train_corpus.write('movie-review-HW2/aclImdb/train-BOW.NB', verbose=True)

test_corpus = Corpus.open('movie-review-HW2/aclImdb/test/**/*.txt',
    vocabulary=vocabulary,
    # replacements= {**smileys, **negative_words, **positive_words, **negations },
    # expansions=acronyms,
    # ignored=stopwords,
    verbose=True
)
print(test_corpus)
test_corpus.write('movie-review-HW2/aclImdb/test-BOW.NB', verbose=True)

Encoder(tokens=89527, sample=['cinematographical', 'unday', 'infusing', 'popcorn', 'sharecropper', 'militia-style'])
1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded
15000 documents loaded
16000 documents loaded
17000 documents loaded
18000 documents loaded
19000 documents loaded
20000 documents loaded
21000 documents loaded
22000 documents loaded
23000 documents loaded
24000 documents loaded
Corpus(documents=25000, tokens=87884, words=5871238))
1000 documents written
2000 documents written
3000 documents written
4000 documents written
5000 documents written
6000 documents written
7000 documents written
8000 documents written
9000 documents written
10000 documents written
11000 documents written
12000 documents written
13000 d

In [14]:
labeler = Encoder(['pos', 'neg'])
vocabulary = Encoder.open('movie-review-HW2/aclImdb/imdb.vocab') #, include=['||pos||', '||neg||', '||url||', '||email||', '||not||'])

train_corpus = Corpus.open('movie-review-HW2/aclImdb/train-BOW.NB', frequencies=True, verbose=True)
print(train_corpus)
test_corpus = Corpus.open('movie-review-HW2/aclImdb/test-BOW.NB', frequencies=True, verbose=True) 
print(test_corpus)

model = Model(vocabulary, labeler, log=True)
model.fit(train_corpus, train_corpus.labels(), verbose=True)
print(model)

predictions = model.predict(test_corpus, verbose=True, debug=False)
score = Metrics.score(test_corpus.labels(), labeler.decode(predictions))
print(score)

1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded
15000 documents loaded
16000 documents loaded
17000 documents loaded
18000 documents loaded
19000 documents loaded
20000 documents loaded
21000 documents loaded
22000 documents loaded
23000 documents loaded
24000 documents loaded
Corpus(documents=25000, tokens=87884, words=5871238))
1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded
15000 documents loaded
16000 documents loaded
17000 documents loaded
18000 documents loade

In [19]:
labeler = Encoder(['pos', 'neg'])

train_corpus = Corpus.open('movie-review-HW2/aclImdb/train-BOW.NB', frequencies=True, verbose=True)
print(train_corpus)
test_corpus = Corpus.open('movie-review-HW2/aclImdb/test-BOW.NB', frequencies=True, verbose=True) 
print(test_corpus)

vocabulary = Encoder(list(train_corpus.frequencies.keys()))#, include=['||pos||', '||neg||', '||url||', '||email||', '||not||'])

model = Model(vocabulary, labeler, log=True)
model.fit(train_corpus, train_corpus.labels(), verbose=True)
print(model)

predictions = model.predict(test_corpus, verbose=True, debug=False)
score = Metrics.score(test_corpus.labels(), labeler.decode(predictions))
print(score)

1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded
15000 documents loaded
16000 documents loaded
17000 documents loaded
18000 documents loaded
19000 documents loaded
20000 documents loaded
21000 documents loaded
22000 documents loaded
23000 documents loaded
24000 documents loaded
Corpus(documents=25000, tokens=87884, words=5871238))
1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded
15000 documents loaded
16000 documents loaded
17000 documents loaded
18000 documents loade

In [20]:
vocabulary = Encoder.open('movie-review-HW2/aclImdb/imdb.vocab', include=['||pos||', '||neg||', '||url||', '||email||', '||not||'])
print(vocabulary)

train_corpus = Corpus.open('movie-review-HW2/aclImdb/train/**/*.txt',
    vocabulary=vocabulary,
    replacements= {**smileys, **negative_words, **positive_words, **negations },
    expansions=acronyms,
    ignored=stopwords,
    verbose=True
)
print(train_corpus)
train_corpus.write('movie-review-HW2/aclImdb/train-modified.NB')

test_corpus = Corpus.open('movie-review-HW2/aclImdb/test/**/*.txt',
    vocabulary=vocabulary,
    replacements= {**smileys, **negative_words, **positive_words, **negations },
    expansions=acronyms,
    ignored=stopwords,
    verbose=True
)
print(test_corpus)
test_corpus.write('movie-review-HW2/aclImdb/test-modified.NB', verbose=True)

Encoder(tokens=89532, sample=['artful', 'extirpate', 'muffin', 'snug', 'grue', 'barnaby'])
1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded
15000 documents loaded
16000 documents loaded
17000 documents loaded
18000 documents loaded
19000 documents loaded
20000 documents loaded
21000 documents loaded
22000 documents loaded
23000 documents loaded
24000 documents loaded
Corpus(documents=25000, tokens=86055, words=2880487))
1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded

In [21]:
labeler = Encoder(['pos', 'neg'])

train_corpus = Corpus.open('movie-review-HW2/aclImdb/train-modified.NB', frequencies=True, verbose=True)
print(train_corpus)
test_corpus = Corpus.open('movie-review-HW2/aclImdb/test-modified.NB', frequencies=True, verbose=True) 
print(test_corpus)

model = Model(vocabulary, labeler, log=True)
model.fit(train_corpus, train_corpus.labels(), verbose=True)
print(model)

predictions = model.predict(test_corpus, verbose=True, debug=False)
score = Metrics.score(test_corpus.labels(), labeler.decode(predictions))
print(score)

1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded
15000 documents loaded
16000 documents loaded
17000 documents loaded
18000 documents loaded
19000 documents loaded
20000 documents loaded
21000 documents loaded
22000 documents loaded
23000 documents loaded
24000 documents loaded
Corpus(documents=25000, tokens=86055, words=2880487))
1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded
15000 documents loaded
16000 documents loaded
17000 documents loaded
18000 documents loade

In [22]:
labeler = Encoder(['pos', 'neg'])

train_corpus = Corpus.open('movie-review-HW2/aclImdb/train-modified.NB', frequencies=True, verbose=True)
print(train_corpus)
test_corpus = Corpus.open('movie-review-HW2/aclImdb/test-modified.NB', frequencies=True, verbose=True) 
print(test_corpus)

vocabulary = Encoder(list(train_corpus.frequencies.keys()))#, include=['||pos||', '||neg||', '||url||', '||email||', '||not||'])

model = Model(vocabulary, labeler, log=True)
model.fit(train_corpus, train_corpus.labels(), verbose=True)
print(model)

predictions = model.predict(test_corpus, verbose=True, debug=False)
score = Metrics.score(test_corpus.labels(), labeler.decode(predictions))
print(score)

1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded
15000 documents loaded
16000 documents loaded
17000 documents loaded
18000 documents loaded
19000 documents loaded
20000 documents loaded
21000 documents loaded
22000 documents loaded
23000 documents loaded
24000 documents loaded
Corpus(documents=25000, tokens=86055, words=2880487))
1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded
15000 documents loaded
16000 documents loaded
17000 documents loaded
18000 documents loade

In [23]:
vocabulary = Encoder.open('movie-review-HW2/aclImdb/imdb.vocab', include=['||pos||', '||neg||', '||url||', '||email||', '||not||'])
print(vocabulary)

train_corpus = Corpus.open('movie-review-HW2/aclImdb/train/**/*.txt',
    vocabulary=vocabulary,
    # replacements= {**smileys, **negative_words, **positive_words, **negations },
    # expansions=acronyms,
    ignored=stopwords,
    verbose=True
)
print(train_corpus)
train_corpus.write('movie-review-HW2/aclImdb/train-stopwords.NB')

test_corpus = Corpus.open('movie-review-HW2/aclImdb/test/**/*.txt',
    vocabulary=vocabulary,
    # replacements= {**smileys, **negative_words, **positive_words, **negations },
    # expansions=acronyms,
    ignored=stopwords,
    verbose=True
)
print(test_corpus)
test_corpus.write('movie-review-HW2/aclImdb/test-stopwords.NB', verbose=True)

Encoder(tokens=89532, sample=['artful', 'extirpate', 'muffin', 'snug', 'grue', 'barnaby'])
1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded
15000 documents loaded
16000 documents loaded
17000 documents loaded
18000 documents loaded
19000 documents loaded
20000 documents loaded
21000 documents loaded
22000 documents loaded
23000 documents loaded
24000 documents loaded
Corpus(documents=25000, tokens=87819, words=2775270))
1000 documents loaded
2000 documents loaded
3000 documents loaded
4000 documents loaded
5000 documents loaded
6000 documents loaded
7000 documents loaded
8000 documents loaded
9000 documents loaded
10000 documents loaded
11000 documents loaded
12000 documents loaded
13000 documents loaded
14000 documents loaded

In [24]:
labeler = Encoder(['pos', 'neg'])

train_corpus = Corpus.open('movie-review-HW2/aclImdb/train-stopwords.NB', frequencies=True, verbose=False)
print(train_corpus)
test_corpus = Corpus.open('movie-review-HW2/aclImdb/test-stopwords.NB', frequencies=True, verbose=False) 
print(test_corpus)

model = Model(vocabulary, labeler, log=True)
model.fit(train_corpus, train_corpus.labels(), verbose=True)
print(model)

predictions = model.predict(test_corpus, verbose=True, debug=False)
score = Metrics.score(test_corpus.labels(), labeler.decode(predictions))
print(score)

Corpus(documents=25000, tokens=87819, words=2775270))
Corpus(documents=25000, tokens=77910, words=2723924))
1000 documents fitted
2000 documents fitted
3000 documents fitted
4000 documents fitted
5000 documents fitted
6000 documents fitted
7000 documents fitted
8000 documents fitted
9000 documents fitted
10000 documents fitted
11000 documents fitted
12000 documents fitted
13000 documents fitted
14000 documents fitted
15000 documents fitted
16000 documents fitted
17000 documents fitted
18000 documents fitted
19000 documents fitted
20000 documents fitted
21000 documents fitted
22000 documents fitted
23000 documents fitted
24000 documents fitted
Model(vocabulary=Encoder(tokens=89532, sample=['artful', 'extirpate', 'muffin', 'snug', 'grue', 'barnaby']), labeler=Encoder(tokens=2, sample=['neg', 'pos']))
1000 documents predicted
2000 documents predicted
3000 documents predicted
4000 documents predicted
5000 documents predicted
6000 documents predicted
7000 documents predicted
8000 documents 