In [94]:
import os
import numpy as np
from scipy.stats import entropy
from nltk.corpus import PlaintextCorpusReader

# run "pip install -U shorttext" to install shorttext
# shorttext also needs the spaCy package "en"; download it with "python -m spacy download en"
from shorttext.utils import DocumentTermMatrix, standard_text_preprocessor_1

import warnings
warnings.filterwarnings("ignore")

In [95]:
# Entropy expressed in bits (logarithm base 2)
def entropy_2(p):
    """Compute the entropy of ``p`` using logarithm base 2.

    Thin wrapper around ``scipy.stats.entropy``; ``p`` is forwarded
    unchanged as the distribution argument.

    Args:
        p: Sequence of probabilities or raw counts. Per the SciPy
            documentation, ``entropy`` normalizes its input so that it
            sums to 1, so unnormalized counts are acceptable.

    Returns:
        The value of ``scipy.stats.entropy(p, base=2)``.
    """
    bits = entropy(p, base=2)
    return bits

In [96]:
# Working directory of this assignment, located under the current user's home.
# os.environ['HOME'] raises KeyError when HOME is not set.
home_dir: str = os.environ['HOME']
pwd: str = home_dir + '/work/assignment/assignment-7'

In [97]:
# Number of leading corpus files attributed to Berardinelli; the files after
# that position are attributed to Schwartz (see the slicing of fileids()).
bern_file_count: int = 80
# Folder that the corpus reader is pointed at
corpus_root: str = pwd + '/MovieReviews'

In [98]:
# Read every file under corpus_root ('.*' matches all file ids) as latin-1 text
files: PlaintextCorpusReader = PlaintextCorpusReader(corpus_root, '.*', encoding='latin-1')

# Split the file ids purely by position: the first `bern_file_count` ids are
# treated as Berardinelli reviews, everything after them as Schwartz reviews.
# NOTE(review): this depends on the order returned by fileids() - confirm that
# the Berardinelli files really come first.
all_fileids: list = files.fileids()
bern_files: list = all_fileids[:bern_file_count]
sch_files: list = all_fileids[bern_file_count:]

# Re-derive the counts from the actual slices rather than the configured value
sch_file_count: int = len(sch_files)
bern_file_count = len(bern_files)
total_file_count: int = bern_file_count + sch_file_count

print('Total files:', len(all_fileids),
      'Berardinelli files:', bern_file_count,
      'Schwartz files:', sch_file_count)

Total files: 180 Berardinelli files: 80 Schwartz files: 100


In [99]:
# Raw text of every corpus file, in the same order as files.fileids()
file_contents: list = list(map(files.raw, files.fileids()))
print(len(file_contents))

180


#### standard_text_preprocessor_1 provides a standard way of text pre-processing:
- removing special characters,
- removing numerals,
- converting all letters to lower case,
- removing stop words, and
- stemming the words (using Porter stemmer).

In [100]:
preprocessor = standard_text_preprocessor_1()
# Pre-process each document's raw text, then tokenize it by splitting on single spaces
corpus: list = [
    preprocessor(raw_text).split(' ')
    for raw_text in file_contents
]

#### Convert the corpus to a document term matrix 
- each row represents a document
- each column represents a word; each cell holds the number of occurrences of that word in the document

In [101]:
# Label each document in the matrix with its corpus file id
doc_ids: list = files.fileids()
dtm: DocumentTermMatrix = DocumentTermMatrix(corpus, docids=doc_ids)

#### Check the number of occurrences of the word 'director' in each document

In [102]:
# Look the token up once and derive both lists from the same mapping, so that
# director_docs[i] and director_counts[i] always describe the same document
# and the lookup is not repeated.
# The result is consumed as a mapping (its .keys() are used as document ids and
# its .values() as the occurrence counts of `director`).
director_occurrences = dtm.get_token_occurences('director')
director_counts: list = list(director_occurrences.values())
director_docs: list = list(director_occurrences.keys())

#### Calculate the entropy for the word 'director'

In [103]:
# Entropy (in bits) of the per-document occurrence counts of `director`
director_entropy = entropy_2(director_counts)
print('Entropy of `director` in the documents:', director_entropy)

Entropy of `director` in the documents: 7.146642117134548


#### Count the documents of each reviewer that contain (and do not contain) the word 'director'

In [104]:
# Per reviewer, count the ids in director_docs that belong to that reviewer's files.
# Summing booleans counts the True values.
bern_doc_count: int = sum(doc in bern_files for doc in director_docs)
sch_doc_count: int = sum(doc in sch_files for doc in director_docs)

# The remaining files of each reviewer are the ones not listed in director_docs
bern_no_doc_count: int = bern_file_count - bern_doc_count
sch_no_doc_count: int = sch_file_count - sch_doc_count

print('# of Berardinelli docs having `director`:', bern_doc_count)
print('# of Schwartz docs having `director`:', sch_doc_count)

# of Berardinelli docs having `director`: 77
# of Schwartz docs having `director`: 87


#### Build a 2x2 matrix with the counts

In [105]:
# Rows: reviewer (Berardinelli, Schwartz)
# Columns: documents having `director`, documents not having it
matrix = np.array([
    [bern_doc_count, bern_no_doc_count],
    [sch_doc_count, sch_no_doc_count],
])
print('Matrix of counts:', matrix)

Matrix of counts: [[77  3]
 [87 13]]


#### Calculate the entropy of the document class (marginal entropy)

In [106]:
# Row sums of the count matrix are the per-reviewer document totals
reviewer_totals = matrix.sum(axis=1)
marginal_entropy = entropy_2(reviewer_totals)
print('Marginal entropy:', marginal_entropy)

Marginal entropy: 0.9910760598382222


#### Calculate the column probabilities

In [107]:
# Column sums (documents with / without the word) as a share of all documents
column_totals = matrix.sum(axis=0)
column_prob = column_totals / total_file_count
print('Column probabilities:', column_prob)

Column probabilities: [0.91111111 0.08888889]


#### Calculate the conditional entropy of the document class given the presence or absence of the word

In [108]:
# Entropy of the reviewer split within each column of the count matrix
column_entropy = np.array([entropy_2(column) for column in matrix.T])
print('Column entropy:', column_entropy)

# Weight each column's entropy by that column's probability and add them up
conditional_entropy = sum(prob * ent for prob, ent in zip(column_prob, column_entropy))
print('Conditional entropy:', conditional_entropy)

Column entropy: [0.99731635 0.69621226]
Conditional entropy: 0.9705515397056739


#### Calculate the mutual information between the word 'director' and the document class

In [109]:
# Mutual information = marginal entropy of the document class minus the
# conditional entropy computed from the per-column entropies
mi = marginal_entropy - conditional_entropy
print('Mutual information:', mi)

Mutual information: 0.02052452013254835
