In [2]:
import os
import re
import numpy as np
import pandas as pd
from pandas import DataFrame
from scipy.stats import entropy
from nltk.corpus import PlaintextCorpusReader

# run "pip install -U shorttext" to install shorttext
# shorttext also needs the spaCy "en" model; download it with "python -m spacy download en"
from shorttext.utils import DocumentTermMatrix, standard_text_preprocessor_1

import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.


#### Define functions

In [3]:
# Shannon entropy expressed in bits (logarithm base 2)
def entropy_2(p):
    """Return the entropy of ``p`` in bits.

    Forwards ``p`` unchanged to ``scipy.stats.entropy`` with ``base=2``.
    """
    bits = entropy(p, base=2)
    return bits

# find out if the document is a Berardinelli document
# (file name is exactly four digits followed by ".txt")
def is_bern(doc_name: str):
    """Match ``doc_name`` against the four-digit ``NNNN.txt`` naming pattern.

    Parameters
    ----------
    doc_name : str
        File id of the document.

    Returns
    -------
    re.Match or None
        A match object (truthy) when the name consists of four digits
        followed by ``.txt``; ``None`` otherwise.
    """
    # Raw string: '\d' and '\.' are invalid escapes in a normal string literal
    # and trigger a warning on recent Python versions.
    return re.search(r'^\d{4}\.txt$', doc_name)

#### Define system constants

In [4]:
# working directory, located under the current user's home directory
# expanduser('~') is used rather than os.environ['HOME'] so that the lookup
# does not raise KeyError when the HOME variable is not set.
pwd: str = os.path.expanduser('~') + '/work/assignment/assignment-7'

In [5]:
# folder holding the review files, plus the expected number of documents
# per author (used later when building the word/author count table)
corpus_root: str = f'{pwd}/MovieReviews'
bern_file_count: int = 80    # documents whose names satisfy is_bern
sch_file_count: int = 100    # all remaining documents
total_file_count: int = sum((bern_file_count, sch_file_count))

#### Read all the files

In [6]:
# open every file under corpus_root as a plain-text document (latin-1 encoded)
files: PlaintextCorpusReader = PlaintextCorpusReader(
    corpus_root,
    '.*',
    encoding='latin-1',
)
print('Total files:', len(files.fileids()))

# raw text of each document, in the same order as files.fileids()
file_contents: list = list(map(files.raw, files.fileids()))
print(len(file_contents))

Total files: 180


#### Standard_text_preprocessor_1 provides a standard way of text pre-processing:
- removing special characters,
- removing numerals,
- converting all letters to lowercase,
- removing stop words, and
- stemming the words (using Porter stemmer).

In [8]:
# build the preprocessing pipeline once, then turn each document's text
# into a list of tokens by splitting the cleaned text on single spaces
preprocessor = standard_text_preprocessor_1()
corpus: list = [
    preprocessor(document_text).split(' ')
    for document_text in file_contents
]

#### Convert the corpus to a document term matrix 
- each row represents a document
- each column represents a word; each cell holds the number of occurrences of that word in the document

In [9]:
# document-term matrix over the tokenized corpus, with the file ids as document ids
dtm: DocumentTermMatrix = DocumentTermMatrix(
    corpus,
    docids=files.fileids(),
)

#### Function to calculate Mutual Information for a given word against the author

In [10]:
def calc_mi(_word: str):
    """Mutual information (in bits) between a word's presence and the author.

    Builds a 2x2 table of document counts -- rows: Berardinelli / other
    author, columns: word present / word absent -- and returns
    ``H(author) - H(author | word present or absent)``.

    Relies on the module-level ``dtm``, ``is_bern``, ``entropy_2`` and the
    ``bern_file_count`` / ``sch_file_count`` / ``total_file_count`` constants.
    NOTE(review): the result is only meaningful if those constants match the
    number of files actually loaded -- confirm against the corpus.

    Parameters
    ----------
    _word : str
        Token to look up in ``dtm``.

    Returns
    -------
    float
        Marginal author entropy minus the conditional entropy given the
        word's presence/absence.
    """
    # ids of the documents in which the word occurs
    _word_docs: list = list(dtm.get_token_occurences(_word).keys())
    _bern_doc_count: int = sum(1 for doc in _word_docs if is_bern(doc))
    _bern_no_doc_count: int = bern_file_count - _bern_doc_count
    # every listed document that is not a Berardinelli one belongs to the other author
    _sch_doc_count: int = len(_word_docs) - _bern_doc_count
    _sch_no_doc_count: int = sch_file_count - _sch_doc_count

    # rows: author, columns: (word present, word absent)
    _matrix = np.array([
        [_bern_doc_count, _bern_no_doc_count],
        [_sch_doc_count, _sch_no_doc_count],
    ])
    # entropy of the author distribution (row totals)
    _marginal_entropy = entropy_2(np.sum(_matrix, axis=1))

    # H(author | column), weighted by the probability of each column.
    # A column with no documents (e.g. "absent" for a word that occurs in
    # every document) has undefined entropy (NaN); its weight is zero, so it
    # is skipped instead of letting 0 * NaN turn the whole result into NaN.
    _conditional_entropy = 0
    for _column_total, _column in zip(np.sum(_matrix, axis=0), _matrix.T):
        if _column_total == 0:
            continue
        _conditional_entropy += (_column_total / total_file_count) * entropy_2(_column)

    _mi = _marginal_entropy - _conditional_entropy
    return _mi

#### Get all distinct words from the corpus & calculate MI & store it in a vector

In [19]:
# flatten the per-review token lists, then de-duplicate to get the vocabulary
all_words: list = []
for review_tokens in corpus:
    all_words.extend(review_tokens)
distinct_words: set = set(all_words)
print('Word Set:', len(distinct_words))

Word Set: 10930


In [15]:
# compute the MI once per vocabulary word; the two lists stay index-aligned
word_arr, mi_arr = [], []

for word in distinct_words:
    mi = calc_mi(word)
    word_arr += [word]
    mi_arr += [mi]

print('Total MI:', len(mi_arr))

Total MI: 10930


#### Build a dataframe of the words with the MI value

In [16]:
# pair each word with its MI value and load the pairs into a two-column frame
word_mi_tuples: list = [*zip(word_arr, mi_arr)]
word_mi_df: DataFrame = DataFrame(
    data=word_mi_tuples,
    columns=['Word', 'MI'],
)
print(word_mi_df.shape)

(10930, 2)


#### Sort by MI & extract the top 10 words having the highest MI

In [18]:
# order the words from highest to lowest MI and keep the first ten rows
word_mi_df_sorted = word_mi_df.sort_values(by='MI', ascending=False)
word_mi_df_top_10 = word_mi_df_sorted.iloc[:10]
print(word_mi_df_top_10)

             Word        MI
9472     schwartz  0.991076
1429       reserv  0.869352
7268        denni  0.806275
8900       review  0.643569
4936        right  0.490622
8778        howev  0.423902
4556       releas  0.418957
10116        cast  0.392004
704    screenplay  0.385246
7113       produc  0.372456


#### What would it mean for a word to have high MI with the document author?
- Conditional entropy is used to identify syntagmatic relationships between words in a given context; in this case, between a word and the author.
- Conditional entropy is absolute & cannot be used to compare across different words.
- Mutual information helps normalize the conditional entropy value across words so it becomes comparable.
- MI indicates a reduction in entropy, & a higher reduction in entropy indicates a strong relationship between the word & the author.
- In the list of top 10 words, the words schwartz, denni, reserv, review and right all appear with high MI, since every review by Schwartz ends with footer text containing the copyright notice.
- So, the appearance of those words will have a strong indication that the review's author was Schwartz.
