In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

##Word-Document and Word-Word Matrices

In [2]:
# Example documents (Shakespeare plays)
documents = {
    "As You Like It": "battle good fool wit love forest magic",
    "Twelfth Night": "good fool wit love comedy mistaken identity",
    "Julius Caesar": "battle battle battle good fool war rome politics",
    "Henry V": "battle battle battle battle good wit war king england"
}

In [8]:
def create_term_document_matrix(documents):
  """ Creates term-document matrix
  Rows represent words (terms)
  Columns represent docs
  Cells have the frequencies (counts)"""

  all_the_words = set()

  for doc in documents.values():
    all_the_words.update(doc.split())

  vocab = sorted(all_the_words)
  print(vocab)

  matrix = []

  for word in vocab:
    row = []

    for doc_name, doc_text in documents.items():
      count = doc_text.split().count(word)
      row.append(count)

    matrix.append(row)

  print(matrix)


  df = pd.DataFrame(matrix,
                    index=vocab,
                    columns=list(documents.keys()))

  return df

term_doc_matrix = create_term_document_matrix(documents)
print("\n", term_doc_matrix)

['battle', 'comedy', 'england', 'fool', 'forest', 'good', 'identity', 'king', 'love', 'magic', 'mistaken', 'politics', 'rome', 'war', 'wit']
[[1, 0, 3, 4], [0, 1, 0, 0], [0, 0, 0, 1], [1, 1, 1, 0], [1, 0, 0, 0], [1, 1, 1, 1], [0, 1, 0, 0], [0, 0, 0, 1], [1, 1, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0], [0, 0, 1, 1], [1, 1, 0, 1]]

           As You Like It  Twelfth Night  Julius Caesar  Henry V
battle                 1              0              3        4
comedy                 0              1              0        0
england                0              0              0        1
fool                   1              1              1        0
forest                 1              0              0        0
good                   1              1              1        1
identity               0              1              0        0
king                   0              0              0        1
love                   1              1              0        0
magic 

## TF-IDF


In [56]:
def compute_tf_idf(term_doc_matrix):
  """Compute Term Frequency × Inverse Document Frequency"""

  matrix = term_doc_matrix.values # Convert to numpy.ndarray to apply np.where easily

  n_docs = matrix.shape[1]

  print(np.where(matrix>0, 1,0))

  # Term freq
  tf = np.where(matrix > 0, 1 + np.log10(matrix), 0) #
  print("\nTerm Freq:\n", tf, "\n")

  # Doc freq (words appearing on how many docs)
  df = np.sum(matrix>0, axis=1)
  print("\nDoc Freq:\n", df, "\n")

  # Inverse doc freq
  idf = np.log10(n_docs/df)
  print("\nInverse Doc Freq without log(10):\n", n_docs/df, "\n")

  print("\nInverse Doc Freq (with log(10)):\n", idf, "\n")


  # TF-IDF
  idf = idf[:, np.newaxis] # Flatten idf
  print("\nBroadcasted Inverse Doc Freq to dimensionality (n,1):\n", idf, "\n")

  tf_idf = tf*idf
  print("\nTF-IDF:\n", idf, "\n")

  return pd.DataFrame(tf_idf, index=term_doc_matrix.index, columns=term_doc_matrix.columns)

compute_tf_idf(term_doc_matrix)

[[1 0 1 1]
 [0 1 0 0]
 [0 0 0 1]
 [1 1 1 0]
 [1 0 0 0]
 [1 1 1 1]
 [0 1 0 0]
 [0 0 0 1]
 [1 1 0 0]
 [1 0 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 1 0]
 [0 0 1 1]
 [1 1 0 1]]

Term Freq:
 [[1.         0.         1.47712125 1.60205999]
 [0.         1.         0.         0.        ]
 [0.         0.         0.         1.        ]
 [1.         1.         1.         0.        ]
 [1.         0.         0.         0.        ]
 [1.         1.         1.         1.        ]
 [0.         1.         0.         0.        ]
 [0.         0.         0.         1.        ]
 [1.         1.         0.         0.        ]
 [1.         0.         0.         0.        ]
 [0.         1.         0.         0.        ]
 [0.         0.         1.         0.        ]
 [0.         0.         1.         0.        ]
 [0.         0.         1.         1.        ]
 [1.         1.         0.         1.        ]] 


Doc Freq:
 [3 1 1 3 1 4 1 1 2 1 1 1 1 2 3] 


Inverse Doc Freq without log(10):
 [array([1.33333333, 4.        ,

  tf = np.where(matrix > 0, 1 + np.log10(matrix), 0) #


Unnamed: 0,As You Like It,Twelfth Night,Julius Caesar,Henry V
battle,0.124939,0.0,0.18455,0.200159
comedy,0.0,0.60206,0.0,0.0
england,0.0,0.0,0.0,0.60206
fool,0.124939,0.124939,0.124939,0.0
forest,0.60206,0.0,0.0,0.0
good,0.0,0.0,0.0,0.0
identity,0.0,0.60206,0.0,0.0
king,0.0,0.0,0.0,0.60206
love,0.30103,0.30103,0.0,0.0
magic,0.60206,0.0,0.0,0.0


array([3, 1, 1, 3, 1, 4, 1, 1, 2, 1, 1, 1, 1, 2, 3])