In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

## Word-Document and Word-Word Matrices

In [86]:
# Toy corpus: four Shakespeare plays, each reduced to a short bag of keywords.
# Keys are play titles; values are whitespace-separated tokens.
documents = dict([
    ("As You Like It", "battle good fool wit love forest magic"),
    ("Twelfth Night", "good fool wit love comedy mistaken identity"),
    ("Julius Caesar", "battle battle battle good fool war rome politics"),
    ("Henry V", "battle battle battle battle good wit war king england"),
])

In [8]:
def create_term_document_matrix(documents, verbose=True):
  """Build a term-document count matrix.

  Args:
    documents: Mapping of document name -> text. Text is tokenised by
      splitting on whitespace.
    verbose: When True (the default), print the sorted vocabulary and the
      raw nested-list matrix before returning.

  Returns:
    pandas.DataFrame with one row per vocabulary word (sorted), one column
    per document (in mapping order), and the word's count in each cell.
  """
  # Tokenise and count every document exactly once, instead of re-splitting
  # the text for each vocabulary word.
  doc_counts = {name: Counter(text.split()) for name, text in documents.items()}

  # The vocabulary is the union of the words seen in any document.
  vocab = sorted(set().union(*doc_counts.values()))
  if verbose:
    print(vocab)

  # Counter returns 0 for a missing word, so absent terms need no special case.
  matrix = [[counts[word] for counts in doc_counts.values()] for word in vocab]
  if verbose:
    print(matrix)

  return pd.DataFrame(matrix,
                      index=vocab,
                      columns=list(documents.keys()))

# Count matrix for the example plays, printed as plain text below.
term_doc_matrix = create_term_document_matrix(documents=documents)
print("\n", term_doc_matrix)

['battle', 'comedy', 'england', 'fool', 'forest', 'good', 'identity', 'king', 'love', 'magic', 'mistaken', 'politics', 'rome', 'war', 'wit']
[[1, 0, 3, 4], [0, 1, 0, 0], [0, 0, 0, 1], [1, 1, 1, 0], [1, 0, 0, 0], [1, 1, 1, 1], [0, 1, 0, 0], [0, 0, 0, 1], [1, 1, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0], [0, 0, 1, 1], [1, 1, 0, 1]]

           As You Like It  Twelfth Night  Julius Caesar  Henry V
battle                 1              0              3        4
comedy                 0              1              0        0
england                0              0              0        1
fool                   1              1              1        0
forest                 1              0              0        0
good                   1              1              1        1
identity               0              1              0        0
king                   0              0              0        1
love                   1              1              0        0
magic 

## TF-IDF


In [58]:
def compute_tf_idf(term_doc_matrix, verbose=True):
  """Compute Term Frequency x Inverse Document Frequency.

  tf  = 1 + log10(count) where count > 0, else 0. Raw counts are compressed
        with log10 because 100 occurrences are not 100x more important than 1.
  idf = log10(n_docs / doc_freq), where doc_freq is the number of documents
        containing the term. A term present in no document gets idf = 0.

  Args:
    term_doc_matrix: pandas.DataFrame of counts, rows = terms, columns = documents.
    verbose: When True (the default), print every intermediate array.

  Returns:
    pandas.DataFrame of tf * idf with the same index and columns as the input.
  """
  counts = term_doc_matrix.values  # plain ndarray so boolean masks apply directly
  n_docs = counts.shape[1]
  present = counts > 0

  if verbose:
    print(np.where(present, 1, 0))

  # Term freq: take the log only where the count is positive, so no epsilon is
  # needed and a count of 1 maps to exactly 1.0.
  tf = np.zeros(counts.shape, dtype=float)
  tf[present] = 1 + np.log10(counts[present])
  if verbose:
    print("\nTerm Freq:\n", tf, "\n")

  # Doc freq (in how many docs each word appears)
  doc_freq = present.sum(axis=1)
  if verbose:
    print("\nDoc Freq:\n", doc_freq, "\n")

  # Inverse doc freq. Rows with doc_freq == 0 are skipped: dividing by zero
  # would give idf = inf and then 0 * inf = NaN in the final product.
  seen = doc_freq > 0
  ratio = np.full(doc_freq.shape, np.inf)
  ratio[seen] = n_docs / doc_freq[seen]
  idf = np.zeros(doc_freq.shape, dtype=float)
  idf[seen] = np.log10(ratio[seen])
  if verbose:
    print("\nInverse Doc Freq without log(10):\n", ratio, "\n")
    print("\nInverse Doc Freq (with log(10)):\n", idf, "\n")

  # TF-IDF: reshape idf from (n,) to a column vector (n, 1) so it broadcasts
  # across the document columns of tf.
  idf = idf[:, np.newaxis]
  if verbose:
    print("\nBroadcasted Inverse Doc Freq to dimensionality (n,1):\n", idf, "\n")

  tf_idf = tf * idf
  if verbose:
    print("\nTF-IDF:\n", tf_idf, "\n")

  return pd.DataFrame(tf_idf, index=term_doc_matrix.index, columns=term_doc_matrix.columns)

# TF-IDF weights for the example plays; the returned frame is the cell's output.
compute_tf_idf(term_doc_matrix=term_doc_matrix)

[[1 0 1 1]
 [0 1 0 0]
 [0 0 0 1]
 [1 1 1 0]
 [1 0 0 0]
 [1 1 1 1]
 [0 1 0 0]
 [0 0 0 1]
 [1 1 0 0]
 [1 0 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 1 0]
 [0 0 1 1]
 [1 1 0 1]]

Term Freq:
 [[1.         0.         1.47712125 1.60205999]
 [0.         1.         0.         0.        ]
 [0.         0.         0.         1.        ]
 [1.         1.         1.         0.        ]
 [1.         0.         0.         0.        ]
 [1.         1.         1.         1.        ]
 [0.         1.         0.         0.        ]
 [0.         0.         0.         1.        ]
 [1.         1.         0.         0.        ]
 [1.         0.         0.         0.        ]
 [0.         1.         0.         0.        ]
 [0.         0.         1.         0.        ]
 [0.         0.         1.         0.        ]
 [0.         0.         1.         1.        ]
 [1.         1.         0.         1.        ]] 


Doc Freq:
 [3 1 1 3 1 4 1 1 2 1 1 1 1 2 3] 


Inverse Doc Freq without log(10):
 [1.33333333 4.         4.      

Unnamed: 0,As You Like It,Twelfth Night,Julius Caesar,Henry V
battle,0.124939,0.0,0.18455,0.200159
comedy,0.0,0.60206,0.0,0.0
england,0.0,0.0,0.0,0.60206
fool,0.124939,0.124939,0.124939,0.0
forest,0.60206,0.0,0.0,0.0
good,0.0,0.0,0.0,0.0
identity,0.0,0.60206,0.0,0.0
king,0.0,0.0,0.0,0.60206
love,0.30103,0.30103,0.0,0.0
magic,0.60206,0.0,0.0,0.0


## Word-Word Co-occurrence (Association)


In [136]:
def create_word_cooccurrence_matrix(documents, window_size=2, verbose=True):
  """Count how often each word appears within a context window of another.

  For every token, the tokens up to `window_size` positions to its left and
  right (within the same document) are counted as its context. A token is
  never paired with its own position, but repeated words can pair with each
  other.

  Args:
    documents: Mapping of document name -> text. Text is tokenised by
      splitting on whitespace.
    window_size: Number of positions on each side of the target word.
    verbose: When True (the default), print the window bounds and the running
      counts for every token, plus the final vocabulary.

  Returns:
    pandas.DataFrame indexed by the sorted vocabulary on both axes, where
    cell [target, context] is the co-occurrence count.
  """
  cooccurrence = defaultdict(lambda: defaultdict(int))

  # Tokenise each document once and reuse the tokens for both counting and the
  # vocabulary, so the two can never disagree on what a word is.
  tokenised_docs = [doc.split() for doc in documents.values()]

  for words in tokenised_docs:
    for i, target_word in enumerate(words):
      start = max(0, i - window_size)             # clamp the window at the first word
      end = min(len(words), i + window_size + 1)  # clamp the window at the last word
      if verbose:
        print("Word ", target_word, "has a window with indexes range ", start, "-", end, "\n")

      for j in range(start, end):
        if i != j:  # Not counting the word with itself
          cooccurrence[target_word][words[j]] += 1

      if verbose:
        print("cooccurrence: ", cooccurrence)

  all_words = sorted({word for words in tokenised_docs for word in words})
  if verbose:
    print("\nall_words: ", all_words)

  # Read with .get so that looking up a missing pair does not insert new keys
  # into the defaultdicts.
  matrix = []
  for target_word in all_words:
    neighbours = cooccurrence.get(target_word, {})
    matrix.append([neighbours.get(ctxt_word, 0) for ctxt_word in all_words])

  return pd.DataFrame(matrix, index = all_words, columns = all_words)

# Symmetric window of two words on each side; the frame is the cell's output.
cooc_matrix = create_word_cooccurrence_matrix(documents=documents, window_size=2)
cooc_matrix

Word  battle has a window with indexes range  0 - 3 

cooccurrence:  defaultdict(<function create_word_cooccurrence_matrix.<locals>.<lambda> at 0x79faf6e9b1a0>, {'battle': defaultdict(<class 'int'>, {'good': 1, 'fool': 1})})
Word  good has a window with indexes range  0 - 4 

cooccurrence:  defaultdict(<function create_word_cooccurrence_matrix.<locals>.<lambda> at 0x79faf6e9b1a0>, {'battle': defaultdict(<class 'int'>, {'good': 1, 'fool': 1}), 'good': defaultdict(<class 'int'>, {'battle': 1, 'fool': 1, 'wit': 1})})
Word  fool has a window with indexes range  0 - 5 

cooccurrence:  defaultdict(<function create_word_cooccurrence_matrix.<locals>.<lambda> at 0x79faf6e9b1a0>, {'battle': defaultdict(<class 'int'>, {'good': 1, 'fool': 1}), 'good': defaultdict(<class 'int'>, {'battle': 1, 'fool': 1, 'wit': 1}), 'fool': defaultdict(<class 'int'>, {'battle': 1, 'good': 1, 'wit': 1, 'love': 1})})
Word  wit has a window with indexes range  1 - 6 

cooccurrence:  defaultdict(<function create_word_co

Unnamed: 0,battle,comedy,england,fool,forest,good,identity,king,love,magic,mistaken,politics,rome,war,wit
battle,16,0,0,2,0,5,0,0,0,0,0,0,0,0,1
comedy,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1
england,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
fool,2,0,0,0,0,3,0,0,2,0,0,0,1,1,2
forest,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1
good,5,0,0,3,0,0,0,0,0,0,0,0,0,2,3
identity,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
king,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1
love,0,1,0,2,1,0,0,0,0,1,1,0,0,0,2
magic,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0


## PPMI (Positive Pointwise Mutual Information)

In [162]:
def compute_ppmi(cooc_matrix, alpha = 0.75, verbose=True):
  """
  Compute the PPMI matrix from a co-occurrence matrix.

  PMI(w, c) = log2( P(w, c) / (P(w) * P_alpha(c)) ), and PPMI = max(0, PMI).
  P_alpha(c) is the context probability raised to `alpha` and re-normalised.
  alpha = 0.75: Levy et al. (2015) found that a setting of alpha = 0.75 improved
  performance of embeddings on a wide range of tasks. It increases the
  probability assigned to rare contexts, and hence lowers their PMI
  (P_alpha(c) > P(c) when c is rare).

  Args:
    cooc_matrix: pandas.DataFrame of co-occurrence counts (rows = target
      words, columns = context words).
    alpha: Exponent used to smooth the context distribution.
    verbose: When True (the default), print the joint probabilities, the
      per-word marginals and the raw PMI matrix.

  Returns:
    pandas.DataFrame of PPMI values with the same index and columns as the
    input. An input with no co-occurrences at all yields all zeros.
  """
  counts = cooc_matrix.values.astype(float)
  total = np.sum(counts)

  # No co-occurrences at all (all-zero or empty input): every probability
  # would be 0/0 = NaN, so return an all-zero PPMI matrix instead.
  if total == 0:
    return pd.DataFrame(np.zeros_like(counts), index=cooc_matrix.index, columns=cooc_matrix.columns)

  # Joint probabilities P(w,c): element-wise division by a scalar
  p_wc = counts / total
  if verbose:
    print("\n p_wc: \n", p_wc, "\n")

  # Marginal probabilities (row sums for targets, column sums for contexts;
  # the two coincide when the co-occurrence matrix is symmetric)
  p_w = np.sum(counts, axis=1) / total
  p_c = np.sum(counts, axis=0) / total

  # Smooth the context distribution and re-normalise so it sums to 1
  p_c_alpha = np.power(p_c, alpha)
  p_c_alpha = p_c_alpha / np.sum(p_c_alpha)

  if verbose:
    for word, prob_w, prob_c_alpha in zip(cooc_matrix.index, p_w, p_c_alpha):
      print(word, prob_w, round(prob_c_alpha, 4))

  # PMI. The small epsilon keeps both the ratio and the log finite for pairs
  # that never co-occur; those cells come out strongly negative and are
  # clipped to zero below.
  epsilon = 1e-10

  # p_w becomes a column (n,1) and p_c_alpha a row (1,n); their product
  # broadcasts to the full (n,n) matrix of expected probabilities.
  expected = p_w[:, np.newaxis] * p_c_alpha[np.newaxis, :]
  pmi = np.log2((p_wc + epsilon) / (expected + epsilon))
  if verbose:
    print("\n PMI: \n", pmi, "\n")

  # Convert to PPMI (turn negatives to zero)
  ppmi = np.maximum(0, pmi)
  return pd.DataFrame(ppmi, index=cooc_matrix.index, columns=cooc_matrix.columns)

# PPMI with the context-smoothing exponent 0.75; the frame is the cell's output.
compute_ppmi(cooc_matrix=cooc_matrix, alpha=0.75)


 p_wc: 
 [[0.16 0.   0.   0.02 0.   0.05 0.   0.   0.   0.   0.   0.   0.   0.
  0.01]
 [0.   0.   0.   0.   0.   0.   0.01 0.   0.01 0.   0.01 0.   0.   0.
  0.01]
 [0.   0.   0.   0.   0.   0.   0.   0.01 0.   0.   0.   0.   0.   0.01
  0.  ]
 [0.02 0.   0.   0.   0.   0.03 0.   0.   0.02 0.   0.   0.   0.01 0.01
  0.02]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.01 0.01 0.   0.   0.   0.
  0.01]
 [0.05 0.   0.   0.03 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.02
  0.03]
 [0.   0.01 0.   0.   0.   0.   0.   0.   0.   0.   0.01 0.   0.   0.
  0.  ]
 [0.   0.   0.01 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.01
  0.01]
 [0.   0.01 0.   0.02 0.01 0.   0.   0.   0.   0.01 0.01 0.   0.   0.
  0.02]
 [0.   0.   0.   0.   0.01 0.   0.   0.   0.01 0.   0.   0.   0.   0.
  0.  ]
 [0.   0.01 0.   0.   0.   0.   0.01 0.   0.01 0.   0.   0.   0.   0.
  0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.01 0.01
  0.  ]
 [0.   0.   0.   0.01 0.   0.   0.   0.   0.

Unnamed: 0,battle,comedy,england,fool,forest,good,identity,king,love,magic,mistaken,politics,rome,war,wit
battle,1.843331,0.0,0.0,0.0,0.0,0.828652,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
comedy,0.0,0.0,0.0,0.0,0.0,0.0,3.117016,0.0,1.617016,0.0,2.678294,0.0,0.0,0.0,1.178294
england,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.678294,0.0,0.0,0.0,0.0,0.0,2.617016,0.0
fool,0.0,0.0,0.0,0.0,0.0,1.217217,0.0,0.0,1.157584,0.0,0.0,0.0,1.218862,0.157584,0.718862
forest,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.032053,3.532053,0.0,0.0,0.0,0.0,1.593331
good,1.049782,0.0,0.0,1.156965,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.916576,1.062817
identity,0.0,3.367016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.678294,0.0,0.0,0.0,0.0
king,0.0,0.0,3.532053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.032053,1.593331
love,0.0,1.367016,0.0,1.272442,1.678294,0.0,0.0,0.0,0.0,2.117016,1.678294,0.0,0.0,0.0,1.178294
magic,0.0,0.0,0.0,0.0,3.678294,0.0,0.0,0.0,2.617016,0.0,0.0,0.0,0.0,0.0,0.0
