<a href="https://colab.research.google.com/github/YUVARAJC14/DSA0328-Natural-language-Processing/blob/main/Q23-Evaluate-the-Coherences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

def calculate_text_coherence(text):
  """
  Calculates the coherence of a given text using cosine similarity between word frequencies.

  Args:
    text: The input text string.

  Returns:
    A float representing the coherence score (0 to 1), with higher values indicating higher coherence.
  """

  # Tokenize the text and remove stop words
  tokens = word_tokenize(text.lower())
  stop_words = set(stopwords.words('english'))
  filtered_tokens = [w for w in tokens if not w in stop_words]

  # Calculate word frequencies
  freq_dist = FreqDist(filtered_tokens)

  # Create a word-frequency vector
  word_vector = np.array([freq_dist[word] for word in freq_dist])

  # Calculate cosine similarity between word frequencies
  # (This measures how similar the frequency distribution is to itself,
  #  effectively measuring the consistency of word usage)
  coherence_score = cosine_similarity(word_vector.reshape(1, -1), word_vector.reshape(1, -1))[0][0]

  return coherence_score


# Example usage
text = "The cat sat on the mat. The cat was black and fluffy. The mat was blue."
coherence = calculate_text_coherence(text)
print(f"Coherence score: {coherence}")

Coherence score: 1.0000000000000002


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
