# Tokenisation

In [None]:
import re

def segment_text(input_text):
    """Split ``input_text`` into a list of tokens.

    The alternatives of the verbose regex are tried in order:
    dotted abbreviations, hyphenated compounds, a word followed by an
    English clitic (n't, 's, 'll, 'em, 've, 're, 'd), plain words, and a
    fixed set of punctuation characters.  A word+clitic match contributes
    two tokens (stem, clitic); every other match contributes one.
    Characters matched by no alternative (e.g. whitespace) are skipped.
    """
    token_pattern = r"""
    (?:[A-Za-z]\.){2,}[A-Z]
    | \w+(?:-\w+)+
    | \b([A-Za-z]+)(n't|'s|'ll|'em|'ve|'re|'d)\b
    | \b\w+\b
    | [.,!?;"()\[\]{}<>]
    """
    tokenizer = re.compile(token_pattern, re.VERBOSE)

    segments = []
    for hit in tokenizer.finditer(input_text):
        # The two capture groups are only populated by the clitic branch.
        stem, clitic = hit.groups()
        if stem:
            segments += [stem, clitic]
        else:
            segments.append(hit.group(0))

    return segments

# Demo: tokenise a single contraction and show the distinct tokens it yields.
text = "isn't"
unique_segments = set(segment_text(text))  # a set, so the printed order is arbitrary
print(unique_segments)

{"n't", 'is'}


# Is Plural - FST

In [None]:
def is_plural_noun_accepted_fsa(word):
    """Return True if ``word`` is accepted by the y-plural automaton.

    The word is scanned right to left.  It is accepted only when it ends in
    a vowel + "ys" ("boys") or in a non-vowel letter + "ies" ("ponies").
    Characters further left are unconstrained.

    States:
        S1  final 's' consumed (start)
        S2  saw 'y'  -> needs a vowel next        -> S5 (accepting)
        S3  saw 'e'  -> needs 'i' next            -> S4
        S4  saw 'ie' -> needs a non-vowel letter  -> S6 (accepting)

    Words that run out of characters before reaching S5/S6 (e.g. "ys",
    "es", "ies") are rejected, because S2, S3 and S4 are not accepting.
    """
    if len(word) < 2 or word[-1] != 's':
        return False

    state = 'S1'

    # Walk the characters before the final 's', from right to left.
    for char in reversed(word[:-1]):
        if state == 'S1':
            if char == 'y':
                state = 'S2'
            elif char == 'e':
                state = 'S3'
            else:
                return False
        elif state == 'S2':
            if char in 'aeiou':
                state = 'S5'
            else:
                return False
        elif state == 'S3':
            if char == 'i':
                state = 'S4'
            else:
                return False
        elif state == 'S4':
            if char.isalpha() and char not in 'aeiou':
                state = 'S6'
            else:
                return False
        else:
            # S5 / S6 accept any remaining prefix, so stop scanning.
            break

    # Accept only if the required suffix was seen in full.
    return state in ('S5', 'S6')

# Demo inputs: "-ys"/"-ies" plurals plus malformed forms ('boies', 'toies',
# 'ponys') that the automaton rejects (see the recorded output below).
test_words = ['boys', 'toys', 'ponies', 'skies', 'puppies', 'boies', 'toies', 'ponys', 'carries', 'daisies']
results = {word: is_plural_noun_accepted_fsa(word) for word in test_words}

print(results)

{'boys': True, 'toys': True, 'ponies': True, 'skies': True, 'puppies': True, 'boies': False, 'toies': False, 'ponys': False, 'carries': True, 'daisies': True}


# Pluralising

In [None]:
def pluralize_word_fst(word):
    """Realise the plural morpheme of an intermediate form such as "fox^s#".

    "^" marks the morpheme boundary and "#" the word boundary.  When the
    word ends in "^s#", the marker is replaced by "es" if the stem ends in
    x, s or z (e-insertion) and by "s" otherwise.  Words without the
    trailing marker are returned unchanged.

    Only the trailing marker is rewritten; any "^s#" sequence occurring
    earlier in the string is left as is.
    """
    marker = "^s#"
    if not word.endswith(marker):
        return word

    stem = word[:-len(marker)]
    # e-insertion applies after sibilant-final stems.
    suffix = "es" if stem.endswith(("x", "s", "z")) else "s"
    return stem + suffix

# Demo: stems ending in x/s/z take "es"; the others take plain "s".
test_cases = ["fox^s#", "boy^s#", "bus^s#", "quiz^s#", "dog^s#"]
results = {word: pluralize_word_fst(word) for word in test_cases}
print(results)

{'fox^s#': 'foxes', 'boy^s#': 'boys', 'bus^s#': 'buses', 'quiz^s#': 'quizes', 'dog^s#': 'dogs'}


# Minimum Edit Distance

In [None]:
class EditDistance:
    """Minimum edit distance between two strings with configurable costs.

    With the default costs (insert 1, delete 1, replace 2) a substitution
    costs the same as a deletion followed by an insertion.
    """

    def __init__(self, insert_cost=1, delete_cost=1, replace_cost=2):
        """Store the per-operation costs.

        Args:
            insert_cost: cost of inserting one target character.
            delete_cost: cost of deleting one source character.
            replace_cost: cost of substituting one character for another.
        """
        self.INSERT_COST = insert_cost
        self.DELETE_COST = delete_cost
        self.REPLACE_COST = replace_cost

    def minimum_edit_distance(self, source: str, target: str):
        """
        Calculate minimum edit distance between source and target strings.

        Returns a tuple ``(distance, edit_sequence)`` where ``edit_sequence``
        is a list of human-readable operations ("Copy 'c'", "Insert 'x'",
        ...) in left-to-right order.

        Side effect: prints the DP table after each first-column
        initialisation step and once more after the table is filled.
        """
        m, n = len(source), len(target)

        # dp[i][j]: cost of turning source[:i] into target[:j].
        dp = [[0] * (n + 1) for _ in range(m + 1)]
        # operations[i][j]: (name, prev_i, prev_j) back-pointer for dp[i][j].
        operations = [[None] * (n + 1) for _ in range(m + 1)]

        # First column: delete every source character.
        for i in range(m + 1):
            dp[i][0] = i * self.DELETE_COST
            print(dp)
            if i > 0:
                operations[i][0] = ('DELETE', i - 1, 0)

        # First row: insert every target character.
        for j in range(n + 1):
            dp[0][j] = j * self.INSERT_COST
            if j > 0:
                operations[0][j] = ('INSERT', 0, j - 1)

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if source[i - 1] == target[j - 1]:
                    # Matching characters are copied at no cost.
                    dp[i][j] = dp[i - 1][j - 1]
                    operations[i][j] = ('COPY', i - 1, j - 1)
                    continue

                replace = dp[i - 1][j - 1] + self.REPLACE_COST
                delete = dp[i - 1][j] + self.DELETE_COST
                insert = dp[i][j - 1] + self.INSERT_COST

                min_cost = min(replace, delete, insert)
                dp[i][j] = min_cost

                # Ties are broken in the order replace, delete, insert.
                if min_cost == replace:
                    operations[i][j] = ('REPLACE', i - 1, j - 1)
                elif min_cost == delete:
                    operations[i][j] = ('DELETE', i - 1, j)
                else:
                    operations[i][j] = ('INSERT', i, j - 1)

        print(dp)

        # Follow the back-pointers from (m, n) to (0, 0).
        edit_sequence = []
        i, j = m, n

        while i > 0 or j > 0:
            operation, prev_i, prev_j = operations[i][j]

            if operation == 'COPY':
                edit_sequence.append(f"Copy '{source[i-1]}'")
            elif operation == 'REPLACE':
                edit_sequence.append(f"Replace '{source[i-1]}' with '{target[j-1]}'")
            elif operation == 'DELETE':
                edit_sequence.append(f"Delete '{source[i-1]}'")
            else:
                edit_sequence.append(f"Insert '{target[j-1]}'")

            i, j = prev_i, prev_j

        # The walk collected operations right-to-left.
        edit_sequence.reverse()

        return dp[m][n], edit_sequence

    def print_detailed_output(self, source: str, target: str):
        """Print detailed output including the edit distance and operations"""
        distance, operations = self.minimum_edit_distance(source, target)

        print(f"Source string: {source}")
        print(f"Target string: {target}")
        print(f"Minimum Edit Distance: {distance}")
        print("\nEdit Operations:")
        for i, op in enumerate(operations, 1):
            print(f"{i}. {op}")


def test_edit_distance():
    """Run the edit-distance demo on the pair "cat" -> "cut"."""
    EditDistance().print_detailed_output("cat", "cut")

    # Additional pairs kept for manual experiments (currently disabled):
    #   ("kitten", "sitting"), ("sunday", "saturday"),
    #   ("intention", "execution"), ("cat", "cut"), ("", "hello"),
    #   ("algorithm", "logarithm"), ("hello", "hello")
    # To run them, call ed.print_detailed_output(source, target) for each
    # pair, framed by "-" * 50 separator lines.


if __name__ == "__main__":
    test_edit_distance()

[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]
[[0, 0, 0, 0], [1, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]
[[0, 0, 0, 0], [1, 0, 0, 0], [2, 0, 0, 0], [0, 0, 0, 0]]
[[0, 0, 0, 0], [1, 0, 0, 0], [2, 0, 0, 0], [3, 0, 0, 0]]
[[0, 0, 0, 0], [1, 0, 0, 0], [2, 0, 0, 0], [3, 0, 0, 0]]
[[0, 1, 2, 3], [1, 0, 1, 2], [2, 1, 2, 3], [3, 2, 3, 2]]
Source string: cat
Target string: cut
Minimum Edit Distance: 2

Edit Operations:
1. Copy 'c'
2. Replace 'a' with 'u'
3. Copy 't'


# Spell Checker - Needs Improvement

# Spell Checker

In [None]:
import nltk
from nltk.corpus import words, brown, gutenberg
from nltk.tokenize import word_tokenize
from collections import defaultdict, Counter
import string
import re
import numpy as np
from sklearn.model_selection import train_test_split

# Fetch the NLTK resources used by the spell checker: the `words` list
# (vocabulary), the `brown` and `gutenberg` corpora (n-gram training) and
# `punkt` (tokenizer models; presumably what word_tokenize relies on).
nltk.download('words')
nltk.download('brown')
nltk.download('gutenberg')
nltk.download('punkt')

class SpellChecker:
    """Context-sensitive spelling corrector.

    Candidates within one or two edits of an unknown word are ranked with
    an interpolated, add-one-smoothed trigram/bigram/unigram model trained
    on the Brown and Gutenberg corpora, plus a word-frequency term and a
    small length heuristic.
    """

    def __init__(self):
        """Load the vocabulary and train the n-gram counts (reads NLTK corpora)."""
        self.vocabulary = set(words.words())
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        self.trigram_counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
        self.unigram_counts = defaultdict(int)
        self.word_freq = defaultdict(int)
        # Cached sum of unigram_counts; refreshed by train_model().
        self._unigram_total = None

        # Forms used by the demo sentences that are added to the word list.
        common_words = {'misspelled', 'jumps', 'words', 'over'}
        self.vocabulary.update(common_words)

        self.train_model()

    def train_model(self):
        """Accumulate unigram, bigram and trigram counts from both corpora."""
        for corpus in [brown, gutenberg]:
            for sentence in corpus.sents():
                # Two start markers so the first real word has a trigram context.
                tokens = ['<s>', '<s>'] + [word.lower() for word in sentence] + ['</s>']

                for token in tokens:
                    self.word_freq[token.lower()] += 1

                for i in range(len(tokens) - 2):
                    self.bigram_counts[tokens[i + 1]][tokens[i + 2]] += 1
                    self.unigram_counts[tokens[i + 1]] += 1

                    self.trigram_counts[tokens[i]][tokens[i + 1]][tokens[i + 2]] += 1

                # The loop above never reaches the final token as a unigram.
                self.unigram_counts[tokens[-1]] += 1

        self._unigram_total = sum(self.unigram_counts.values())

    def _total_unigram_count(self):
        """Return the cached sum of all unigram counts, computing it on first use."""
        total = getattr(self, '_unigram_total', None)
        if total is None:
            total = self._unigram_total = sum(self.unigram_counts.values())
        return total

    def get_edits1(self, word):
        """Generate all strings that are one edit distance away from the input word"""
        letters = string.ascii_lowercase
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]

        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]

        return set(deletes + transposes + replaces + inserts)

    def get_edits2(self, word):
        """Generate all strings that are two edits away from the input word"""
        return set(e2 for e1 in self.get_edits1(word) for e2 in self.get_edits1(e1))

    def calculate_ngram_probability(self, prev_tokens, token):
        """Calculate probability using interpolated trigram/bigram/unigram models.

        ``prev_tokens`` holds up to two preceding tokens (oldest first).
        Each component uses add-one smoothing over the vocabulary size; a
        component whose context is unavailable contributes 0.

        Lookups use ``.get`` so that querying unseen n-grams does not insert
        empty entries into the defaultdict count tables.
        """
        lambda1, lambda2, lambda3 = 0.5, 0.3, 0.2
        vocab_size = len(self.vocabulary)

        trigram_prob = 0
        if len(prev_tokens) >= 2:
            first, second = prev_tokens[0], prev_tokens[1]
            trigram_count = self.trigram_counts.get(first, {}).get(second, {}).get(token, 0)
            context_count = self.bigram_counts.get(first, {}).get(second, 0)
            trigram_prob = (trigram_count + 1) / (context_count + vocab_size)

        bigram_prob = 0
        if len(prev_tokens) >= 1:
            last = prev_tokens[-1]
            bigram_count = self.bigram_counts.get(last, {}).get(token, 0)
            bigram_prob = (bigram_count + 1) / (self.unigram_counts.get(last, 0) + vocab_size)

        unigram_prob = (self.unigram_counts.get(token, 0) + 1) / (self._total_unigram_count() + vocab_size)

        return lambda1 * trigram_prob + lambda2 * bigram_prob + lambda3 * unigram_prob

    def score_candidate(self, candidate, prev_words, next_words):
        """Score a candidate word based on n-gram probability and word frequency"""
        prev_tokens = ['<s>'] if not prev_words else prev_words[-2:]
        prob_score = np.log(self.calculate_ngram_probability(prev_tokens, candidate))

        if next_words:
            # Also reward candidates that make the following word likely.
            next_prob = np.log(self.calculate_ngram_probability([prev_tokens[-1], candidate], next_words[0]))
            prob_score += next_prob

        freq_score = np.log(self.word_freq.get(candidate.lower(), 0) + 1)

        if prev_words and next_words:
            # Heuristic: mildly prefer candidates close in length to their neighbours.
            avg_len = (len(prev_words[-1]) + len(next_words[0])) / 2
            len_score = -abs(len(candidate) - avg_len) / 10
        else:
            len_score = 0

        return prob_score + 0.5 * freq_score + len_score

    def correct_word(self, word, prev_words, next_words):
        """Correct a single word using context.

        Known words, and words with no in-vocabulary candidate within two
        edits, are returned unchanged.  A leading capital is preserved.
        """
        if word.lower() in self.vocabulary:
            return word

        # Generate candidates
        candidates = self.get_edits1(word.lower())
        valid_candidates = {c for c in candidates if c in self.vocabulary}

        if not valid_candidates:
            # Fall back to distance two only when distance one yields nothing.
            candidates2 = self.get_edits2(word.lower())
            valid_candidates = {c for c in candidates2 if c in self.vocabulary}

        if not valid_candidates:
            return word

        best_candidate = max(valid_candidates,
                             key=lambda x: self.score_candidate(x, prev_words, next_words))

        if word[0].isupper():
            best_candidate = best_candidate.capitalize()

        return best_candidate

    def correct_text(self, text):
        """Correct every alphabetic token of ``text`` and re-join with spaces."""
        tokens = word_tokenize(text)
        corrected_words = []

        for i, word in enumerate(tokens):
            if word.isalpha():
                # Up to two lower-cased neighbours on each side as context.
                prev_words = [w.lower() for w in tokens[max(0, i - 2):i]]
                next_words = [w.lower() for w in tokens[i + 1:min(len(tokens), i + 3)]]

                corrected_word = self.correct_word(word, prev_words, next_words)
                corrected_words.append(corrected_word)
            else:
                corrected_words.append(word)

        return " ".join(corrected_words)

def test_spell_checker():
    """Correct a handful of misspelled sentences and print before/after pairs."""
    spell_checker = SpellChecker()
    separator = "-" * 50

    test_cases = (
        "This is a test sentense with misspeled words",
        "I recieved your mesage yestarday",
        "The quick brwn fox jumps ovr the lasy dog",
        "She was writting a letter to her frend",
    )

    print("Spell Checker Test Results:")
    print(separator)
    for text in test_cases:
        # Correct first so nothing is printed for a sentence that fails.
        corrected_text = spell_checker.correct_text(text)
        print(f"Original:  {text}")
        print(f"Corrected: {corrected_text}")
        print(separator)


if __name__ == "__main__":
    print("Testing Improved Spell Checker:")
    test_spell_checker()

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Testing Improved Spell Checker:
Spell Checker Test Results:
--------------------------------------------------
Original:  This is a test sentense with misspeled words
Corrected: This is a test sentence with misspelled words
--------------------------------------------------
Original:  I recieved your mesage yestarday
Corrected: I received your message yesterday
--------------------------------------------------
Original:  The quick brwn fox jumps ovr the lasy dog
Corrected: The quick brown fox jumps or the last dog
--------------------------------------------------
Original:  She was writting a letter to her frend
Corrected: She was writing a letter to her friend
--------------------------------------------------



# Sentiment Analysis

In [None]:
from nltk.corpus import movie_reviews

In [None]:
import numpy as np

In [None]:
import re
import string
import numpy as np
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
from collections import defaultdict
from sklearn.model_selection import train_test_split

class NaiveBayesSentimentClassifier:
    """Multinomial Naive Bayes over the labels 'pos' and 'neg' with add-k smoothing."""

    # Translation table that deletes every ASCII punctuation character.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, k=1.0):
        self.k = k  # Smoothing parameter
        self.word_counts = {'pos': defaultdict(int), 'neg': defaultdict(int)}
        self.class_counts = {'pos': 0, 'neg': 0}
        self.vocabulary = set()

    def preprocess(self, text):
        """Lower-case ``text``, strip ASCII punctuation and tokenize it.

        Punctuation is removed with ``str.translate`` rather than an
        unescaped regex character class: inside ``[...]`` the sequence
        ``\\]`` from ``string.punctuation`` is read as an escaped bracket,
        so backslashes were never stripped.
        """
        text = text.lower()
        text = text.translate(self._PUNCTUATION_TABLE)
        return word_tokenize(text)

    def train(self, X_train, y_train):
        """Train the classifier on the given data.

        Counts are accumulated, so calling ``train`` repeatedly adds to the
        existing statistics.
        """
        for text, label in zip(X_train, y_train):
            tokens = self.preprocess(text)
            self.class_counts[label] += 1

            for token in tokens:
                self.word_counts[label][token] += 1
                self.vocabulary.add(token)

    def calculate_probability(self, text, label):
        """Return log P(label) + sum of log P(word | label) for ``text``.

        Word likelihoods use add-k smoothing over the training vocabulary.
        """
        tokens = self.preprocess(text)
        log_prob = np.log(self.class_counts[label] / sum(self.class_counts.values()))

        vocab_size = len(self.vocabulary)
        total_words = sum(self.word_counts[label].values())
        denominator = total_words + self.k * vocab_size

        for token in tokens:
            # .get avoids inserting unseen words into the count table.
            count = self.word_counts[label].get(token, 0)
            log_prob += np.log((count + self.k) / denominator)

        return log_prob

    def predict(self, text):
        """Predict the sentiment of the given text ('neg' wins ties)."""
        pos_prob = self.calculate_probability(text, 'pos')
        neg_prob = self.calculate_probability(text, 'neg')

        return 'pos' if pos_prob > neg_prob else 'neg'

    def evaluate(self, X_test, y_test):
        """Return the accuracy on the test data, or 0.0 for an empty test set."""
        total = len(X_test)
        if total == 0:
            return 0.0

        correct = sum(
            1 for text, true_label in zip(X_test, y_test)
            if self.predict(text) == true_label
        )
        return correct / total

def test_sentiment_classifier():
    """Train and score the classifier on movie_reviews for several k values."""
    # One (token list, label) pair per review file.
    documents = []
    for category in movie_reviews.categories():
        for fileid in movie_reviews.fileids(category):
            documents.append((list(movie_reviews.words(fileid)), category))

    np.random.shuffle(documents)

    texts = []
    labels = []
    for doc, category in documents:
        texts.append(' '.join(doc))
        labels.append(category)

    X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

    for k in [0.25, 0.75, 1.0]:
        classifier = NaiveBayesSentimentClassifier(k=k)
        classifier.train(X_train, y_train)
        accuracy = classifier.evaluate(X_test, y_test)
        print(f"Accuracy with k={k}: {accuracy:.4f}")

test_sentiment_classifier()

Accuracy with k=0.25: 0.7975
Accuracy with k=0.75: 0.8000
Accuracy with k=1.0: 0.8050


In [None]:
import numpy as np

# Transition probabilities (a_ij): a_ij[prev][next] is P(next | prev).
# 'START' only appears as a source state and 'STOP' only as a destination.
a_ij = {
    'START': {'NN': 0.5, 'VB': 0.25, 'JJ': 0.25, 'RB': 0},
    'NN': {'STOP': 0.25, 'NN': 0.25, 'VB': 0.5, 'JJ': 0, 'RB': 0},
    'VB': {'STOP': 0.25, 'NN': 0.25, 'VB': 0, 'JJ': 0.25, 'RB': 0.25},
    'JJ': {'STOP': 0, 'NN': 0.75, 'VB': 0, 'JJ': 0.25, 'RB': 0},
    'RB': {'STOP': 0.5, 'NN': 0.25, 'VB': 0, 'JJ': 0.25, 'RB': 0},
}

# Emission probabilities (b_ik): b_ik[tag][word] is P(word | tag).
b_ik = {
    'NN': {'time': 0.1, 'flies': 0.01, 'fast': 0.01},
    'VB': {'time': 0.01, 'flies': 0.1, 'fast': 0.01},
    'JJ': {'time': 0, 'flies': 0, 'fast': 0.1},
    'RB': {'time': 0, 'flies': 0, 'fast': 0.1},
}

# POS Tags and Words
states = ['NN', 'VB', 'JJ', 'RB']
sentence = ['time', 'flies', 'fast']

# Viterbi Algorithm Implementation
def viterbi(sentence, states, start_prob, transition_prob, emission_prob):
    """Return the most probable tag sequence for ``sentence`` and its probability.

    ``start_prob[tag]`` is the probability of starting in ``tag``,
    ``transition_prob[prev][next]`` and ``emission_prob[tag][word]`` are
    looked up with a default of 0, and the final score includes the
    transition from the last tag to 'STOP'.
    """
    n_steps = len(sentence)
    n_states = len(states)

    # trellis[t, s]: best score of any path ending in state s at step t.
    trellis = np.zeros((n_steps, n_states))
    # previous[t, s]: index of the predecessor state on that best path.
    previous = np.zeros((n_steps, n_states), dtype=int)

    # Step 0: start probability times emission of the first word.
    for col, tag in enumerate(states):
        trellis[0, col] = start_prob[tag] * emission_prob[tag].get(sentence[0], 0)

    # Steps 1..n-1: extend the best path into every state.
    for step in range(1, n_steps):
        for col, tag in enumerate(states):
            best_score = 0
            best_prev = 0
            for row, prev_tag in enumerate(states):
                score = (
                    trellis[step - 1, row]
                    * transition_prob[prev_tag].get(tag, 0)
                    * emission_prob[tag].get(sentence[step], 0)
                )
                if score > best_score:
                    best_score = score
                    best_prev = row
            trellis[step, col] = best_score
            previous[step, col] = best_prev

    # Termination: weight each final state by its transition to STOP.
    final_probs = []
    for col in range(n_states):
        final_probs.append(trellis[n_steps - 1, col] * transition_prob[states[col]].get('STOP', 0))
    current = np.argmax(final_probs)

    # Follow the back-pointers from the last step to the first.
    reversed_path = [current]
    for step in range(n_steps - 1, 0, -1):
        current = previous[step, current]
        reversed_path.append(current)
    reversed_path.reverse()

    best_path_states = [states[index] for index in reversed_path]
    return best_path_states, max(final_probs)

# Start probabilities from START
def get_start_prob(states):
    """Map each state to its transition probability out of 'START' in ``a_ij`` (0 if absent)."""
    probabilities = {}
    for state in states:
        probabilities[state] = a_ij['START'].get(state, 0)
    return probabilities

# Run Viterbi on the example sentence with the tables defined above
start_prob = get_start_prob(states)
most_likely_tags, max_prob = viterbi(sentence, states, start_prob, a_ij, b_ik)

# Output Results
print("Sentence:", sentence)
print("Most Likely POS Tags:", most_likely_tags)
print("Probability of Best Path:", max_prob)

Sentence: ['time', 'flies', 'fast']
Most Likely POS Tags: ['NN', 'VB', 'RB']
Probability of Best Path: 3.125000000000001e-05


In [None]:
from collections import defaultdict
import re

# Function to preprocess text (simple tokenization)
def preprocess(text):
    """Lower-case ``text``, drop everything except a-z and whitespace, and split on whitespace."""
    return re.sub(r'[^a-z\s]', '', text.lower()).split()

# Function to calculate bigram probabilities
def calculate_bigrams(corpus):
    """Estimate unsmoothed bigram probabilities P(w2 | w1) from ``corpus``.

    Each sentence is preprocessed and wrapped in '<s>' ... '</s>'.
    Returns a ``defaultdict(dict)`` with ``probs[w1][w2] = count(w1, w2) / count(w1)``.
    """
    unigram_counts = defaultdict(int)
    bigram_counts = defaultdict(lambda: defaultdict(int))

    for sentence in corpus:
        tokens = ['<s>', *preprocess(sentence), '</s>']
        for current, following in zip(tokens, tokens[1:]):
            unigram_counts[current] += 1
            bigram_counts[current][following] += 1
        # The pairwise walk never counts the closing '</s>' as a unigram.
        unigram_counts[tokens[-1]] += 1

    bigram_probs = defaultdict(dict)
    for w1, followers in bigram_counts.items():
        for w2, pair_count in followers.items():
            bigram_probs[w1][w2] = pair_count / unigram_counts[w1]

    return bigram_probs

# Function to calculate sentence probability using bigram model
def sentence_probability(sentence, bigram_probs):
    """Return the product of bigram probabilities along '<s>' sentence '</s>'.

    A bigram missing from ``bigram_probs`` contributes a factor of 0, so
    any unseen bigram makes the whole sentence probability 0.
    """
    tokens = ['<s>'] + preprocess(sentence) + ['</s>']

    prob = 1.0
    for w1, w2 in zip(tokens, tokens[1:]):
        followers = bigram_probs.get(w1, {})
        prob *= followers[w2] if w2 in followers else 0

    return prob

# Example usage
if __name__ == "__main__":
    # Toy corpus: every sentence starts with "The", so P(the | <s>) is 1.
    corpus = [
        "The cat sat on the mat",
        "The cat ate the mouse",
        "The dog barked loudly"
    ]

    # Calculate bigram probabilities
    bigram_probs = calculate_bigrams(corpus)

    # Print bigram probabilities
    print("Bigram Probabilities:")
    for w1 in bigram_probs:
        for w2 in bigram_probs[w1]:
            print(f"P({w2} | {w1}) = {bigram_probs[w1][w2]:.4f}")

    # Test sentence probability
    # NOTE: no corpus sentence ends in "sat", so the bigram (sat, </s>) is
    # unseen and the unsmoothed product is 0.
    test_sentence = "The cat sat"
    prob = sentence_probability(test_sentence, bigram_probs)
    print(f"\nProbability of sentence '{test_sentence}': {prob:.8f}")

Bigram Probabilities:
P(the | <s>) = 1.0000
P(cat | the) = 0.4000
P(mat | the) = 0.2000
P(mouse | the) = 0.2000
P(dog | the) = 0.2000
P(sat | cat) = 0.5000
P(ate | cat) = 0.5000
P(on | sat) = 1.0000
P(the | on) = 1.0000
P(</s> | mat) = 1.0000
P(the | ate) = 1.0000
P(</s> | mouse) = 1.0000
P(barked | dog) = 1.0000
P(loudly | barked) = 1.0000
P(</s> | loudly) = 1.0000

Probability of sentence 'The cat sat': 0.00000000


In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to compute TF-IDF matrix
def compute_tfidf(documents):
    """Fit a TfidfVectorizer on ``documents``.

    Returns ``(tfidf_matrix, feature_names, vectorizer)``.
    """
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    return tfidf_matrix, vectorizer.get_feature_names_out(), vectorizer

# Function to calculate cosine similarity between two documents
def cosine_similarity_docs(tfidf_matrix, doc_index1, doc_index2):
    """Return the cosine similarity between two rows (documents) of ``tfidf_matrix``."""
    first_doc = tfidf_matrix[doc_index1]
    second_doc = tfidf_matrix[doc_index2]
    pairwise = cosine_similarity(first_doc, second_doc)
    return pairwise[0][0]

# Function to calculate cosine similarity between two words
def cosine_similarity_words(word1, word2, vectorizer, feature_names, tfidf_matrix):
    """Return the cosine similarity between the TF-IDF columns of two words.

    Each word is represented by its column of ``tfidf_matrix`` (one weight
    per document).  Returns 0.0 if either word is not in ``feature_names``.
    ``vectorizer`` is accepted but not used.
    """
    if not (word1 in feature_names and word2 in feature_names):
        return 0.0

    def column_as_row(word):
        # Position of the word in the vocabulary, then its column as a (1, n_docs) row.
        position = np.where(feature_names == word)[0][0]
        return tfidf_matrix[:, position].toarray().flatten().reshape(1, -1)

    first = column_as_row(word1)
    second = column_as_row(word2)
    return cosine_similarity(first, second)[0][0]

# Example usage
if __name__ == "__main__":
    documents = [
        "The cat sat on the mat",
        "The dog barked at the cat",
        "The mouse ran across the room",
        "The cat chased the mouse"
    ]

    # Compute TF-IDF matrix (rows = documents, columns = feature_names)
    tfidf_matrix, feature_names, vectorizer = compute_tfidf(documents)
    print("TF-IDF Matrix:")
    print(tfidf_matrix.toarray())
    print("\nFeature Names:")
    print(feature_names)

    # Calculate cosine similarity between documents
    doc_index1 = 0  # First document
    doc_index2 = 1  # Second document
    similarity = cosine_similarity_docs(tfidf_matrix, doc_index1, doc_index2)
    print(f"\nCosine Similarity between Document {doc_index1} and Document {doc_index2}: {similarity:.4f}")

    # Calculate cosine similarity between words (compares their per-document columns)
    word1 = "cat"
    word2 = "dog"
    word_similarity = cosine_similarity_words(word1, word2, vectorizer, feature_names, tfidf_matrix)
    print(f"\nCosine Similarity between words '{word1}' and '{word2}': {word_similarity:.4f}")

TF-IDF Matrix:
[[0.         0.         0.         0.30100231 0.         0.
  0.47157828 0.         0.47157828 0.         0.         0.47157828
  0.49217822]
 [0.         0.47157828 0.47157828 0.30100231 0.         0.47157828
  0.         0.         0.         0.         0.         0.
  0.49217822]
 [0.46073328 0.         0.         0.         0.         0.
  0.         0.36324741 0.         0.46073328 0.46073328 0.
  0.48085948]
 [0.         0.         0.         0.36145869 0.56629489 0.
  0.         0.4464734  0.         0.         0.         0.
  0.59103233]]

Feature Names:
['across' 'at' 'barked' 'cat' 'chased' 'dog' 'mat' 'mouse' 'on' 'ran'
 'room' 'sat' 'the']

Cosine Similarity between Document 0 and Document 1: 0.3328

Cosine Similarity between words 'cat' and 'dog': 0.5390


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter, defaultdict
import itertools

# Function to preprocess text (simple tokenization)
def preprocess(text):
    """Lower-case ``text`` and split it on whitespace."""
    lowered = text.lower()
    return lowered.split()

# Function to build co-occurrence matrix
def build_cooccurrence_matrix(corpus, window_size=2):
    """Count word co-occurrences within ``window_size`` tokens on either side.

    Returns ``(matrix, word_to_index, vocabulary)`` where ``vocabulary`` is
    the sorted list of distinct tokens and ``matrix[i][j]`` is how often
    word j appears in the window around word i.
    """
    seen_words = set()
    cooccurrence_counts = defaultdict(lambda: defaultdict(int))

    for sentence in corpus:
        tokens = preprocess(sentence)
        seen_words.update(tokens)

        for center, word in enumerate(tokens):
            lo = max(0, center - window_size)
            hi = min(len(tokens), center + window_size + 1)
            for position in range(lo, hi):
                if position == center:
                    continue  # a word is not its own context
                cooccurrence_counts[word][tokens[position]] += 1

    vocabulary = sorted(seen_words)
    word_to_index = {word: idx for idx, word in enumerate(vocabulary)}
    size = len(vocabulary)
    cooccurrence_matrix = np.zeros((size, size))

    for word, neighbors in cooccurrence_counts.items():
        row = word_to_index[word]
        for neighbor, count in neighbors.items():
            cooccurrence_matrix[row][word_to_index[neighbor]] = count

    return cooccurrence_matrix, word_to_index, vocabulary

# Function to compute PPMI matrix
def compute_ppmi_matrix(cooccurrence_matrix):
    """Return the positive pointwise mutual information (PPMI) matrix.

    For every cell with a positive count,
    ``PPMI = max(0, log2(P(w, c) / (P(w) * P(c))))`` where the probabilities
    are the cell, row-sum and column-sum divided by the grand total.
    Cells with a zero count stay 0.

    The result is always a float array, so integer count matrices are not
    truncated, and an all-zero input yields an all-zero result without
    dividing by zero.
    """
    counts = np.asarray(cooccurrence_matrix, dtype=float)
    ppmi_matrix = np.zeros_like(counts)

    total_sum = counts.sum()
    if total_sum == 0:
        return ppmi_matrix

    word_sum = counts.sum(axis=1, keepdims=True)     # column vector of row totals
    context_sum = counts.sum(axis=0, keepdims=True)  # row vector of column totals

    # Only cells with a positive count have a defined PMI; their row and
    # column totals are then positive as well.
    observed = counts > 0
    # P(w,c) / (P(w) P(c)) simplifies to count * total / (row_sum * col_sum).
    ratio = counts[observed] * total_sum / (word_sum * context_sum)[observed]
    ppmi_matrix[observed] = np.maximum(0, np.log2(ratio))

    return ppmi_matrix

# Function to compute cosine similarity between two vectors
def cosine_similarity_vectors(vec1, vec2):
    """Return the cosine similarity of two 1-D vectors."""
    row1 = vec1.reshape(1, -1)
    row2 = vec2.reshape(1, -1)
    return cosine_similarity(row1, row2)[0][0]

# Function to compute cosine similarity between two words
def cosine_similarity_words(word1, word2, ppmi_matrix, word_to_index):
    """Return the cosine similarity of two words' PPMI rows (0.0 if either word is unknown)."""
    if not (word1 in word_to_index and word2 in word_to_index):
        return 0.0
    first_row = ppmi_matrix[word_to_index[word1]]
    second_row = ppmi_matrix[word_to_index[word2]]
    return cosine_similarity_vectors(first_row, second_row)

# Example usage
if __name__ == "__main__":
    corpus = [
        "the cat sat on the mat",
        "the dog barked at the cat",
        "the mouse ran across the room",
        "the cat chased the mouse"
    ]

    # Build co-occurrence and PPMI matrices
    cooccurrence_matrix, word_to_index, vocabulary = build_cooccurrence_matrix(corpus, window_size=2)
    ppmi_matrix = compute_ppmi_matrix(cooccurrence_matrix)

    print("Vocabulary:")
    print(vocabulary)
    # print("\nPPMI Matrix:")
    # print(ppmi_matrix)

    # Cosine similarity between words
    word1 = "cat"
    word2 = "dog"
    similarity = cosine_similarity_words(word1, word2, ppmi_matrix, word_to_index)
    print(f"\nCosine Similarity between '{word1}' and '{word2}': {similarity:.4f}")

    # Cosine similarity between two documents.
    # A document vector is the sum of the PPMI rows of the words it contains.
    # (Summing the whole matrix along each axis compares the matrix with
    # itself and reports a similarity of 1.0 whatever the documents are.)
    doc1_vector = np.sum([ppmi_matrix[word_to_index[w]] for w in preprocess(corpus[0])], axis=0)
    doc2_vector = np.sum([ppmi_matrix[word_to_index[w]] for w in preprocess(corpus[1])], axis=0)
    doc_similarity = cosine_similarity_vectors(doc1_vector, doc2_vector)
    print(f"\nCosine Similarity between Document 1 and Document 2: {doc_similarity:.4f}")

Vocabulary:
['across', 'at', 'barked', 'cat', 'chased', 'dog', 'mat', 'mouse', 'on', 'ran', 'room', 'sat', 'the']

Cosine Similarity between 'cat' and 'dog': 0.3500

Cosine Similarity between Document 1 and Document 2: 1.0000


In [None]:
import re
from collections import defaultdict, Counter
import math

# Training Data: (sentence, sense label) pairs for the ambiguous word "bass".
# Three examples carry the "fish" sense and one the "guitar" sense.
train_data = [
    ("I love fish. The smoked bass fish was delicious.", "fish"),
    ("The bass fish swam along the line.", "fish"),
    ("He hauled in a big catch of smoked bass fish.", "fish"),
    ("The bass guitar player played a smooth jazz line.", "guitar"),
]

# Preprocess the sentences: tokenize and clean
def preprocess(sentence):
    """Lower-case ``sentence`` and return its word tokens (runs of word characters)."""
    lowered = sentence.lower()
    return re.findall(r'\b\w+\b', lowered)

# Build vocabulary and class-wise word counts
def train_naive_bayes(train_data):
    """Collect the statistics for a Naive Bayes sense classifier.

    Returns ``(vocab, word_counts, class_counts)``: the set of all tokens,
    a per-label ``Counter`` of tokens, and the number of training
    sentences per label.
    """
    vocab = set()
    word_counts = defaultdict(Counter)
    class_counts = defaultdict(int)

    for sentence, label in train_data:
        sentence_tokens = preprocess(sentence)
        vocab |= set(sentence_tokens)
        word_counts[label].update(sentence_tokens)
        class_counts[label] += 1

    return vocab, word_counts, class_counts

# Calculate log probabilities using Add-1 Smoothing
def calculate_log_probabilities(vocab, word_counts, class_counts):
    """Turn raw counts into log-probabilities with add-1 smoothing.

    Returns ``{label: {'class_log_prob': log P(label),
    'word_log_probs': {word: log P(word | label)}}}`` covering every word
    in ``vocab``.
    """
    total_classes = sum(class_counts.values())
    total_vocab_size = len(vocab)

    log_probs = {}
    for label in class_counts:
        prior = math.log(class_counts[label] / total_classes)
        label_counts = word_counts[label]
        # Add-1 smoothing: one pseudo-count per vocabulary word.
        denominator = sum(label_counts.values()) + total_vocab_size

        word_log_probs = {}
        for word in vocab:
            word_log_probs[word] = math.log((label_counts[word] + 1) / denominator)

        log_probs[label] = {
            'class_log_prob': prior,
            'word_log_probs': word_log_probs,
        }

    return log_probs

# Predict the sense of the test word based on the test sentence
def predict(test_sentence, target_word, log_probs, vocab):
    """Return the label with the highest Naive Bayes log-score for ``test_sentence``.

    Tokens outside ``vocab`` are ignored.  ``target_word`` is accepted but
    not used in the computation.  The first label wins ties.
    """
    known_tokens = [token for token in preprocess(test_sentence) if token in vocab]

    best_label, best_log_prob = None, float('-inf')
    for label, params in log_probs.items():
        score = params['class_log_prob']
        word_scores = params['word_log_probs']
        for token in known_tokens:
            score += word_scores.get(token, 0)

        if score > best_log_prob:
            best_label, best_log_prob = label, score

    return best_label

# Main function
def main():
    """Train on ``train_data`` and print the predicted sense of "bass" in a test sentence."""
    vocab, word_counts, class_counts = train_naive_bayes(train_data)
    log_probs = calculate_log_probabilities(vocab, word_counts, class_counts)

    test_word = "bass"
    test_sentence = "He loves jazz. The bass line provided the foundation for the guitar solo in the jazz piece"

    predicted_sense = predict(test_sentence, test_word, log_probs, vocab)
    for line in (
        f"Test sentence: {test_sentence}",
        f"Test word: {test_word}",
        f"Output: {predicted_sense}",
    ):
        print(line)

if __name__ == "__main__":
    main()

Test sentence: He loves jazz. The bass line provided the foundation for the guitar solo in the jazz piece
Test word: bass
Output: guitar
