In [None]:

import nltk
import random
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases load it
# from the 'punkt_tab' resource rather than the older pickled 'punkt' one, so
# request both to keep the notebook runnable across NLTK versions.
# NOTE(review): on an NLTK too old to know 'punkt_tab', download() is expected
# to report the unknown id and return False rather than raise — confirm.
nltk.download('punkt')
nltk.download('punkt_tab')
    

In [None]:

# Sample text
text = "This is a simple example to demonstrate N-gram language modeling. This is just a test."

# Preprocessing: lowercase the text, drop every character that is neither a
# word character nor whitespace, then split the result into tokens.
text = re.sub(r'[^\w\s]', '', text.lower())
tokens = word_tokenize(text)
    

In [None]:

# Generate N-grams
def generate_ngrams(tokens, n):
    """Return the N-grams of ``tokens`` as a concrete list.

    Args:
        tokens: Sequence of tokens to slide the window over.
        n: Window size (the "N" in N-gram).

    Returns:
        A list with every item produced by ``nltk.util.ngrams(tokens, n)``,
        in the order they are produced.
    """
    windows = ngrams(tokens, n)
    return [*windows]

# Build the 1-, 2- and 3-gram lists of the sample tokens in one sweep over the orders.
unigrams, bigrams, trigrams = (generate_ngrams(tokens, order) for order in (1, 2, 3))
    

In [None]:

# Compute probabilities
def calculate_probabilities(ngrams_list):
    """Map each distinct n-gram to its relative frequency in ``ngrams_list``.

    Args:
        ngrams_list: Iterable of hashable n-grams (duplicates are expected).

    Returns:
        A dict ``{ngram: occurrences / total}`` where ``total`` is the number
        of items in ``ngrams_list``. Keys appear in first-seen order. An empty
        input yields an empty dict.
    """
    tally = Counter(ngrams_list)
    denominator = sum(tally.values())

    probabilities = {}
    for gram, occurrences in tally.items():
        probabilities[gram] = occurrences / denominator
    return probabilities

# Relative frequency of each distinct trigram among all trigrams of the sample text.
trigram_probs = calculate_probabilities(trigrams)
    

In [None]:

# Generate text
def generate_text(seed_words, length, probs=None):
    """Extend ``seed_words`` by sampling words from a trigram model.

    At each step the last two words of the sentence are used as context; the
    next word is drawn at random from the third words of all trigrams starting
    with that context, weighted by the trigram's probability.

    Args:
        seed_words: Iterable of starting words. The last two are the initial
            context, so fewer than two words means nothing can be generated.
        length: Maximum number of words to append.
        probs: Optional mapping ``{(w1, w2, w3): weight}``. When omitted, the
            module-level ``trigram_probs`` is looked up at call time.

    Returns:
        The seed words plus the generated words, joined by single spaces.
        Generation stops early when no trigram matches the current context.
    """
    model = trigram_probs if probs is None else probs

    # Group the trigrams by their two-word context once, instead of rescanning
    # the whole model on every step. Insertion order is kept so the candidate
    # order handed to random.choices is the same as a linear scan would give.
    by_context = {}
    for trigram in model:
        by_context.setdefault(trigram[:2], []).append(trigram)

    sentence = list(seed_words)
    for _ in range(length):
        possible_trigrams = by_context.get(tuple(sentence[-2:]))
        if not possible_trigrams:
            break
        next_word = random.choices(
            [trigram[2] for trigram in possible_trigrams],
            weights=[model[trigram] for trigram in possible_trigrams],
        )[0]
        sentence.append(next_word)
    return ' '.join(sentence)

# Example usage: continue the seed pair ("this", "is") with up to 5 randomly
# sampled words (fewer if generation stops early), so output can vary per run.
print("Generated Text:", generate_text(("this", "is"), 5))
    

In [None]:

# Generate N-grams for 1 to 5
def generate_ngram_counts(tokens, n):
    """Count how often each distinct n-gram of order ``n`` occurs in ``tokens``.

    Args:
        tokens: Sequence of tokens.
        n: N-gram order.

    Returns:
        A ``Counter`` keyed by the n-grams returned from ``generate_ngrams``.
    """
    return Counter(generate_ngrams(tokens, n))

ngram_counts = {n: generate_ngram_counts(tokens, n) for n in range(1, 6)}

# Convert to DataFrame for analysis.
# Each order has a different number of distinct n-grams, and the DataFrame
# constructor raises ValueError when given plain lists of unequal length.
# Wrapping every column in a Series lets pandas align the columns on their
# positional index and pad the shorter ones with NaN. dtype=float keeps a
# column numeric even when an order has no n-grams at all.
df = pd.DataFrame({
    n: pd.Series(list(ngram_counts[n].values()), dtype=float)
    for n in range(1, 6)
})
df.plot(kind='bar', figsize=(10, 5), title="N-gram Frequency Distribution")
plt.show()
    

In [None]:

# Read text file
# Decode explicitly so the result does not depend on the platform's default
# text encoding.
# NOTE(review): assumes sample.txt is saved as UTF-8 — adjust if it is not.
with open("sample.txt", "r", encoding="utf-8") as file:
    text = file.read().lower()

# Same cleanup as for the inline sample: strip everything that is neither a
# word character nor whitespace, then tokenize.
text = re.sub(r'[^\w\s]', '', text)
tokens = word_tokenize(text)

# Generate N-grams and their frequencies
unigram_freq, bigram_freq, trigram_freq = (
    Counter(generate_ngrams(tokens, order)) for order in (1, 2, 3)
)

# Report the ten most frequent entries for each order.
print("Unigrams:", unigram_freq.most_common(10))
print("Bigrams:", bigram_freq.most_common(10))
print("Trigrams:", trigram_freq.most_common(10))
    

In [None]:

# Sample domain-specific text (e.g., social media)
text = "omg this new phone is awesome!!! can't wait to try it #excited"

# Domain-specific preprocessing: lowercase, remove hashtags first (while the
# '#' is still present to mark them), then remove the remaining punctuation.
without_hashtags = re.sub(r'#[\w]+', '', text.lower())
text = re.sub(r'[^\w\s]', '', without_hashtags)
del without_hashtags  # scratch value only; keep the notebook namespace unchanged
tokens = word_tokenize(text)

# Generate and compute probabilities
trigram_probs = calculate_probabilities(generate_ngrams(tokens, 3))

# Highest-probability trigrams first; the sort is stable, so ties keep
# their first-seen order.
print("Top Trigrams:", sorted(trigram_probs.items(), key=lambda item: item[1], reverse=True)[:5])
    