In [2]:
import re 
import os 
import random

In [3]:
DATA_FILE = "output_language_text.txt"
MARKOV_ORDER = 2
# some constants and file paths 

In [16]:
class MarkovChain:
    def __init__(self, order=MARKOV_ORDER):
        self.order = order
        # transitions is a dictionary of prefixes i.e. tuple of length markov order, and value is the next word
        self.transitions = {}

    def train(self, sequence):
        if len(sequence) < self.order + 1:
            print("Sequence too short to train chain of this order")
            return
        
        for i in range(len(sequence) - self.order):
            prefix = tuple(sequence[i:i+self.order])
            next_item = sequence[i + self.order]

            if(prefix not in self.transitions):
                self.transitions[prefix] = []
            self.transitions[prefix].append(next_item)

    
    def generate_next(self, current_prefix):
        if current_prefix not in self.transitions:
            all_possible_next_items = [item for sublist in self.transitions.values() for item in sublist]
            if not all_possible_next_items:
                raise StopIteration("Chain is stuck, no possible next state")

            return random.choice(all_possible_next_items)
        
        return random.choice(self.transitions[current_prefix])
    
def read_and_preprocess_data(filepath):
    if not os.path.exists(filepath):
        return None
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
            text = text.replace("___", "").strip()
            return text
    except Exception as e:
        print("Error reading file")
        return None
    

def tokenize_words(text):
    tokens = re.findall(r'[\w\u0900-\u097f]+', text, re.UNICODE)
    return tokens

def tokenize_sentences(text):
    sentences = re.split(r'[.।?!]', text)
    return [s.strip() for s in sentences if s.strip()]


def generate_text(chain, initial_sequence, length):
    """Generates a sequence of a given length using the trained chain."""
    
    # Ensure the initial sequence is the correct size for the order
    current_sequence = list(initial_sequence[-chain.order:])
    generated_sequence = list(current_sequence)
    
    for _ in range(length):
        prefix = tuple(current_sequence)
        
        # If the model gets stuck and the fallback fails (very rare), break.
        try:
            # Generate the next item (handles unknown prefix internally)
            next_item = chain.generate_next(prefix)
        except StopIteration:
            print("\n(Chain stopped unexpectedly.)", file=sys.stderr)
            break

        # Append to the generated text and update the sequence for the next prediction
        generated_sequence.append(next_item)
        current_sequence.pop(0)
        current_sequence.append(next_item)
        
    # Return only the newly generated part (excluding the initial sequence)
    return generated_sequence[chain.order:]



def run_general_generation(data):
    """Implements Part A: Character, Word, and Sentence generation."""
    print("\n" + "="*50)
    print("Part A: Character, Word, and Sentence Generation")
    print("="*50)

    # --- 1. Character Generation ---
    print("\n--- 1. Character Generation (Length: 100) ---")
    char_chain = MarkovChain(order=2)
    char_sequence = list(data)
    char_chain.train(char_sequence)
    
    start_char_prefix = tuple(char_sequence[0:2])
    generated_chars = generate_text(char_chain, start_char_prefix, 100)
    print("".join(generated_chars))

    # --- 2. Word Generation ---
    print("\n--- 2. Word Generation (Length: 30 Words) ---")
    words = tokenize_words(data)
    word_chain = MarkovChain(order=2)
    word_chain.train(words)
    
    start_word_prefix = tuple(words[0:2])
    generated_words = generate_text(word_chain, start_word_prefix, 30)
    print(" ".join(generated_words))

    # --- 3. Sentence Generation ---
    print("\n--- 3. Sentence Generation (1 Sentence) ---")
    # Generate a single sentence by generating 10 words and hoping it sounds cohesive
    words = tokenize_words(data)
    word_chain = MarkovChain(order=3) # Higher order helps for better local coherence
    word_chain.train(words)
    
    start_word_prefix = random.choice([tuple(words[i:i+3]) for i in range(len(words)-3)])
    generated_words = generate_text(word_chain, start_word_prefix, 15)
    
    # Capitalize the first word and add a sentence ending
    sentence_text = " ".join(generated_words).capitalize() + "।"
    print(sentence_text)



def predict_next_words(word_chain, input_phrase, count=2):
    """Implements Part C: Predicts the next two words based on input."""
    
    input_words = tokenize_words(input_phrase)
    
    if len(input_words) < word_chain.order:
        print(f"Error: Input phrase must be at least {word_chain.order} words long for this model order.")
        return ""
    
    # Get the prefix (the last 'order' words)
    start_prefix = tuple(input_words[-word_chain.order:])
    
    # Generate the words
    next_words = generate_text(word_chain, start_prefix, count)
    
    return " ".join(next_words)



def predict_next_sentence(data, custom_query):
    """
    Implements Part D: A simple conversational system based on sentence transitions.
    It links one sentence to the next in the corpus and uses that as a Q&A pattern.
    """
    print("\n" + "="*50)
    print("Part D: Conversational System (Next Sentence Prediction)")
    print("="*50)
    
    sentences = tokenize_sentences(data)
    if len(sentences) < 2:
        print("Not enough sentences to build a conversational model.")
        return

    # Train a Markov Chain where the state is the PREVIOUS sentence
    # and the next item is the CURRENT sentence. This simulates conversation flow.
    sentence_chain = MarkovChain(order=1)
    
    # Build a sequence of (Sentence1, Sentence2, Sentence3, ...)
    # This chain will map: (Sentence_N) -> [Sentence_N+1]
    sentence_chain.train(sentences)
    
    # Input/Output Examples
    
    # 1. Use a sentence directly from the corpus to find its 'answer'
    query_1 = sentences[random.randint(0, len(sentences) - 2)]
    
    # The prefix is the full sentence itself
    response_1 = sentence_chain.generate_next(tuple([query_1]))
    
    print(f"Query (Corpus Sentence): \"{query_1}\"")
    print(f"Response (Next Sentence): \"{response_1}\"")
    
    # 2. Use a custom query and try to find the best match to initiate a response
    custom_query = custom_query or "जापान की राजधानी क्या है?"  # What is the capital of Japan?
    
    # Find the sentence in the corpus that has the most matching words
    best_match = max(sentences, key=lambda s: len(set(tokenize_words(custom_query)) & set(tokenize_words(s))))
    
    response_2 = sentence_chain.generate_next(tuple([best_match]))
    
    print(f"\nQuery (Custom Match): \"{custom_query}\"")
    print(f"Best Corpus Match: \"{best_match}\"")
    print(f"Response (Next Sentence): \"{response_2}\"")


        

In [9]:
corpus_data = read_and_preprocess_data(DATA_FILE)
if corpus_data: 
    run_general_generation(corpus_data)
        
        # --- Setup for Part C (Word Prediction) ---
    words_for_c = tokenize_words(corpus_data)
    word_chain_c = MarkovChain(order=2) # Order 2 is standard for word prediction
    word_chain_c.train(words_for_c)
        
    print("\n" + "="*50)
    print("Part C: Next Two Words Prediction")
    print("="*50)

        # Example: Predict next two words for "India is" (Hindi equivalent)
    input_phrase = "संयुक्त राज्य अमेरिका" # United States of America
    next_two_words = predict_next_words(word_chain_c, input_phrase, count=2)
        
    print(f"Input: \"{input_phrase}\" --> Output: \"{next_two_words}\"")
        
        # Example 2
    input_phrase_2 = "रूस एक" # Russia is a
    next_two_words_2 = predict_next_words(word_chain_c, input_phrase_2, count=2)
    print(f"Input: \"{input_phrase_2}\" --> Output: \"{next_two_words_2}\"")


        # --- Run Part D (Conversational System) ---
    predict_next_sentence(corpus_data)
        
else:
    print("Cannot run Markov generation without corpus data.")


Part A: Character, Word, and Sentence Generation

--- 1. Character Generation (Length: 100) ---
 एकतीन और में जापान पर्म की से रूप रूप है।
वर्व (Волов)
2010 में) सक एक्षेत्तरीफ़िंग्रहवीपीय आपर्खा 

--- 2. Word Generation (Length: 30 Words) ---
日本 निप्पोन या निहोन एशिया महाद्वीप के पूर्व में कनाडा की सीमा है। रूस की आधारशिला कहा जा सकता है। जापान की संस्कृति का अंधानुकरण किया है। बौद्ध धर्म यहां

--- 3. Sentence Generation (1 Sentence) ---
के रूप हैं जुड़वा अँगूठी motegi था होंडा द्वारा 1997 में पूरा करने के लिए।

Part C: Next Two Words Prediction
Input: "संयुक्त राज्य अमेरिका" --> Output: "के संयुक्त"
Input: "रूस एक" --> Output: "से के"

Part D: Conversational System (Next Sentence Prediction)
Query (Corpus Sentence): "जापान एक सबसे सफल एशिया में फुटबॉल टीमों में से एक है, एशियाई कप जीतने तीन बार"
Response (Next Sentence): "गोल्फ भी जापान, के रूप में लोकप्रिय है सुपर जी"

Query (Custom Match): "जापान की राजधानी क्या है?"
Best Corpus Match: "जापान की राजधानी टोक्यो है और उसके अन्य बड़े

In [14]:
DATA_FILE2 = "output_convo.txt"
from scraper import scrape_all
urls_list_convo = [
    "https://www.reddit.com/r/selfhosted/comments/vw1oyi/a_static_and_simple_forum_solution/",
    "https://www.reddit.com/r/devops/comments/1on5cjn/how_can_i_improve_my_kubernetes_and_cloud_skills/",
    "https://www.reddit.com/r/Backend/comments/1ollqqr/suggest_a_good_backend_project_that_has_real_life/",
    "https://www.reddit.com/r/Backend/comments/1olkzqn/confused_between_learning_java_spring_boot_or/",
    "https://www.reddit.com/r/golang/comments/1nta34y/awesome_go_applications_open_source/",
    "https://www.reddit.com/r/Compilers/comments/1oon6p5/handling_expressions_with_parsers/",
    "https://www.reddit.com/r/Compilers/comments/1ojkq04/gpu_vs_ml_compiler_engineer/",
    "https://www.reddit.com/r/Compilers/comments/1onc170/interview_for_a_ml_compiler_role_at_waymo/",
    "https://www.reddit.com/r/Compilers/comments/1odbjq2/automated_generation_of_highlevel_code_from/",
    "https://www.reddit.com/r/compsci/comments/1ooyxmj/seriously_llms_are_killing_captcha_need_2_mins_of/",
    "https://www.reddit.com/r/compsci/comments/1oktglu/i_built_a_python_debugging_tool_that_uses/",
    "https://www.reddit.com/r/compsci/comments/1oktglu/i_built_a_python_debugging_tool_that_uses/",
    "https://www.reddit.com/r/ChatGPT/comments/1oovik0/i_trapped_an_llm_in_a_small_box_and_told_him_to/",
    "https://www.reddit.com/r/developersIndia/comments/1ooq6br/indiabiz5_nontechnical_founder_struggling_with_my/",
    "https://www.reddit.com/r/devops/comments/1ookpme/leetcode_style_interview_for_devops_role/",
    "https://reddit.com/r/computerscience",
    "https://reddit.com/r/compsci",
    "https://reddit.com/r/cscareerquestions",
    "https://reddit.com/r/cscareerquestionsOCE",
    "https://reddit.com/r/cscareerquestionsEU",
    "https://reddit.com/r/cscareerquestionsCAD",
    "https://reddit.com/r/csMajors",
    "https://reddit.com/r/cseducation",
    "https://reddit.com/r/MSCS",
    "https://reddit.com/r/cscareers",
    "https://reddit.com/r/AskComputerScience",
    "https://reddit.com/r/algorithms",
    "https://reddit.com/r/programming",
    "https://reddit.com/r/learnprogramming",
    "https://reddit.com/r/coding",
    "https://reddit.com/r/webdev",
    "https://reddit.com/r/SideProject",
    "https://reddit.com/r/javascript",
    "https://reddit.com/r/learnjavascript",
    "https://reddit.com/r/reactjs",
    "https://reddit.com/r/database",
    "https://reddit.com/r/mongodb",
    "https://reddit.com/r/mysql",
    "https://reddit.com/r/PostgreSQL",
    "https://reddit.com/r/redis",
    "https://reddit.com/r/datascience",
    "https://reddit.com/r/datasets",
    "https://reddit.com/r/machinelearning",
    "https://reddit.com/r/MLQuestions",
    "https://reddit.com/r/artificial",
    "https://reddit.com/r/LanguageTechnology",
    "https://reddit.com/r/computervision",
    "https://reddit.com/r/networking",
    "https://reddit.com/r/opensource",
    "https://reddit.com/r/softwaredevelopment",
    "https://reddit.com/r/tinycode",
    "https://reddit.com/r/git",
    "https://reddit.com/r/github",
    "https://reddit.com/r/cpp",
    "https://reddit.com/r/Cplusplus",
    "https://reddit.com/r/LearnCpp",
    "https://reddit.com/r/Cpp_questions",
    "https://reddit.com/r/Csharp",
    "https://reddit.com/r/dotnet",
    "https://reddit.com/r/GameDev",
    "https://reddit.com/r/truegamedev",
    "https://reddit.com/r/UnrealEngine",
    "https://reddit.com/r/Unity3D",
    "https://reddit.com/r/godot",
    "https://reddit.com/r/JustGameDevThings",
    "https://reddit.com/r/bash",
    "https://reddit.com/r/commandline",
    "https://reddit.com/r/shell",
    "https://reddit.com/r/emacs",
    "https://reddit.com/r/neovim",
    "https://reddit.com/r/vim",
    "https://reddit.com/r/vscode",
    "https://reddit.com/r/arduino",
    "https://reddit.com/r/embedded",
    "https://reddit.com/r/raspberry_pi",
    "https://reddit.com/r/dotfiles",
    "https://reddit.com/r/regex",
    "https://reddit.com/r/FunctionalProgramming",
    "https://reddit.com/r/ProgrammingLanguages",
    "https://reddit.com/r/AskCompSci",
    "https://reddit.com/r/DatabaseHelp",
    "https://reddit.com/r/mariadb",
    "https://reddit.com/r/RethinkDB",
    "https://reddit.com/r/SQLServer",
    "https://reddit.com/r/security",
    "https://reddit.com/r/netsec",
    "https://reddit.com/r/computerforensics",
    "https://reddit.com/r/crypto",
    "https://reddit.com/r/hackernews",
    "https://reddit.com/r/dataisbeautiful",
    "https://reddit.com/r/coolgithubprojects",
    "https://reddit.com/r/softwaregore",
    "https://reddit.com/r/unixporn",
    "https://reddit.com/r/WatchPeopleCode",
    "https://reddit.com/r/ProgrammerHumor",
    "https://reddit.com/r/programmerreactions",
    "https://reddit.com/r/itsaunixsystem",
    "https://reddit.com/r/epochfail",
    "https://reddit.com/r/RecruitingHell",
    "https://reddit.com/r/WebDeveloperJobs",
    "https://reddit.com/r/learnpython",
    "https://reddit.com/r/python",
    "https://reddit.com/r/java",
    "https://reddit.com/r/csharp",
    "https://reddit.com/r/swift",
    "https://reddit.com/r/ruby",
    "https://reddit.com/r/golang",
    "https://reddit.com/r/scala",
    "https://reddit.com/r/rust",
    "https://reddit.com/r/c",
    "https://reddit.com/r/objectivec",
    "https://reddit.com/r/perl",
    "https://reddit.com/r/php",
    "https://reddit.com/r/typescript",
    "https://reddit.com/r/assembly",
    "https://reddit.com/r/dotnetcore",
    "https://reddit.com/r/flutter",
    "https://reddit.com/r/kotlin"
]

scrape_all(urls_list_convo, DATA_FILE2)

Scraping complete lol


In [18]:
corpus_data2 = read_and_preprocess_data(DATA_FILE2)
if corpus_data2: 
    run_general_generation(corpus_data2)
        
        # --- Setup for Part C (Word Prediction) ---
    words_for_c = tokenize_words(corpus_data2)
    word_chain_c = MarkovChain(order=2) # Order 2 is standard for word prediction
    word_chain_c.train(words_for_c)
        
    print("\n" + "="*50)
    print("Part C: Next Two Words Prediction")
    print("="*50)

    input_phrase = "help me self host on my old laptop" # United States of America
    next_two_words = predict_next_words(word_chain_c, input_phrase, count=2)
        
    print(f"Input: \"{input_phrase}\" --> Output: \"{next_two_words}\"")
        
        # Example 2
    input_phrase_2 = "kubernetes is a "
    next_two_words_2 = predict_next_words(word_chain_c, input_phrase_2, count=2)
    print(f"Input: \"{input_phrase_2}\" --> Output: \"{next_two_words_2}\"")


        # --- Run Part D (Conversational System) ---
    predict_next_sentence(corpus_data2, "this is a custom query")
        
else:
    print("Cannot run Markov generation without corpus data.")


Part A: Character, Word, and Sentence Generation

--- 1. Character Generation (Length: 100) ---
stilts-ons of) hation.
        

   Whobile off-prolowe sag a re entes.   


          

       

  

--- 2. Word Generation (Length: 30 Words) ---
to advertise promote products or services or engage in such behavior will be removed based on ingredients available in the sub before posting If it did not create Git For

--- 3. Sentence Generation (1 Sentence) ---
Create your account and connect with a world of communities anyone can view post and।

Part C: Next Two Words Prediction
Input: "help me self host on my old laptop" --> Output: "into board"
Input: "kubernetes is a " --> Output: "TUI terminal"

Part D: Conversational System (Next Sentence Prediction)
Query (Corpus Sentence): "Say that AI is so incredibly effective and well developed in two years that it eliminates 50% of all work that we have to do"
Response (Next Sentence): "Okay"

Query (Custom Match): "this is a custom query"
Best 