<a href="https://colab.research.google.com/github/arbin34/heruk/blob/main/LM_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install wxconv



In [None]:
import os
import re
import string
import CONSTANTS
import logging
from wxconv import WXC

# Connective words at which sentences are split.
# SIMPLE_CONNECTIVES is a set of connective strings (one entry is two words long).
# COMPLEX_CONNECTIVES is a dict whose keys are used as additional split points;
# it is empty in this cell.
SIMPLE_CONNECTIVES = { 'कि', 'और', 'एवं', 'इसलिए', 'क्योंकि', 'जबकि', 'तथा', 'ताकि', 'मगर', 'लेकिन', 'किंतु', 'परंतु', 'फिर', 'तथापि',
                      'नहीं तो'}
COMPLEX_CONNECTIVES = {}

def log(message, log_type='OK'):
    """Print a log line in the form ``[<log_type>] : <message>``.

    Args:
        message: Text to report.
        log_type: Label placed in the brackets; defaults to 'OK'. The label
            'ERROR' additionally terminates the program.

    Raises:
        SystemExit: When ``log_type`` is 'ERROR'.
    """
    # Imported locally because the surrounding file never imports ``sys``;
    # without it the ERROR path failed with NameError instead of exiting.
    import sys

    print(f'[{log_type}] : {message}')
    if log_type == 'ERROR':
        sys.exit()
def read_input(file_path):
    """Read ``file_path`` and return a ``{sentence_id: sentence}`` dictionary.

    Every non-blank line is split at its first space: the text before it is
    the sentence id, the remainder (stripped) is the sentence. A line that
    holds only an id is stored with an empty sentence instead of raising
    IndexError. When an id occurs more than once the last line wins.

    Args:
        file_path: Path of the UTF-8 encoded input file.

    Returns:
        dict mapping sentence id to sentence text.

    Raises:
        SystemExit: When the file does not exist (after logging an error).
    """
    # Local import: the surrounding file never imports ``sys``.
    import sys

    log(f'File ~ {file_path}')
    input_data = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if not line:
                    continue
                # partition() always yields three parts, so an id-only line
                # produces an empty sentence rather than an IndexError.
                key, _, value = line.partition(' ')
                input_data[key] = value.strip()
            log('File data read.')
    except FileNotFoundError:
        log('No such file found.', 'ERROR')
        sys.exit()
    return input_data

def clean(word):
    """Return ``word`` without ASCII digits and the symbols ``!@#$%^&*()``."""
    unwanted = re.compile(r'[0-9!@#$%^&*()]')
    return unwanted.sub('', word)

def validate_sentence(sentence):
    """Return True when ``sentence`` is non-empty and has at least one alphabetic character."""
    if not sentence:
        return False
    for character in sentence:
        if character.isalpha():
            return True
    return False

def split_sentence(sentence):
    """Split ``sentence`` at every simple or complex connective, keeping the connectives.

    Connectives are matched only as whole words, so a connective that merely
    occurs inside a longer word does not cut that word apart. Longer
    connectives are tried first so that a connective which is a prefix of
    another one cannot shadow it; the order no longer depends on set
    iteration order.

    Args:
        sentence: Text to split.

    Returns:
        list of stripped, non-empty pieces in their original order; the
        matched connectives appear as separate items.
    """
    connectives = SIMPLE_CONNECTIVES | set(COMPLEX_CONNECTIVES)
    if not connectives:
        # An empty alternation would match at every position.
        whole = sentence.strip()
        return [whole] if whole else []

    ordered = sorted(connectives, key=lambda item: (-len(item), item))
    alternatives = '|'.join(re.escape(item) for item in ordered)
    # Characters that count as "inside a word": regex word characters plus the
    # Devanagari block (its vowel signs and marks are not matched by \w),
    # leaving out the danda punctuation at U+0964/U+0965.
    word_char = r'[\w\u0900-\u0963\u0966-\u097F]'
    pattern = f'(?<!{word_char})({alternatives})(?!{word_char})'
    parts = re.split(pattern, sentence)
    return [part.strip() for part in parts if part.strip()]

def sanitize_input(sentence):
    """Strip digits and special characters from ``sentence`` via a WX round trip.

    The text is converted with ``WXC(order="utf2wx")``, every whitespace
    separated word is passed through ``clean``, and the result is converted
    back with ``WXC(order="wx2utf")``. A trailing '.' is replaced by ' ।'.
    """
    wx_text = WXC(order="utf2wx", lang="hin").convert(sentence)
    cleaned_words = [clean(wx_word) for wx_word in wx_text.strip().split()]
    cleaned_wx = " ".join(cleaned_words)
    hindi_text = WXC(order="wx2utf", lang="hin").convert(cleaned_wx).strip()
    if not hindi_text.endswith('.'):
        return hindi_text
    return hindi_text[:-1] + " ।"

def write_output(dictionary, file_path, manual_evaluation):
    """Write every sub-sentence to ``file_path``, one per line.

    Each line has the form ``<key><suffix>  <sentence>  <tag>``. The suffix
    labels the sub-sentences of one key as a, b, ..., z, aa, ab, ... so that
    more than 26 sub-sentences no longer raise IndexError. The tag is
    'Manual evaluation' when the sub-sentence (before a closing poornaviram
    is added) occurs in ``manual_evaluation``, otherwise 'None'.

    Args:
        dictionary: Mapping of sentence id to a list of sub-sentences.
        file_path: Destination file; written as UTF-8 and overwritten.
        manual_evaluation: Collection of sentences flagged for manual review.
    """
    def _suffix(position):
        """Return the letter label for a 1-based position (1 -> 'a', 27 -> 'aa')."""
        letters = []
        while position > 0:
            position, remainder = divmod(position - 1, 26)
            letters.append(string.ascii_lowercase[remainder])
        return ''.join(reversed(letters))

    with open(file_path, 'w', encoding='utf-8') as file:
        for key, values in dictionary.items():
            for position, item in enumerate(values, start=1):
                tag = 'Manual evaluation' if item in manual_evaluation else 'None'
                # Each sub-sentence should end with poornaviram
                if not item.endswith('।'):
                    item += ' ।'
                file.write(f'{key}{_suffix(position)}  {item}  {tag}\n')
    log("Output file written successfully")

def separate_sentences_with_connectives(sentence):
    """Split ``sentence`` at every connective and drop the connectives.

    Connectives are matched only as whole words and in a single left-to-right
    pass. The previous per-connective substring search also matched inside
    longer words, and its overlapping matches could move the cut position
    backwards so that text was emitted twice.

    Args:
        sentence: Text to split.

    Returns:
        list of the stripped, non-empty pieces between connectives, in order.
        A sentence without any connective is returned as a one-item list.
    """
    connectives = SIMPLE_CONNECTIVES | set(COMPLEX_CONNECTIVES)
    if not connectives:
        whole = sentence.strip()
        return [whole] if whole else []

    # Longest first so a connective that is a prefix of another cannot win.
    ordered = sorted(connectives, key=lambda item: (-len(item), item))
    alternatives = '|'.join(re.escape(item) for item in ordered)
    # Characters that count as "inside a word": regex word characters plus the
    # Devanagari block (its vowel signs and marks are not matched by \w),
    # leaving out the danda punctuation at U+0964/U+0965.
    word_char = r'[\w\u0900-\u0963\u0966-\u097F]'
    pattern = f'(?<!{word_char})(?:{alternatives})(?!{word_char})'

    pieces = (piece.strip() for piece in re.split(pattern, sentence))
    return [piece for piece in pieces if piece]

# Helpers for reading the tagger/parser output files and for locating words in token lists.


def is_prev_word_verb(parser_output, index):
    """Report whether line ``index`` of a tagger output file carries a verb tag.

    Callers in this file pass the position of the token that precedes a
    connective; the function assumes line ``index`` of the file describes
    that token (one token per line, tag in the second column).

    Args:
        parser_output: Path of the file to inspect.
        index: Zero-based line number to look at.

    Returns:
        True when the second column of that line is 'VM' or 'VAUX'. False
        otherwise, including when ``index`` is outside the file or the line
        has fewer than two columns (which used to raise IndexError).

    Raises:
        SystemExit: When the file does not exist (after logging an error).
    """
    # Local import: the surrounding file never imports ``sys``.
    import sys

    try:
        with open(parser_output, 'r', encoding='utf-8') as file:
            lines = file.readlines()
    except FileNotFoundError:
        log('No such File found.', 'ERROR')
        sys.exit()

    # Explicit range check: a negative index must not wrap to the file's end.
    if not 0 <= index < len(lines):
        return False
    columns = lines[index].strip().split()
    return len(columns) >= 2 and columns[1] in ('VM', 'VAUX')

def get_index_of_word(words, value):
    """Return the position of the first item of ``words`` equal to ``value``, or -1."""
    for position, word in enumerate(words):
        if word == value:
            return position
    return -1

def get_word_at_index(words, index):
    """Return ``words[index]``, or an empty string when ``index`` is not a valid position."""
    matches = (word for position, word in enumerate(words) if position == index)
    return next(matches, "")

def get_POS_by_index(parser_output, index):
    """Return the fourth column of the parser row whose id is ``index + 1``.

    Only rows with exactly ten whitespace-separated columns are considered.
    An empty string is returned when no such row exists. A missing file is
    logged as an error, which terminates the program.
    """
    try:
        with open(parser_output, 'r') as file:
            for fields in (line.strip().split() for line in file.readlines()):
                if len(fields) == 10 and fields[0] == str(index + 1):
                    return fields[3]
            return ''

    except FileNotFoundError:
        log('No such File found.', 'ERROR')
        sys.exit()

def get_dep_by_index(parser_output, index):
    """Return the eighth column of the parser row whose id is ``index + 1``.

    Only rows with exactly ten whitespace-separated columns are considered.
    An empty string is returned when no such row exists. A missing file is
    logged as an error, which terminates the program.
    """
    try:
        with open(parser_output, 'r') as file:
            for fields in (line.strip().split() for line in file.readlines()):
                if len(fields) == 10 and fields[0] == str(index+1):
                    return fields[7]
            return ''

    except FileNotFoundError:
        log('No such File found.', 'ERROR')
        sys.exit()

def breakPairConnective(sentence, manual_evaluation):
    """Try to split ``sentence`` in two at a paired connective.

    Args:
        sentence: Text to examine; it is tokenised on whitespace.
        manual_evaluation: List that is mutated in place. The sentence is
            appended each time a pair word is found but the token before it
            is not tagged as a verb, so it may be appended more than once.

    Returns:
        A two-item list ``[first_part, second_part]`` when a split was made,
        otherwise an empty list.
    """
    simpler_sentences = []
    BREAK_SENTENCE = False
    # Tokenize the sentence by splitting it into words
    tokens = sentence.split()
    # Iterate through the tokens to find connectives and split the sentence
    for i in range(len(tokens)):
        token = tokens[i]
        # Check if the token is a paired-connective
        if token in CONSTANTS.COMPLEX_CONNECTIVES:
            # The mapping value is iterated as a list of candidate partner strings.
            pair_value_lst = CONSTANTS.COMPLEX_CONNECTIVES[token]
            for pair_value in pair_value_lst:
                # Substring test on the raw sentence, not on the token list.
                if pair_value in sentence:
                    # Only the first word of a multi-word partner is looked up in the tokens.
                    pair_value = pair_value.strip().split()[0]
                    index_of_pair_value = get_index_of_word(tokens, pair_value)
                    if not (index_of_pair_value == -1):
                        # Refresh the tagger output before checking the word ahead of the partner.
                        get_tagger_output(sentence)
                        if is_prev_word_verb(CONSTANTS.PARSER_OUTPUT, index_of_pair_value - 1):
                            # Drop the opening connective, then shift the partner's index by one.
                            # NOTE(review): the shift is only right when the partner comes after
                            # token i - confirm that this always holds.
                            tokens.pop(i)
                            index_of_pair_value = index_of_pair_value - 1
                            # The partner word starts the second part.
                            sent1 = tokens[:index_of_pair_value]
                            sent2 = tokens[index_of_pair_value:]
                            simpler_sentences.append(" ".join(sent1))
                            simpler_sentences.append(" ".join(sent2))
                            BREAK_SENTENCE = True
                            break
                        else:
                            manual_evaluation.append(sentence)
            # Leave the token loop as well once a split has been made.
            if BREAK_SENTENCE:
                break
    return simpler_sentences

def breakSimpleConnective(sentence, manual_evaluation):
    """Try to split ``sentence`` in two at the first usable simple connective.

    Args:
        sentence: Text to examine; it is tokenised on whitespace.
        manual_evaluation: List that is mutated in place. The sentence is
            appended each time a connective at token position greater than 1
            fails the checks, so it may be appended more than once.

    Returns:
        A two-item list ``[first_part, second_part]`` when a split was made
        (the connective starts the second part), otherwise an empty list.
    """
    simpler_sentences = []
    # Tokenize the sentence by splitting it into words
    tokens = sentence.split()
    for i in range(len(tokens)):  # Use range(len(tokens)) to iterate through indices
        token = tokens[i]
        # 'नहीं तो' is a simple connective
        # It spans two tokens, so it is recognised by looking one token ahead;
        # any split still happens at position i.
        if token == 'नहीं':
            following_word = get_word_at_index(tokens, i + 1)
            if following_word == 'तो':
                token = 'नहीं तो'

        # Check if the token is a connective
        if token in CONSTANTS.SIMPLE_CONNECTIVES:
            # These four connectives additionally need the parser's verdict.
            if token == 'और' or token == 'एवं' or token == 'तथा' or token == 'या':
                get_parser_output(sentence)
                token_POS = get_POS_by_index(CONSTANTS.PARSER_OUTPUT, i)
                token_dep = get_dep_by_index(CONSTANTS.PARSER_OUTPUT, i)
                # The parser values are read before the tagger is run; the tagger run is
                # expected to replace the contents of CONSTANTS.PARSER_OUTPUT.
                get_tagger_output(sentence)
                if token_POS == 'CC' and token_dep == 'main' and is_prev_word_verb(CONSTANTS.PARSER_OUTPUT, i - 1):
                    sent1 = tokens[:i]
                    sent2 = tokens[i:]
                    simpler_sentences.append(" ".join(sent1))
                    simpler_sentences.append(" ".join(sent2))
                    break
                # Connectives at position 0 or 1 that fail the checks are not flagged.
                elif i > 1:
                    manual_evaluation.append(sentence)

            else:
                # Every other connective only needs a verb tag on the preceding token.
                get_tagger_output(sentence)
                if is_prev_word_verb(CONSTANTS.PARSER_OUTPUT, i - 1):
                    sent1 = tokens[:i]
                    sent2 = tokens[i:]
                    simpler_sentences.append(" ".join(sent1))
                    simpler_sentences.append(" ".join(sent2))
                    break
                # Connectives at position 0 or 1 that fail the check are not flagged.
                elif i > 1:
                    manual_evaluation.append(sentence)

    return simpler_sentences


def write_input_in_parser_input(file_path, sentence):
    """Overwrite ``file_path`` so that it contains only ``sentence``.

    Args:
        file_path: File to (re)create.
        sentence: Text to store; written as UTF-8 so Devanagari text does not
            depend on the platform's default encoding.
    """
    # Mode 'w' already empties the file and the ``with`` block closes it, so
    # the former explicit truncate() and close() calls were redundant.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(sentence)

def get_tagger_output(sentence):
    """Run the ``isc-tagger`` command line tool on ``sentence``.

    The sentence is written to ``CONSTANTS.PARSER_INPUT``,
    ``CONSTANTS.PARSER_OUTPUT`` is emptied, and the tool is then invoked on
    exactly those two paths. The command used to hard-code
    ``p_parser_input.txt`` / ``p_parser_output.txt``, which only worked while
    the constants happened to name the same files. The tool's exit status is
    not checked.

    Args:
        sentence: Text to tag.
    """
    # Local import: ``shlex`` is not imported at the top of the file.
    import shlex

    write_input_in_parser_input(CONSTANTS.PARSER_INPUT, sentence)
    # Empty the output file first (presumably so that results of an earlier
    # run are not read back if the tool fails).
    with open(CONSTANTS.PARSER_OUTPUT, 'w'):
        pass
    input_arg = shlex.quote(str(CONSTANTS.PARSER_INPUT))
    output_arg = shlex.quote(str(CONSTANTS.PARSER_OUTPUT))
    os.system(f"isc-tagger -i {input_arg} -o {output_arg}")

def get_parser_output(sentence):
    """Run the ``isc-parser`` command line tool on ``sentence``.

    The sentence is written to ``CONSTANTS.PARSER_INPUT``,
    ``CONSTANTS.PARSER_OUTPUT`` is emptied, and the tool is then invoked on
    exactly those two paths. The command used to hard-code
    ``p_parser_input.txt`` / ``p_parser_output.txt``, which only worked while
    the constants happened to name the same files. The tool's exit status is
    not checked.

    Args:
        sentence: Text to parse.
    """
    # Local import: ``shlex`` is not imported at the top of the file.
    import shlex

    write_input_in_parser_input(CONSTANTS.PARSER_INPUT, sentence)
    # Empty the output file first (presumably so that results of an earlier
    # run are not read back if the tool fails).
    with open(CONSTANTS.PARSER_OUTPUT, 'w'):
        pass
    input_arg = shlex.quote(str(CONSTANTS.PARSER_INPUT))
    output_arg = shlex.quote(str(CONSTANTS.PARSER_OUTPUT))
    os.system(f"isc-parser -i {input_arg} -o {output_arg}")

def breakAllPairedConnective(sentence, allPairedConnectiveList, manual_evaluation):
    """Recursively split ``sentence`` on paired connectives.

    Every piece that ``breakPairConnective`` cannot split any further is
    appended to ``allPairedConnectiveList`` in left-to-right order. Nothing
    is returned; the result is delivered through that list.
    """
    pieces = breakPairConnective(sentence, manual_evaluation)
    if pieces:
        for piece in pieces:
            breakAllPairedConnective(piece, allPairedConnectiveList, manual_evaluation)
    else:
        # No split was possible: this piece is final.
        allPairedConnectiveList.append(sentence)

def breakAllSimpleConnective(sentence, allSimpleConnectiveList, manual_evaluation):
    """Recursively split ``sentence`` on simple connectives.

    Every piece that ``breakSimpleConnective`` cannot split any further is
    appended to ``allSimpleConnectiveList`` in left-to-right order. Nothing
    is returned; the result is delivered through that list.
    """
    pieces = breakSimpleConnective(sentence, manual_evaluation)
    if pieces:
        for piece in pieces:
            breakAllSimpleConnective(piece, allSimpleConnectiveList, manual_evaluation)
    else:
        # No split was possible: this piece is final.
        allSimpleConnectiveList.append(sentence)
if __name__ == '__main__':
    # Entry point: read the sentences, split each one, and write the result.
    input_data = read_input(CONSTANTS.INPUT_FILE)
    output_data = {}
    manual_evaluation = []

    for key, value in input_data.items():
        if not validate_sentence(value):
            # Sentences without any alphabetic character are reported as such.
            allSimpleConnectiveList = ['Invalid input']
        else:
            value = sanitize_input(value)
            simpler_sentences = separate_sentences_with_connectives(value)

            # Stage 1: break every piece on paired connectives.
            allPairedConnectiveList = []
            for s in simpler_sentences:
                breakAllPairedConnective(s, allPairedConnectiveList, manual_evaluation)

            # Stage 2: break the stage-1 pieces on simple connectives.
            allSimpleConnectiveList = []
            for s in allPairedConnectiveList:
                breakAllSimpleConnective(s, allSimpleConnectiveList, manual_evaluation)

        output_data[key] = allSimpleConnectiveList

    write_output(output_data, CONSTANTS.OUTPUT_FILE, manual_evaluation)



[OK] : File ~ input.txt
[OK] : File data read.
[OK] : Output file written successfully


In [None]:
import os
import re
import string
import CONSTANTS
import logging
from wxconv import WXC

# Connective words at which sentences are split.
# SIMPLE_CONNECTIVES is a set of connective strings (one entry is two words long);
# an entry written twice in the literal is stored once.
# COMPLEX_CONNECTIVES is a dict from a connective to a list of partner strings;
# only its keys are used by the splitting helpers defined in this cell.
SIMPLE_CONNECTIVES = {'और', 'एवं' ,'इसलिए', 'क्योंकि', 'जबकि' ,'तथा', 'ताकि', 'मगर', 'लेकिन', 'किंतु', 'परंतु', 'फिर',
                      'या', 'तथापि','नहीं तो', 'चूंकि', 'चूँकि', 'वरना','अन्यथा', 'बशर्तें', 'हालाँकि', 'इसीलिये', 'इसीलिए' ,
                      'इसलिए', 'अथवा', 'अतः', 'अर्थात्', 'जब', 'तो'}
COMPLEX_CONNECTIVES = {
    'क्योंकि': ['इसलिए', 'इसके कारण', 'इसलिए', 'क्योंकि'],
    # Add more complex connectives and their alternatives here
}

def log(message, log_type='OK'):
    """Print a log line in the form ``[<log_type>] : <message>``.

    Args:
        message: Text to report.
        log_type: Label placed in the brackets; defaults to 'OK'. The label
            'ERROR' additionally terminates the program.

    Raises:
        SystemExit: When ``log_type`` is 'ERROR'.
    """
    # Imported locally because the surrounding file never imports ``sys``;
    # without it the ERROR path failed with NameError instead of exiting.
    import sys

    print(f'[{log_type}] : {message}')
    if log_type == 'ERROR':
        sys.exit()

def read_input(file_path):
    """Read ``file_path`` and return a ``{sentence_id: sentence}`` dictionary.

    Every non-blank line is split at its first space: the text before it is
    the sentence id, the remainder (stripped) is the sentence. A line that
    holds only an id is stored with an empty sentence instead of raising
    IndexError. When an id occurs more than once the last line wins.

    Args:
        file_path: Path of the UTF-8 encoded input file.

    Returns:
        dict mapping sentence id to sentence text.

    Raises:
        SystemExit: When the file does not exist (after logging an error).
    """
    # Local import: the surrounding file never imports ``sys``.
    import sys

    log(f'File ~ {file_path}')
    input_data = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if not line:
                    continue
                # partition() always yields three parts, so an id-only line
                # produces an empty sentence rather than an IndexError.
                key, _, value = line.partition(' ')
                input_data[key] = value.strip()
            log('File data read.')
    except FileNotFoundError:
        log('No such file found.', 'ERROR')
        sys.exit()
    return input_data

def clean(word):
    """Return ``word`` without ASCII digits and the symbols ``!@#$%^&*()``."""
    unwanted = re.compile(r'[0-9!@#$%^&*()]')
    return unwanted.sub('', word)

def validate_sentence(sentence):
    """Return True when ``sentence`` is non-empty and has at least one alphabetic character."""
    if not sentence:
        return False
    for character in sentence:
        if character.isalpha():
            return True
    return False

def split_sentence(sentence):
    """Split ``sentence`` at every simple or complex connective, keeping the connectives.

    Connectives are matched only as whole words, so a connective that merely
    occurs inside a longer word does not cut that word apart. Longer
    connectives are tried first so that a connective which is a prefix of
    another one cannot shadow it; the order no longer depends on set
    iteration order.

    Args:
        sentence: Text to split.

    Returns:
        list of stripped, non-empty pieces in their original order; the
        matched connectives appear as separate items.
    """
    connectives = SIMPLE_CONNECTIVES | set(COMPLEX_CONNECTIVES)
    if not connectives:
        # An empty alternation would match at every position.
        whole = sentence.strip()
        return [whole] if whole else []

    ordered = sorted(connectives, key=lambda item: (-len(item), item))
    alternatives = '|'.join(re.escape(item) for item in ordered)
    # Characters that count as "inside a word": regex word characters plus the
    # Devanagari block (its vowel signs and marks are not matched by \w),
    # leaving out the danda punctuation at U+0964/U+0965.
    word_char = r'[\w\u0900-\u0963\u0966-\u097F]'
    pattern = f'(?<!{word_char})({alternatives})(?!{word_char})'
    parts = re.split(pattern, sentence)
    return [part.strip() for part in parts if part.strip()]

def sanitize_input(sentence):
    """Strip digits and special characters from ``sentence`` via a WX round trip.

    The text is converted with ``WXC(order="utf2wx")``, every whitespace
    separated word is passed through ``clean``, and the result is converted
    back with ``WXC(order="wx2utf")``. A trailing '.' is replaced by ' ।'.
    """
    wx_text = WXC(order="utf2wx", lang="hin").convert(sentence)
    cleaned_words = [clean(wx_word) for wx_word in wx_text.strip().split()]
    cleaned_wx = " ".join(cleaned_words)
    hindi_text = WXC(order="wx2utf", lang="hin").convert(cleaned_wx).strip()
    if not hindi_text.endswith('.'):
        return hindi_text
    return hindi_text[:-1] + " ।"

def write_output(dictionary, file_path, manual_evaluation):
    """Write every sub-sentence to ``file_path``, one per line.

    Each line has the form ``<key><suffix>  <sentence>  <tag>``. The suffix
    labels the sub-sentences of one key as a, b, ..., z, aa, ab, ... so that
    more than 26 sub-sentences no longer raise IndexError. The tag is
    'Manual evaluation' when the sub-sentence (before a closing poornaviram
    is added) occurs in ``manual_evaluation``, otherwise 'None'.

    Args:
        dictionary: Mapping of sentence id to a list of sub-sentences.
        file_path: Destination file; written as UTF-8 and overwritten.
        manual_evaluation: Collection of sentences flagged for manual review.
    """
    def _suffix(position):
        """Return the letter label for a 1-based position (1 -> 'a', 27 -> 'aa')."""
        letters = []
        while position > 0:
            position, remainder = divmod(position - 1, 26)
            letters.append(string.ascii_lowercase[remainder])
        return ''.join(reversed(letters))

    with open(file_path, 'w', encoding='utf-8') as file:
        for key, values in dictionary.items():
            for position, item in enumerate(values, start=1):
                tag = 'Manual evaluation' if item in manual_evaluation else 'None'
                # Each sub-sentence should end with poornaviram
                if not item.endswith('।'):
                    item += ' ।'
                file.write(f'{key}{_suffix(position)}  {item}  {tag}\n')
    log("Output file written successfully")

def separate_sentences_with_connectives(sentence):
    """Split ``sentence`` at every connective and drop the connectives.

    Connectives are matched only as whole words and in a single left-to-right
    pass. The previous per-connective substring search also matched inside
    longer words, and its overlapping matches could move the cut position
    backwards so that text was emitted twice.

    Args:
        sentence: Text to split.

    Returns:
        list of the stripped, non-empty pieces between connectives, in order.
        A sentence without any connective is returned as a one-item list.
    """
    connectives = SIMPLE_CONNECTIVES | set(COMPLEX_CONNECTIVES)
    if not connectives:
        whole = sentence.strip()
        return [whole] if whole else []

    # Longest first so a connective that is a prefix of another cannot win.
    ordered = sorted(connectives, key=lambda item: (-len(item), item))
    alternatives = '|'.join(re.escape(item) for item in ordered)
    # Characters that count as "inside a word": regex word characters plus the
    # Devanagari block (its vowel signs and marks are not matched by \w),
    # leaving out the danda punctuation at U+0964/U+0965.
    word_char = r'[\w\u0900-\u0963\u0966-\u097F]'
    pattern = f'(?<!{word_char})(?:{alternatives})(?!{word_char})'

    pieces = (piece.strip() for piece in re.split(pattern, sentence))
    return [piece for piece in pieces if piece]

# Helpers for reading the tagger/parser output files and for locating words in token lists.


def is_prev_word_verb(parser_output, index):
    """Report whether line ``index`` of a tagger output file carries a verb tag.

    Callers in this file pass the position of the token that precedes a
    connective; the function assumes line ``index`` of the file describes
    that token (one token per line, tag in the second column).

    Args:
        parser_output: Path of the file to inspect.
        index: Zero-based line number to look at.

    Returns:
        True when the second column of that line is 'VM' or 'VAUX'. False
        otherwise, including when ``index`` is outside the file or the line
        has fewer than two columns (which used to raise IndexError).

    Raises:
        SystemExit: When the file does not exist (after logging an error).
    """
    # Local import: the surrounding file never imports ``sys``.
    import sys

    try:
        with open(parser_output, 'r', encoding='utf-8') as file:
            lines = file.readlines()
    except FileNotFoundError:
        log('No such File found.', 'ERROR')
        sys.exit()

    # Explicit range check: a negative index must not wrap to the file's end.
    if not 0 <= index < len(lines):
        return False
    columns = lines[index].strip().split()
    return len(columns) >= 2 and columns[1] in ('VM', 'VAUX')

def get_index_of_word(words, value):
    """Return the position of the first item of ``words`` equal to ``value``, or -1."""
    for position, word in enumerate(words):
        if word == value:
            return position
    return -1

def get_word_at_index(words, index):
    """Return ``words[index]``, or an empty string when ``index`` is not a valid position."""
    matches = (word for position, word in enumerate(words) if position == index)
    return next(matches, "")

def get_POS_by_index(parser_output, index):
    """Return the fourth column of the parser row whose id is ``index + 1``.

    Only rows with exactly ten whitespace-separated columns are considered.
    An empty string is returned when no such row exists. A missing file is
    logged as an error, which terminates the program.
    """
    try:
        with open(parser_output, 'r') as file:
            for fields in (line.strip().split() for line in file.readlines()):
                if len(fields) == 10 and fields[0] == str(index + 1):
                    return fields[3]
            return ''

    except FileNotFoundError:
        log('No such File found.', 'ERROR')
        sys.exit()

def get_dep_by_index(parser_output, index):
    """Return the eighth column of the parser row whose id is ``index + 1``.

    Only rows with exactly ten whitespace-separated columns are considered.
    An empty string is returned when no such row exists. A missing file is
    logged as an error, which terminates the program.
    """
    try:
        with open(parser_output, 'r') as file:
            for fields in (line.strip().split() for line in file.readlines()):
                if len(fields) == 10 and fields[0] == str(index+1):
                    return fields[7]
            return ''

    except FileNotFoundError:
        log('No such File found.', 'ERROR')
        sys.exit()

def breakPairConnective(sentence, manual_evaluation):
    """Try to split ``sentence`` in two at a paired connective.

    Args:
        sentence: Text to examine; it is tokenised on whitespace.
        manual_evaluation: List that is mutated in place. The sentence is
            appended each time a pair word is found but the token before it
            is not tagged as a verb, so it may be appended more than once.

    Returns:
        A two-item list ``[first_part, second_part]`` when a split was made,
        otherwise an empty list.
    """
    simpler_sentences = []
    BREAK_SENTENCE = False
    # Tokenize the sentence by splitting it into words
    tokens = sentence.split()
    # Iterate through the tokens to find connectives and split the sentence
    for i in range(len(tokens)):
        token = tokens[i]
        # Check if the token is a paired-connective
        if token in CONSTANTS.COMPLEX_CONNECTIVES:
            # The mapping value is iterated as a list of candidate partner strings.
            pair_value_lst = CONSTANTS.COMPLEX_CONNECTIVES[token]
            for pair_value in pair_value_lst:
                # Substring test on the raw sentence, not on the token list.
                if pair_value in sentence:
                    # Only the first word of a multi-word partner is looked up in the tokens.
                    pair_value = pair_value.strip().split()[0]
                    index_of_pair_value = get_index_of_word(tokens, pair_value)
                    if not (index_of_pair_value == -1):
                        # Refresh the tagger output before checking the word ahead of the partner.
                        get_tagger_output(sentence)
                        if is_prev_word_verb(CONSTANTS.PARSER_OUTPUT, index_of_pair_value - 1):
                            # Drop the opening connective, then shift the partner's index by one.
                            # NOTE(review): the shift is only right when the partner comes after
                            # token i - confirm that this always holds.
                            tokens.pop(i)
                            index_of_pair_value = index_of_pair_value - 1
                            # The partner word starts the second part.
                            sent1 = tokens[:index_of_pair_value]
                            sent2 = tokens[index_of_pair_value:]
                            simpler_sentences.append(" ".join(sent1))
                            simpler_sentences.append(" ".join(sent2))
                            BREAK_SENTENCE = True
                            break
                        else:
                            manual_evaluation.append(sentence)
            # Leave the token loop as well once a split has been made.
            if BREAK_SENTENCE:
                break
    return simpler_sentences

def breakSimpleConnective(sentence, manual_evaluation):
    """Try to split ``sentence`` in two at the first usable simple connective.

    Args:
        sentence: Text to examine; it is tokenised on whitespace.
        manual_evaluation: List that is mutated in place. The sentence is
            appended each time a connective at token position greater than 1
            fails the checks, so it may be appended more than once.

    Returns:
        A two-item list ``[first_part, second_part]`` when a split was made
        (the connective starts the second part), otherwise an empty list.
    """
    simpler_sentences = []
    # Tokenize the sentence by splitting it into words
    tokens = sentence.split()
    for i in range(len(tokens)):  # Use range(len(tokens)) to iterate through indices
        token = tokens[i]
        # 'नहीं तो' is a simple connective
        # It spans two tokens, so it is recognised by looking one token ahead;
        # any split still happens at position i.
        if token == 'नहीं':
            following_word = get_word_at_index(tokens, i + 1)
            if following_word == 'तो':
                token = 'नहीं तो'

        # Check if the token is a connective
        if token in CONSTANTS.SIMPLE_CONNECTIVES:
            # These four connectives additionally need the parser's verdict.
            if token == 'और' or token == 'एवं' or token == 'तथा' or token == 'या':
                get_parser_output(sentence)
                token_POS = get_POS_by_index(CONSTANTS.PARSER_OUTPUT, i)
                token_dep = get_dep_by_index(CONSTANTS.PARSER_OUTPUT, i)
                # The parser values are read before the tagger is run; the tagger run is
                # expected to replace the contents of CONSTANTS.PARSER_OUTPUT.
                get_tagger_output(sentence)
                if token_POS == 'CC' and token_dep == 'main' and is_prev_word_verb(CONSTANTS.PARSER_OUTPUT, i - 1):
                    sent1 = tokens[:i]
                    sent2 = tokens[i:]
                    simpler_sentences.append(" ".join(sent1))
                    simpler_sentences.append(" ".join(sent2))
                    break
                # Connectives at position 0 or 1 that fail the checks are not flagged.
                elif i > 1:
                    manual_evaluation.append(sentence)

            else:
                # Every other connective only needs a verb tag on the preceding token.
                get_tagger_output(sentence)
                if is_prev_word_verb(CONSTANTS.PARSER_OUTPUT, i - 1):
                    sent1 = tokens[:i]
                    sent2 = tokens[i:]
                    simpler_sentences.append(" ".join(sent1))
                    simpler_sentences.append(" ".join(sent2))
                    break
                # Connectives at position 0 or 1 that fail the check are not flagged.
                elif i > 1:
                    manual_evaluation.append(sentence)

    return simpler_sentences


def write_input_in_parser_input(file_path, sentence):
    """Overwrite ``file_path`` so that it contains only ``sentence``.

    Args:
        file_path: File to (re)create.
        sentence: Text to store; written as UTF-8 so Devanagari text does not
            depend on the platform's default encoding.
    """
    # Mode 'w' already empties the file and the ``with`` block closes it, so
    # the former explicit truncate() and close() calls were redundant.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(sentence)

def get_tagger_output(sentence):
    """Run the ``isc-tagger`` command line tool on ``sentence``.

    The sentence is written to ``CONSTANTS.PARSER_INPUT``,
    ``CONSTANTS.PARSER_OUTPUT`` is emptied, and the tool is then invoked on
    exactly those two paths. The command used to hard-code
    ``p_parser_input.txt`` / ``p_parser_output.txt``, which only worked while
    the constants happened to name the same files. The tool's exit status is
    not checked.

    Args:
        sentence: Text to tag.
    """
    # Local import: ``shlex`` is not imported at the top of the file.
    import shlex

    write_input_in_parser_input(CONSTANTS.PARSER_INPUT, sentence)
    # Empty the output file first (presumably so that results of an earlier
    # run are not read back if the tool fails).
    with open(CONSTANTS.PARSER_OUTPUT, 'w'):
        pass
    input_arg = shlex.quote(str(CONSTANTS.PARSER_INPUT))
    output_arg = shlex.quote(str(CONSTANTS.PARSER_OUTPUT))
    os.system(f"isc-tagger -i {input_arg} -o {output_arg}")

def get_parser_output(sentence):
    """Run the ``isc-parser`` command line tool on ``sentence``.

    The sentence is written to ``CONSTANTS.PARSER_INPUT``,
    ``CONSTANTS.PARSER_OUTPUT`` is emptied, and the tool is then invoked on
    exactly those two paths. The command used to hard-code
    ``p_parser_input.txt`` / ``p_parser_output.txt``, which only worked while
    the constants happened to name the same files. The tool's exit status is
    not checked.

    Args:
        sentence: Text to parse.
    """
    # Local import: ``shlex`` is not imported at the top of the file.
    import shlex

    write_input_in_parser_input(CONSTANTS.PARSER_INPUT, sentence)
    # Empty the output file first (presumably so that results of an earlier
    # run are not read back if the tool fails).
    with open(CONSTANTS.PARSER_OUTPUT, 'w'):
        pass
    input_arg = shlex.quote(str(CONSTANTS.PARSER_INPUT))
    output_arg = shlex.quote(str(CONSTANTS.PARSER_OUTPUT))
    os.system(f"isc-parser -i {input_arg} -o {output_arg}")

def breakAllPairedConnective(sentence, allPairedConnectiveList, manual_evaluation):
    """Recursively split ``sentence`` on paired connectives.

    Every piece that ``breakPairConnective`` cannot split any further is
    appended to ``allPairedConnectiveList`` in left-to-right order. Nothing
    is returned; the result is delivered through that list.
    """
    pieces = breakPairConnective(sentence, manual_evaluation)
    if pieces:
        for piece in pieces:
            breakAllPairedConnective(piece, allPairedConnectiveList, manual_evaluation)
    else:
        # No split was possible: this piece is final.
        allPairedConnectiveList.append(sentence)

def breakAllSimpleConnective(sentence, allSimpleConnectiveList, manual_evaluation):
    """Recursively split ``sentence`` on simple connectives.

    Every piece that ``breakSimpleConnective`` cannot split any further is
    appended to ``allSimpleConnectiveList`` in left-to-right order. Nothing
    is returned; the result is delivered through that list.
    """
    pieces = breakSimpleConnective(sentence, manual_evaluation)
    if pieces:
        for piece in pieces:
            breakAllSimpleConnective(piece, allSimpleConnectiveList, manual_evaluation)
    else:
        # No split was possible: this piece is final.
        allSimpleConnectiveList.append(sentence)
if __name__ == '__main__':
    # Entry point: read the sentences, split each one, and write the result.
    input_data = read_input(CONSTANTS.INPUT_FILE)
    output_data = {}
    manual_evaluation = []

    for key, value in input_data.items():
        if not validate_sentence(value):
            # Sentences without any alphabetic character are reported as such.
            allSimpleConnectiveList = ['Invalid input']
        else:
            value = sanitize_input(value)
            simpler_sentences = separate_sentences_with_connectives(value)

            # Stage 1: break every piece on paired connectives.
            allPairedConnectiveList = []
            for s in simpler_sentences:
                breakAllPairedConnective(s, allPairedConnectiveList, manual_evaluation)

            # Stage 2: break the stage-1 pieces on simple connectives.
            allSimpleConnectiveList = []
            for s in allPairedConnectiveList:
                breakAllSimpleConnective(s, allSimpleConnectiveList, manual_evaluation)

        output_data[key] = allSimpleConnectiveList

    write_output(output_data, CONSTANTS.OUTPUT_FILE, manual_evaluation)





[OK] : File ~ input.txt
[OK] : File data read.
[OK] : Output file written successfully
