**Cornell Movie Dialogs Corpus: sentences of 20 words or fewer**

In [11]:
# File paths
# NOTE: hard-coded absolute Windows paths; edit both to point at your local copy of the corpus.
# They are consumed later by load_movie_lines() and load_movie_conversations().
movie_lines_file = r"C:\Users\avikd\Downloads\Compressed\archive_2\movie_lines.txt"  # Update with your actual path
movie_conversations_file = r"C:\Users\avikd\Downloads\Compressed\archive_2\movie_conversations.txt"  # Update with your actual path

In [18]:
import re

# Word list with intended meanings.
# The keys are the words the filters search for (whole-word match against the
# lower-cased sentence). The meaning strings are descriptive only: nothing in
# this section reads them when matching.
target_words = {
    "bat": "a flying mammal or a sports bat",
    "cup": "a vessel used for drinking",
    "drop": "a small amount of liquid or to let something fall",
    "eat": "to consume food",
    "fish": "an aquatic animal",
    "hot": "something with high temperature or spicy flavor",
    "jump": "to leap into the air",
    "milk": "a dairy product",
    "pen": "a tool for writing",
    "red": "a primary color"
}

# Load movie_lines.txt
def load_movie_lines(file_path):
    """Read the movie-lines file into a ``{line_id: utterance_text}`` dict.

    Every line is split on the ``" +++$+++ "`` separator. Only lines that
    produce exactly five fields are kept: the first field becomes the key and
    the last field, stripped of surrounding whitespace, becomes the value.
    If a line ID occurs more than once, the last occurrence wins.

    Args:
        file_path: Path of the file to read (decoded as ISO-8859-1).

    Returns:
        dict mapping line ID to utterance text.
    """
    with open(file_path, 'r', encoding='iso-8859-1') as handle:
        records = (raw.split(" +++$+++ ") for raw in handle)
        return {
            fields[0]: fields[-1].strip()
            for fields in records
            if len(fields) == 5
        }

# Load movie_conversations.txt
def load_movie_conversations(file_path):
    """Read the conversations file into a list of line-ID lists.

    Every line is split on the ``" +++$+++ "`` separator; lines that do not
    produce exactly four fields are ignored. The last field is expected to be
    a Python list literal such as ``['L194', 'L195']``.

    The literal is parsed with ``ast.literal_eval`` rather than ``eval`` so
    that content in the data file can never be executed as code.

    Args:
        file_path: Path of the file to read (decoded as ISO-8859-1).

    Returns:
        list with one parsed value (normally a list of line IDs) per
        conversation, in file order.

    Raises:
        ValueError, SyntaxError: if the last field is not a valid Python literal.
    """
    # Function-scope import keeps this cell independent of the notebook's import cells.
    import ast

    conversations = []
    with open(file_path, 'r', encoding='iso-8859-1') as f:
        for line in f:
            parts = line.split(" +++$+++ ")
            if len(parts) == 4:
                # Strip the trailing newline before parsing the literal.
                utterance_ids = ast.literal_eval(parts[-1].strip())  # List of line IDs
                conversations.append(utterance_ids)
    return conversations

# Reconstruct conversations using movie_lines.txt
def reconstruct_conversations(movie_lines, conversations):
    """Replace line IDs with their utterance text, conversation by conversation.

    IDs that are missing from ``movie_lines`` are skipped, and a conversation
    in which no ID could be resolved is left out of the result.

    Args:
        movie_lines: Mapping of line ID to utterance text.
        conversations: Iterable of line-ID sequences.

    Returns:
        list of non-empty lists of utterance texts.
    """
    resolved = (
        [movie_lines[line_id] for line_id in convo if line_id in movie_lines]
        for convo in conversations
    )
    return [utterances for utterances in resolved if utterances]

# Function to split multi-sentence text and retain only relevant parts
def extract_relevant_sentence(text, target_words):
    """Return the sentences of ``text`` that mention any target word.

    ``text`` is split into sentences that end in ``.``, ``!`` or ``?``
    (punctuation kept). A trailing piece with no closing punctuation is
    treated as a sentence too; previously it was silently discarded, so an
    utterance such as ``"I want some milk"`` could never match.

    A sentence is relevant when any target word occurs in its lower-cased form
    as a whole word. Target words are matched literally (regex metacharacters
    are escaped).

    Args:
        text: Utterance text, possibly containing several sentences.
        target_words: Iterable of words to look for (expected in lower case).

    Returns:
        The relevant sentences, each stripped, joined by single spaces;
        an empty string when nothing matches.
    """
    # Split the text into individual sentences, keeping the punctuation
    sentences = re.findall(r'[^.!?]*[.!?]', text)

    # The matches are contiguous from the start of the text, so whatever
    # follows them is the unterminated tail.
    tail = text[sum(len(sentence) for sentence in sentences):]
    if tail.strip():
        sentences.append(tail)

    words = list(target_words)
    if not words:
        return ""

    # One alternation covers all words: a sentence is kept once if any word matches.
    any_word = re.compile(r"\b(?:" + "|".join(re.escape(word) for word in words) + r")\b")
    relevant_sentences = [
        sentence.strip() for sentence in sentences if any_word.search(sentence.lower())
    ]

    # Join the relevant sentences back together (if any)
    return " ".join(relevant_sentences).strip()

# Update filter_conversations function
def filter_conversations(conversations, target_words, max_words=20):
    """Bucket short, relevant utterance excerpts by target word.

    For every utterance, ``extract_relevant_sentence`` picks out the sentences
    that mention a target word. Excerpts that are empty or longer than
    ``max_words`` whitespace-separated tokens are skipped. The remaining
    excerpt is filed under every target word it contains, as a question when
    it ends with ``?`` and as a statement otherwise.

    Args:
        conversations: Iterable of conversations, each an iterable of utterance strings.
        target_words: Dict whose keys are the words to look for.
        max_words: Maximum number of tokens allowed in an excerpt.

    Returns:
        dict of the form ``{word: {'statements': [...], 'questions': [...]}}``.
    """
    buckets = {}
    for key in target_words.keys():
        buckets[key] = {'statements': [], 'questions': []}

    utterances = (utterance for convo in conversations for utterance in convo)
    for utterance in utterances:
        snippet = extract_relevant_sentence(utterance, target_words.keys())

        # Nothing relevant, or too long to keep.
        if not snippet or len(snippet.split()) > max_words:
            continue

        kind = 'questions' if snippet.endswith('?') else 'statements'
        lowered = snippet.lower()
        for key in target_words:
            if re.search(rf"\b{key}\b", lowered):
                buckets[key][kind].append(snippet)
    return buckets

# # File paths
# movie_lines_file = "movie_lines.txt"  # Replace with your actual file path
# movie_conversations_file = "movie_conversations.txt"  # Replace with your actual file path

# Read both corpus files
print("Loading data...")
movie_lines = load_movie_lines(movie_lines_file)
movie_conversations = load_movie_conversations(movie_conversations_file)

# Turn the line-ID lists into lists of utterance texts
print("Reconstructing conversations...")
reconstructed_conversations = reconstruct_conversations(movie_lines, movie_conversations)

# Keep only short excerpts that mention a target word
print("Filtering sentences...")
filtered_sentences = filter_conversations(reconstructed_conversations, target_words)

# Write the per-word report as UTF-8 text
output_file = "filtered_movie_sentences.txt"
with open(output_file, 'w', encoding='utf-8') as f:
    for word, data in filtered_sentences.items():
        f.write(f"Word: {word}\n")
        f.write("Statements:\n")
        f.writelines(f"- {statement}\n" for statement in data['statements'])
        f.write("\nQuestions:\n")
        f.writelines(f"- {question}\n" for question in data['questions'])
        f.write("\n")

print(f"Filtered sentences saved to '{output_file}'.")

Loading data...
Reconstructing conversations...
Filtering sentences...
Filtered sentences saved to 'filtered_movie_sentences.txt'.


**for DailyDialog**

In [43]:
# File paths
# NOTE: hard-coded absolute Windows paths to the DailyDialog CSV splits; adjust for your machine.
# The same three assignments are repeated inside the processing cell below.
train_path = r'C:\Users\avikd\Downloads\Compressed\archive_3\train.csv'
validation_path = r'C:\Users\avikd\Downloads\Compressed\archive_3\validation.csv'
test_path = r'C:\Users\avikd\Downloads\Compressed\archive_3\test.csv'

In [49]:
import pandas as pd
import re

# Define target words with intended meanings.
# Same mapping as in the Cornell section. Only the keys are used for matching
# in this cell; the meaning strings are descriptive.
target_words = {
    "bat": "a flying mammal or a sports bat",
    "cup": "a vessel used for drinking",
    "drop": "a small amount of liquid or to let something fall",
    "eat": "to consume food",
    "fish": "an aquatic animal",
    "hot": "something with high temperature or spicy flavor",
    "jump": "to leap into the air",
    "milk": "a dairy product",
    "pen": "a tool for writing",
    "red": "a primary color"
}

# Function to clean and format sentences/questions
def clean_sentence(sentence):
    """Tidy one sentence fragment and reject fragments with no content.

    The input is stripped, then any run of brackets, quotes and spaces is
    removed from its start and from its end, and finally every whitespace run
    is collapsed to a single space.

    Args:
        sentence: Raw sentence text.

    Returns:
        The cleaned sentence, or ``None`` when the result is empty or is just
        one of the characters ``'``, ``"``, ``-``, ``[``, ``]``.
    """
    text = sentence.strip()
    for pattern, replacement in (
        (r"^[\[\]'\" ]+", ""),   # leading brackets, quotes, and spaces
        (r"[\[\]'\" ]+$", ""),   # trailing brackets, quotes, and spaces
        (r"\s+", " "),           # standardize whitespace
    ):
        text = re.sub(pattern, replacement, text)

    return None if (not text or text in ["'", '"', "-", "[", "]"]) else text

# Function to filter sentences/questions based on target words and length
def filter_sentences(dialogs, target_words, max_words=20):
    """Collect short sentences that mention a target word, grouped by word.

    Each dialog string is cut into punctuation-terminated sentences, each
    sentence is passed through ``clean_sentence``, and sentences that are
    rejected by the cleaner or exceed ``max_words`` tokens are skipped. A
    surviving sentence is filed under every target word it contains, as a
    question when it ends with ``?`` and as a statement otherwise.

    Args:
        dialogs: Iterable of dialog strings.
        target_words: Dict whose keys are the words to look for.
        max_words: Maximum number of whitespace-separated tokens per sentence.

    Returns:
        dict of the form ``{word: {'statements': [...], 'questions': [...]}}``.
    """
    buckets = {key: {'statements': [], 'questions': []} for key in target_words.keys()}

    for dialog in dialogs:
        # Sentence split that keeps the closing punctuation
        for raw in re.findall(r'[^.!?]*[.!?]', dialog):
            cleaned = clean_sentence(raw)
            if not cleaned or len(cleaned.split()) > max_words:
                continue  # invalid, empty, or too long

            kind = 'questions' if cleaned.endswith('?') else 'statements'
            lowered = cleaned.lower()
            for key in target_words.keys():
                if re.search(rf"\b{key}\b", lowered):
                    buckets[key][kind].append(cleaned)

    return buckets

# Load and process each file
def process_csv(file_path):
    """Load a header-less UTF-8 CSV file and return its first column.

    Args:
        file_path: Path of the CSV file.

    Returns:
        pandas Series holding the first column (assumed to contain the dialogs).
    """
    return pd.read_csv(file_path, header=None, encoding='utf-8').iloc[:, 0]

# Combine results from all datasets
def combine_results(*datasets):
    """Concatenate several per-word result dicts into one.

    The words are taken from the module-level ``target_words`` mapping, and
    every dataset must contain an entry for each of them.

    Args:
        *datasets: Result dicts of the form
            ``{word: {'statements': [...], 'questions': [...]}}``.

    Returns:
        A new dict of the same form in which each list is the concatenation,
        in argument order, of the corresponding lists of all datasets.
    """
    merged = {}
    for key in target_words.keys():
        merged[key] = {
            'statements': [s for dataset in datasets for s in dataset[key]['statements']],
            'questions': [q for dataset in datasets for q in dataset[key]['questions']],
        }
    return merged

# File paths (same values as in the configuration cell above)
train_path = r'C:\Users\avikd\Downloads\Compressed\archive_3\train.csv'
validation_path = r'C:\Users\avikd\Downloads\Compressed\archive_3\validation.csv'
test_path = r'C:\Users\avikd\Downloads\Compressed\archive_3\test.csv'

# Read the dialog column of each split
print("Processing datasets...")
train_dialogs = process_csv(train_path)
validation_dialogs = process_csv(validation_path)
test_dialogs = process_csv(test_path)

# Extract short sentences that mention a target word
print("Filtering dialogs...")
filtered_train = filter_sentences(train_dialogs, target_words, max_words=20)
filtered_validation = filter_sentences(validation_dialogs, target_words, max_words=20)
filtered_test = filter_sentences(test_dialogs, target_words, max_words=20)

# Merge the three splits
print("Combining results...")
combined_results = combine_results(filtered_train, filtered_validation, filtered_test)

# Write the per-word report; every entry is cleaned once more and dropped if it becomes invalid
output_file = "filtered_daily_dialog.txt"
with open(output_file, 'w', encoding='utf-8') as f:
    for word, data in combined_results.items():
        f.write(f"Word: {word}\n")
        f.write("Statements:\n")
        for cleaned_statement in map(clean_sentence, data['statements']):
            if cleaned_statement:
                f.write(f"- {cleaned_statement}\n")  # Single dash followed by one space
        f.write("\nQuestions:\n")
        for cleaned_question in map(clean_sentence, data['questions']):
            if cleaned_question:
                f.write(f"- {cleaned_question}\n")  # Single dash followed by one space
        f.write("\n")

print(f"Filtered dialogs saved to '{output_file}'.")


Processing datasets...
Filtering dialogs...
Combining results...
Filtered dialogs saved to 'filtered_daily_dialog.txt'.


**for QuAC (Question Answering in Context) 20 words or less**

In [23]:
# File paths for train and validation JSON files
# NOTE: hard-coded absolute Windows paths; adjust for your machine.
train_file_path = r'C:\Users\avikd\Downloads\Compressed\archive_4\train_v2.json'
validation_file_path = r'C:\Users\avikd\Downloads\Compressed\archive_4\val_v2.json'

In [26]:
import json
import re

# Define the target words with their intended meanings.
# Same mapping as in the earlier sections. Only the keys are used for matching
# in this cell; the meaning strings are descriptive.
target_words = {
    "bat": "a flying mammal or a sports bat",
    "cup": "a vessel used for drinking",
    "drop": "a small amount of liquid or to let something fall",
    "eat": "to consume food",
    "fish": "an aquatic animal",
    "hot": "something with high temperature or spicy flavor",
    "jump": "to leap into the air",
    "milk": "a dairy product",
    "pen": "a tool for writing",
    "red": "a primary color"
}

# Function to extract relevant sentences containing target words
def extract_relevant_sentences(text, target_words, max_words=20):
    """List the short sentences of ``text`` that mention any target word.

    ``text`` is split into sentences ending in ``.``, ``!`` or ``?``
    (punctuation kept). A trailing piece without closing punctuation is
    treated as a sentence as well; previously it was silently discarded.
    Sentences with more than ``max_words`` whitespace-separated tokens are
    skipped. A sentence is relevant when any target word occurs in its
    lower-cased form as a whole word; words are matched literally.

    Args:
        text: Text to scan.
        target_words: Dict keyed by target word, or any iterable of words
            (expected in lower case).
        max_words: Maximum number of tokens allowed in a sentence.

    Returns:
        list of stripped sentences, each included once, in text order.
    """
    sentences = re.findall(r'[^.!?]*[.!?]', text)  # Split text into individual sentences

    # The matches are contiguous from the start of the text, so whatever
    # follows them is the unterminated tail.
    tail = text[sum(len(sentence) for sentence in sentences):]
    if tail.strip():
        sentences.append(tail)

    words = list(target_words)  # iterating a dict yields its keys
    if not words:
        return []

    # One alternation covers all words, so each sentence is included at most once.
    any_word = re.compile(r"\b(?:" + "|".join(re.escape(word) for word in words) + r")\b")
    return [
        sentence.strip()
        for sentence in sentences
        if len(sentence.split()) <= max_words and any_word.search(sentence.lower())
    ]

# Function to process a QuAC JSON file and extract relevant sentences/questions
def process_quac_file(file_path, target_words, max_words=20):
    """Extract short target-word sentences from one QuAC JSON file.

    The file is read as JSON and walked as ``data -> paragraphs``. Relevant
    sentences from each paragraph's ``context`` are filed per matching word,
    as questions when they end with ``?`` and as statements otherwise.
    Relevant sentences from each ``qas[*].question`` are always filed as
    questions.

    Args:
        file_path: Path of the QuAC JSON file (UTF-8).
        target_words: Dict whose keys are the words to look for.
        max_words: Maximum number of tokens allowed in a sentence.

    Returns:
        dict of the form ``{word: {'statements': [...], 'questions': [...]}}``.
    """
    buckets = {key: {'statements': [], 'questions': []} for key in target_words.keys()}
    with open(file_path, 'r', encoding='utf-8') as f:
        payload = json.load(f)

    def matching_words(sentence):
        """Target words that occur as whole words in the lower-cased sentence."""
        lowered = sentence.lower()
        return [key for key in target_words.keys() if re.search(rf"\b{key}\b", lowered)]

    paragraphs = (paragraph for entry in payload['data'] for paragraph in entry['paragraphs'])
    for paragraph in paragraphs:
        # Sentences from the context passage
        for sentence in extract_relevant_sentences(paragraph['context'], target_words, max_words):
            kind = 'questions' if sentence.endswith('?') else 'statements'
            for key in matching_words(sentence):
                buckets[key][kind].append(sentence)

        # Sentences from the questions asked about the passage
        for qa in paragraph['qas']:
            for sentence in extract_relevant_sentences(qa['question'], target_words, max_words):
                for key in matching_words(sentence):
                    buckets[key]['questions'].append(sentence)

    return buckets

# Combine results from train and validation files into one
def combine_results(train_file, validation_file, target_words, max_words=20):
    """Process the train and validation QuAC files and merge their results.

    Args:
        train_file: Path of the training JSON file.
        validation_file: Path of the validation JSON file.
        target_words: Dict whose keys are the words to look for.
        max_words: Maximum number of tokens allowed in a sentence.

    Returns:
        dict of the form ``{word: {'statements': [...], 'questions': [...]}}``
        in which training entries come before validation entries.
    """
    per_split = []
    for message, path in (
        ("Processing training data...", train_file),
        ("Processing validation data...", validation_file),
    ):
        print(message)
        per_split.append(process_quac_file(path, target_words, max_words))
    train_results, validation_results = per_split

    return {
        key: {
            'statements': train_results[key]['statements'] + validation_results[key]['statements'],
            'questions': train_results[key]['questions'] + validation_results[key]['questions'],
        }
        for key in target_words.keys()
    }

# File paths for train and validation JSON files
# train_file_path = '/path/to/train_v0.2.json'
# validation_file_path = '/path/to/val_v0.2.json'

# Run the extraction over both QuAC files and merge the results
print("Combining results from train and validation...")
combined_results = combine_results(train_file_path, validation_file_path, target_words)

# Write the merged per-word report to one text file
output_file = "filtered_quac_combined_20words_or_less.txt"
with open(output_file, 'w', encoding='utf-8') as f:
    for word, data in combined_results.items():
        f.write(f"Word: {word}\n")
        f.write("Statements:\n")
        f.writelines(f"- {statement}\n" for statement in data['statements'])
        f.write("\nQuestions:\n")
        f.writelines(f"- {question}\n" for question in data['questions'])
        f.write("\n")

print(f"Filtered results saved to '{output_file}'.")


Combining results from train and validation...
Processing training data...
Processing validation data...
Filtered results saved to 'filtered_quac_combined_20words_or_less.txt'.


**for Persona-Chat 20 words or less**

In [29]:
from datasets import load_dataset
import re

# Define the target words with their intended meanings.
# Same mapping as in the earlier sections. Only the keys are used for matching
# in this cell; the meaning strings are descriptive.
target_words = {
    "bat": "a flying mammal or a sports bat",
    "cup": "a vessel used for drinking",
    "drop": "a small amount of liquid or to let something fall",
    "eat": "to consume food",
    "fish": "an aquatic animal",
    "hot": "something with high temperature or spicy flavor",
    "jump": "to leap into the air",
    "milk": "a dairy product",
    "pen": "a tool for writing",
    "red": "a primary color"
}

# Function to extract relevant sentences containing target words
def extract_relevant_sentences(text, target_words, max_words=20):
    """List the short sentences of ``text`` that mention any target word.

    Non-string input yields an empty list. Otherwise ``text`` is split into
    sentences ending in ``.``, ``!`` or ``?`` (punctuation kept). A trailing
    piece without closing punctuation is treated as a sentence as well;
    previously it was silently discarded. Sentences with more than
    ``max_words`` whitespace-separated tokens are skipped. A sentence is
    relevant when any target word occurs in its lower-cased form as a whole
    word; words are matched literally.

    Args:
        text: Text to scan; anything that is not a ``str`` is ignored.
        target_words: Dict keyed by target word, or any iterable of words
            (expected in lower case).
        max_words: Maximum number of tokens allowed in a sentence.

    Returns:
        list of stripped sentences, each included once, in text order.
    """
    if not isinstance(text, str):
        return []  # Skip non-string entries

    sentences = re.findall(r'[^.!?]*[.!?]', text)  # Split text into individual sentences

    # The matches are contiguous from the start of the text, so whatever
    # follows them is the unterminated tail.
    tail = text[sum(len(sentence) for sentence in sentences):]
    if tail.strip():
        sentences.append(tail)

    words = list(target_words)  # iterating a dict yields its keys
    if not words:
        return []

    # One alternation covers all words, so each sentence is included at most once.
    any_word = re.compile(r"\b(?:" + "|".join(re.escape(word) for word in words) + r")\b")
    return [
        sentence.strip()
        for sentence in sentences
        if len(sentence.split()) <= max_words and any_word.search(sentence.lower())
    ]

# Process a Persona-Chat dataset split and extract relevant data
def process_persona_chat(dataset_split, target_words, max_words=20):
    """Extract short target-word sentences from one Persona-Chat split.

    For every dialog, the ``personality`` entries (when that field is a list)
    and the ``text`` of every entry in ``utterances`` (when that field is a
    list and the entry has a ``text`` key) are scanned. Each relevant sentence
    is filed under every target word it contains, as a question when it ends
    with ``?`` and as a statement otherwise.

    NOTE(review): confirm that the utterance records of the loaded dataset
    actually carry a ``text`` key; if they do not, only the persona
    descriptions contribute.

    Args:
        dataset_split: Iterable of dialog records (dict-like).
        target_words: Dict whose keys are the words to look for.
        max_words: Maximum number of tokens allowed in a sentence.

    Returns:
        dict of the form ``{word: {'statements': [...], 'questions': [...]}}``.
    """
    buckets = {key: {'statements': [], 'questions': []} for key in target_words.keys()}

    for dialog in dataset_split:
        texts = []
        if 'personality' in dialog:
            persona = dialog['personality']
            if isinstance(persona, list):
                texts += persona
        if 'utterances' in dialog:
            turns = dialog['utterances']
            if isinstance(turns, list):
                texts += [turn['text'] for turn in turns if 'text' in turn]

        for text in texts:
            for sentence in extract_relevant_sentences(text, target_words, max_words):
                kind = 'questions' if sentence.endswith('?') else 'statements'
                lowered = sentence.lower()
                for key in target_words.keys():
                    if re.search(rf"\b{key}\b", lowered):
                        buckets[key][kind].append(sentence)
    return buckets

# Fetch the Persona-Chat dataset from Hugging Face
print("Downloading the Persona-Chat dataset...")
dataset = load_dataset('AlekseyKorshuk/persona-chat')

# Run the extraction on the train and validation splits
print("Processing training data...")
train_results = process_persona_chat(dataset['train'], target_words)

print("Processing validation data...")
validation_results = process_persona_chat(dataset['validation'], target_words)

# Merge the two splits, training entries first
combined_results = {word: {'statements': [], 'questions': []} for word in target_words.keys()}
for word in target_words.keys():
    for kind in ('statements', 'questions'):
        combined_results[word][kind] = train_results[word][kind] + validation_results[word][kind]

# Write the merged per-word report to one text file
output_file = "filtered_persona_chat.txt"
with open(output_file, 'w', encoding='utf-8') as f:
    for word, data in combined_results.items():
        f.write(f"Word: {word}\n")
        f.write("Statements:\n")
        f.writelines(f"- {statement}\n" for statement in data['statements'])
        f.write("\nQuestions:\n")
        f.writelines(f"- {question}\n" for question in data['questions'])
        f.write("\n")

print(f"Filtered results saved to '{output_file}'.")


Downloading the Persona-Chat dataset...
Processing training data...
Processing validation data...
Filtered results saved to 'filtered_persona_chat.txt'.


In [1]:
import pandas as pd
import os

# Input and output file paths
input_file = r"D:\PycharmProjects\pro_dis_2\data_4_txt_gen\data_2nd\2_csv_data\4_Persona_Chat.csv"  # Replace with your input file path
output_file = r"D:\PycharmProjects\pro_dis_2\data_4_txt_gen\data_2nd\2_csv_data\output.csv"  # Replace with your desired output file path

# Read the CSV into a DataFrame
data = pd.read_csv(input_file)

# The second column is expected to hold the sentences/questions
if len(data.columns) >= 2:
    word_column, sentence_column = data.columns[0], data.columns[1]

    # Upper-case the first character of every string entry; non-string values pass through untouched
    data[sentence_column] = data[sentence_column].apply(
        lambda value: value if not isinstance(value, str) else value[0].upper() + value[1:]
    )

    # Write the result next to the input
    data.to_csv(output_file, index=False)
    print(f"Processed file saved to {output_file}.")
else:
    print("The CSV file does not have enough columns.")

Processed file saved to D:\PycharmProjects\pro_dis_2\data_4_txt_gen\data_2nd\2_csv_data\output.csv.


**OpenSubtitles: sentences of 20 words or fewer**

In [31]:
# File path to the en_txt file
# NOTE: hard-coded absolute Windows path; adjust for your machine.
file_path = r'C:\Users\avikd\Downloads\Compressed\en_txt\en.txt'

In [32]:
import re

# Define the target words with their intended meanings.
# Same mapping as in the earlier sections. Only the keys are used for matching
# in this cell; the meaning strings are descriptive.
target_words = {
    "bat": "a flying mammal or a sports bat",
    "cup": "a vessel used for drinking",
    "drop": "a small amount of liquid or to let something fall",
    "eat": "to consume food",
    "fish": "an aquatic animal",
    "hot": "something with high temperature or spicy flavor",
    "jump": "to leap into the air",
    "milk": "a dairy product",
    "pen": "a tool for writing",
    "red": "a primary color"
}

# Function to extract relevant sentences containing target words
def extract_relevant_sentences(file_path, target_words, max_words=20):
    """Scan a subtitle text file for short sentences that mention a target word.

    Every non-blank line is split into sentences ending in ``.``, ``!`` or
    ``?`` (punctuation kept). A trailing piece without closing punctuation is
    treated as a sentence too; previously such text, including whole lines
    with no punctuation at all, was silently dropped. Sentences with more than
    ``max_words`` whitespace-separated tokens are skipped.

    Each remaining sentence is filed under the FIRST target word (in
    ``target_words`` order) that occurs in its lower-cased form as a whole
    word, as a question when it ends with ``?`` and as a statement otherwise.
    Words are matched literally.

    Args:
        file_path: Path of the UTF-8 text file to read.
        target_words: Dict whose keys are the words to look for.
        max_words: Maximum number of tokens allowed in a sentence.

    Returns:
        dict of the form ``{word: {'statements': [...], 'questions': [...]}}``.
    """
    filtered_sentences = {word: {'statements': [], 'questions': []} for word in target_words.keys()}

    # Compile once: the same patterns are applied to every sentence of the file.
    splitter = re.compile(r'[^.!?]*[.!?]')
    word_patterns = [
        (word, re.compile(rf"\b{re.escape(word)}\b")) for word in target_words.keys()
    ]

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Strip whitespace and skip empty lines
            line = line.strip()
            if not line:
                continue

            # Split text into sentences
            sentences = splitter.findall(line)
            # The matches are contiguous from the start of the line, so the
            # rest is an unterminated tail.
            tail = line[sum(len(sentence) for sentence in sentences):]
            if tail.strip():
                sentences.append(tail)

            for sentence in sentences:
                # Skip sentences longer than max_words
                if len(sentence.split()) > max_words:
                    continue

                lowered = sentence.lower()
                kind = 'questions' if sentence.endswith('?') else 'statements'
                for word, pattern in word_patterns:
                    if pattern.search(lowered):
                        filtered_sentences[word][kind].append(sentence.strip())
                        break  # Avoid duplicating the sentence for multiple word matches
    return filtered_sentences

# # File path to the en_txt file
# file_path = '/path/to/en_txt'

# Scan the subtitle file for target-word sentences
print("Processing the subtitles file...")
filtered_data = extract_relevant_sentences(file_path, target_words)

# Write the per-word report to a text file
output_file = "filtered_open_subtitles.txt"
with open(output_file, 'w', encoding='utf-8') as f:
    for word, data in filtered_data.items():
        f.write(f"Word: {word}\n")
        f.write("Statements:\n")
        f.writelines(f"- {statement}\n" for statement in data['statements'])
        f.write("\nQuestions:\n")
        f.writelines(f"- {question}\n" for question in data['questions'])
        f.write("\n")

print(f"Filtered results saved to '{output_file}'.")


Processing the subtitles file...
Filtered results saved to 'filtered_open_subtitles.txt'.


**converting extracted text files into csv**

In [51]:
# Folder paths
# NOTE: hard-coded absolute Windows paths; adjust for your machine.
# The text reports are read from input_folder; CSV files and the skipped-lines log go to output_folder.
input_folder = r"D:\PycharmProjects\pro_dis_2\data_4_txt_gen\1_raw_data"  # Replace with the folder containing your .txt files
output_folder = r"D:\PycharmProjects\pro_dis_2\data_4_txt_gen\2_csv_data"  # Replace with your desired output folder for .csv files

In [52]:
import os
import csv

# Function to parse a text file and prepare rows for CSV
def convert_text_to_csv(input_file):
    """Parse a per-word text report into CSV rows.

    A line starting with ``Word:`` sets the current word. While a non-empty
    current word is set, a line starting with ``-`` contributes the row
    ``[current_word, sentence]``. Blank lines are ignored. Every other
    non-empty line is recorded as skipped, including an empty ``Word:``
    header, an empty ``-`` entry, and a ``-`` entry that appears while no
    current word is set.

    Args:
        input_file: Path of the UTF-8 text report.

    Returns:
        ``(rows, skipped_lines)`` where ``rows`` is a list of
        ``[word, sentence]`` lists and ``skipped_lines`` is a list of
        ``(line_number, stripped_line)`` tuples with 1-based line numbers.
    """
    rows, skipped_lines = [], []
    current_word = None

    with open(input_file, 'r', encoding='utf-8') as f:
        for line_number, raw in enumerate(f, start=1):
            line = raw.strip()

            # Header line: switch to a new word
            if line.startswith("Word:"):
                current_word = line.replace("Word:", "").strip()
                if not current_word:
                    skipped_lines.append((line_number, line))  # empty word
                continue

            # Entry line under a known word
            if current_word and line.startswith("-"):
                sentence = line[1:].strip()
                if sentence:
                    rows.append([current_word, sentence])
                else:
                    skipped_lines.append((line_number, line))  # empty sentence
                continue

            # Anything else that is not blank is unexpected
            if line:
                skipped_lines.append((line_number, line))

    return rows, skipped_lines

# Function to process all text files in a folder and save as CSV
def process_folder_to_csv(input_folder, output_folder, log_file="skipped_lines.log"):
    """Convert every ``.txt`` report in a folder to a two-column CSV file.

    For each ``*.txt`` file in ``input_folder`` a CSV with the same base name
    is written to ``output_folder`` (header ``Word, Sentence/Question``).
    Lines that ``convert_text_to_csv`` could not interpret are appended to a
    log file in ``output_folder``; the log is recreated on every call.

    The output name is derived by swapping only the file extension, so a name
    that contains ``.txt`` elsewhere (e.g. ``a.txt.bak.txt``) keeps the rest
    of its name intact.

    Args:
        input_folder: Folder containing the text reports.
        output_folder: Folder for the CSV files and the log; created if missing.
        log_file: File name of the skipped-lines log inside ``output_folder``.
    """
    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Start a fresh log file with its header
    log_path = os.path.join(output_folder, log_file)
    with open(log_path, 'w', encoding='utf-8') as log:
        log.write("Skipped Lines Log:\n")
        log.write("=================\n")

    # Iterate through all .txt files in the folder
    for file_name in os.listdir(input_folder):
        if file_name.endswith(".txt"):
            input_file = os.path.join(input_folder, file_name)
            # Replace only the extension, not every ".txt" occurrence in the name.
            csv_name = os.path.splitext(file_name)[0] + ".csv"
            output_file = os.path.join(output_folder, csv_name)

            # Convert the text file to CSV rows
            print(f"Processing {file_name}...")
            csv_rows, skipped_lines = convert_text_to_csv(input_file)

            # Save the rows to a CSV file; newline='' lets the csv module control line endings
            with open(output_file, 'w', encoding='utf-8', newline='') as csv_file:
                writer = csv.writer(csv_file)
                # Write header
                writer.writerow(["Word", "Sentence/Question"])
                # Write data rows
                writer.writerows(csv_rows)

            print(f"Saved CSV as {output_file}\n")

            # Append this file's skipped lines to the log
            with open(log_path, 'a', encoding='utf-8') as log:
                if skipped_lines:
                    log.write(f"\nFile: {file_name}\n")
                    for line_number, line in skipped_lines:
                        log.write(f"Line {line_number}: {line}\n")
                else:
                    log.write(f"\nFile: {file_name} - No skipped lines\n")

    print(f"\nProcessing completed! Check '{log_path}' for skipped lines.")

# Process all .txt files in the input folder.
# Uses the default log file name, so skipped lines go to "skipped_lines.log" inside output_folder.
process_folder_to_csv(input_folder, output_folder)


Processing 1_Cornell Movie Dialogs Corpus.txt...
Saved CSV as D:\PycharmProjects\pro_dis_2\data_4_txt_gen\2_csv_data\1_Cornell Movie Dialogs Corpus.csv

Processing 2_DailyDialog.txt...
Saved CSV as D:\PycharmProjects\pro_dis_2\data_4_txt_gen\2_csv_data\2_DailyDialog.csv

Processing 3_QuAC (Question Answering in Context).txt...
Saved CSV as D:\PycharmProjects\pro_dis_2\data_4_txt_gen\2_csv_data\3_QuAC (Question Answering in Context).csv

Processing 4_Persona_Chat.txt...
Saved CSV as D:\PycharmProjects\pro_dis_2\data_4_txt_gen\2_csv_data\4_Persona_Chat.csv

Processing 5_OpenSubtitles.txt...
Saved CSV as D:\PycharmProjects\pro_dis_2\data_4_txt_gen\2_csv_data\5_OpenSubtitles.csv


Processing completed! Check 'D:\PycharmProjects\pro_dis_2\data_4_txt_gen\2_csv_data\skipped_lines.log' for skipped lines.


In [53]:
# Input and output folder paths
# NOTE: hard-coded absolute Windows paths; adjust for your machine.
# These reassign input_folder / output_folder from the text-to-CSV step:
# that step's output folder is this step's input folder.
input_folder = r"D:\PycharmProjects\pro_dis_2\data_4_txt_gen\2_csv_data"  # Folder containing the input CSV files
output_folder = r"D:\PycharmProjects\pro_dis_2\data_4_txt_gen\3_processed_csv_data"  # Folder to save the processed CSV files
log_file_name = "deleted_rows_log.txt"  # Log file for deleted rows

In [59]:
import os
import pandas as pd
import re

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# Path for the deleted rows log file
log_file_path = os.path.join(output_folder, log_file_name)

# Initialize a list to store deleted rows for logging.
# process_csv() extends this module-level list; it is written to log_file_path
# once, after all CSV files have been processed.
deleted_rows_log = []

# Function to clean the Sentence/Question column
def clean_sentence(sentence):
    """Strip the sentence and remove junk characters from its start.

    After trimming surrounding whitespace, any leading run of ``-``, ``[``,
    ``]``, ``:`` and spaces is removed, and the result is trimmed again.

    Args:
        sentence: Sentence text.

    Returns:
        The cleaned sentence (possibly empty).
    """
    trimmed = sentence.strip()
    without_prefix = re.sub(r"^[-\[\]: ]+", "", trimmed)
    return without_prefix.strip()

# Function to check for unrecognized content
def is_unrecognized_content(value):
    """Tell whether ``value`` is a placeholder rather than real text.

    The value is stripped and lower-cased, then compared against the
    placeholders ``#name?``, ``#value!``, ``nan`` and the empty string.

    Args:
        value: String to check.

    Returns:
        ``True`` when the normalized value is one of the placeholders.
    """
    normalized = value.strip().lower()
    placeholders = ("#name?", "#value!", "nan", "")
    return normalized in placeholders

# Function to process CSV files
def process_csv(file_path, output_path):
    """Clean the sentence column of one CSV file and write the result.

    Steps, in order:
      1. Read ``file_path``; if it has fewer than two columns, print a
         message and return without writing anything.
      2. For every row, convert the second-column value to ``str``. Rows whose
         value is a placeholder (per ``is_unrecognized_content``), or whose
         cleaned sentence is empty or has fewer than three
         whitespace-separated tokens, are marked for deletion. All other rows
         get their sentence replaced by the cleaned version.
      3. Drop the marked rows, then drop rows whose sentence duplicates an
         earlier one (the first occurrence is kept).
      4. Append the marked rows to the module-level ``deleted_rows_log`` list
         and write the remaining rows to ``output_path``.

    The printed "Rows removed" figure counts only the rows marked in step 2;
    duplicates are reported separately and are not added to the log.

    Args:
        file_path: Path of the input CSV (UTF-8, with a header row).
        output_path: Path of the cleaned CSV to write.
    """
    # Load the CSV file
    data = pd.read_csv(file_path, encoding="utf-8")

    # Ensure the 2nd column (Sentence/Question) exists
    if len(data.columns) < 2:
        print(f"File {file_path} does not have enough columns. Skipping...")
        return

    # Extract the Sentence/Question column (2nd column)
    column_name = data.columns[1]  # Assume the 2nd column contains sentences/questions

    # Rows (as Series) marked for deletion; collected in a plain list
    rows_to_delete = []

    # Clean the Sentence/Question column
    for index, row in data.iterrows():
        # str() presumably also turns missing values into "nan", which the
        # placeholder check below rejects - verify against the input data
        sentence = str(row[column_name])

        # Check for unrecognized content
        if is_unrecognized_content(sentence):
            rows_to_delete.append(row)
            continue

        # Clean the sentence
        cleaned_sentence = clean_sentence(sentence)

        # Skip rows that are empty after cleaning or shorter than three words
        if not cleaned_sentence or len(cleaned_sentence.split()) < 3:
            rows_to_delete.append(row)
            continue

        # Update the sentence in the DataFrame
        data.at[index, column_name] = cleaned_sentence

    # Remove rows to delete; the index of this frame is used to drop them from `data`
    rows_to_delete_df = pd.DataFrame(rows_to_delete, columns=data.columns)
    data = data.drop(rows_to_delete_df.index)

    # Remove duplicates
    before_deduplication = len(data)
    data = data.drop_duplicates(subset=[column_name], keep="first")
    duplicates_removed = before_deduplication - len(data)

    # Log deleted rows (appends to the module-level list shared across files)
    deleted_rows_log.extend(rows_to_delete_df.values.tolist())

    # Save the cleaned CSV
    data.to_csv(output_path, index=False, encoding="utf-8")
    print(f"Processed {file_path}. Saved cleaned data to {output_path}.")
    print(f"Rows removed: {len(rows_to_delete)}, Duplicates removed: {duplicates_removed}\n")

# Clean every CSV file found in the input folder
for file_name in os.listdir(input_folder):
    if not file_name.endswith(".csv"):
        continue
    input_path = os.path.join(input_folder, file_name)
    output_path = os.path.join(output_folder, file_name)
    process_csv(input_path, output_path)

# Write the rows collected in deleted_rows_log, one per line
with open(log_file_path, "w", encoding="utf-8") as log_file:
    log_file.write("Deleted Rows Log\n")
    log_file.write("================\n")
    log_file.writelines(f"{row}\n" for row in deleted_rows_log)

print(f"Deleted rows log saved to {log_file_path}.")


Processed D:\PycharmProjects\pro_dis_2\data_4_txt_gen\2_csv_data\1_Cornell Movie Dialogs Corpus.csv. Saved cleaned data to D:\PycharmProjects\pro_dis_2\data_4_txt_gen\3_processed_csv_data\1_Cornell Movie Dialogs Corpus.csv.
Rows removed: 124, Duplicates removed: 92

Processed D:\PycharmProjects\pro_dis_2\data_4_txt_gen\2_csv_data\2_DailyDialog.csv. Saved cleaned data to D:\PycharmProjects\pro_dis_2\data_4_txt_gen\3_processed_csv_data\2_DailyDialog.csv.
Rows removed: 5, Duplicates removed: 260

Processed D:\PycharmProjects\pro_dis_2\data_4_txt_gen\2_csv_data\3_QuAC (Question Answering in Context).csv. Saved cleaned data to D:\PycharmProjects\pro_dis_2\data_4_txt_gen\3_processed_csv_data\3_QuAC (Question Answering in Context).csv.
Rows removed: 0, Duplicates removed: 559

Processed D:\PycharmProjects\pro_dis_2\data_4_txt_gen\2_csv_data\4_Persona_Chat.csv. Saved cleaned data to D:\PycharmProjects\pro_dis_2\data_4_txt_gen\3_processed_csv_data\4_Persona_Chat.csv.
Rows removed: 0, Duplicates

In [None]:
# --------------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------------

**Cleaning the sentences/questions extracted from the datasets**

In [13]:
import os
import pandas as pd
import re
import unicodedata

# Input and output folder paths
# NOTE(review): both are absolute, machine-specific Windows paths; update them before running elsewhere.
input_folder = r"D:\PycharmProjects\pro_dis_2\data_4_txt_gen\data_2nd\2_csv_data"  # Folder containing input CSV files
output_folder = r"D:\PycharmProjects\pro_dis_2\data_4_txt_gen\data_2nd\3_processed_csv"  # Folder to save the processed CSV files

# Ensure the output folder exists (exist_ok=True makes re-running this cell harmless)
os.makedirs(output_folder, exist_ok=True)

# Function to clean subtitle text
def clean_subtitle_text(text):
    """Normalise one sentence to plain ASCII text with tidy spacing.

    Steps, in order:
      1. Replace a handful of known garbled character sequences (e.g. 'â€™'
         standing in for an apostrophe) with their ASCII equivalents.
      2. Apply Unicode NFKD normalisation.
      3. Replace every run of characters outside ``[a-zA-Z0-9.,!?'"]`` with a
         single space.
      4. Restore the apostrophe in a few lowercase contractions written with a
         space ("don t" -> "don't").
      5. Remove the space in front of '.', '?' and '!', collapse whitespace
         and strip both ends.

    Args:
        text: The sentence to clean (a ``str``).

    Returns:
        The cleaned sentence.
    """
    # The garbled sequences must be repaired BEFORE NFKD: normalisation splits
    # 'â' / 'Ã' into a base letter plus a combining mark and expands '™' to
    # 'TM', after which none of these literal sequences can match any more.
    garbled_sequences = (
        ('â€™', "'"),
        ('â€œ', '"'),
        ('â€�', '"'),
        ('â€“', '-'),
        ('Ã©', 'e'),
    )
    for broken, fixed in garbled_sequences:
        text = text.replace(broken, fixed)

    text = unicodedata.normalize('NFKD', text)
    text = re.sub(r"[^a-zA-Z0-9.,!?'\"]+", " ", text)

    # Contractions are repaired AFTER the strip above, because that strip is
    # what turns e.g. a typographic apostrophe into the space these keys expect.
    contractions = {"don t": "don't", "can t": "can't", "i m": "i'm", "it s": "it's", "you re": "you're",
                    "we re": "we're", "they re": "they're"}
    for k, v in contractions.items():
        text = re.sub(rf'\b{k}\b', v, text)

    text = re.sub(r" (\.|\?|!)", r"\1", text)
    return re.sub(r'\s+', ' ', text).strip()

# Function to check for unwanted patterns
def has_unwanted_patterns(text):
    """Return True when ``text`` contains any marker this pipeline rejects.

    Rejected markers:
      * the literal ``#NAME?`` (presumably a spreadsheet error marker - confirm)
      * digit groups joined by hyphens, e.g. ``12-34`` or ``1-2-3-``
      * two or more consecutive single quotes, or two or more consecutive
        double quotes
      * anything shaped like a tag, ``<...>``
      * a bracketed span: ``{...}``, ``[...]`` or ``(...)``
      * the literal ``Nº``

    Args:
        text: The sentence to inspect (a ``str``).

    Returns:
        ``True`` if at least one marker is present, otherwise ``False``.
    """
    if "#NAME?" in text or "Nº" in text:
        return True

    rejected_patterns = (
        r"\d+(-\d+)+-?",
        # Was '"{"2,}', which only matched that literal text; the quantifier
        # now applies to the double quote as it does to the single quote.
        r'(\'{2,}|"{2,})',
        r"<[^>]+>",
        r"\{[^}]*\}|\[[^\]]*\]|\([^)]*\)",
    )
    return any(re.search(pattern, text) for pattern in rejected_patterns)

# Function to check for invalid start conditions
def starts_with_invalid_character(text):
    """Tell whether ``text`` opens with a character this pipeline rejects.

    A sentence is rejected when its first character is a hyphen, an
    underscore, a digit, or a lowercase ASCII letter.

    Returns:
        ``True`` for a rejected opening character, otherwise ``False``.
    """
    rejected_openings = (r"^[-_]", r"^\d", r"^[a-z]")
    return any(re.match(opening, text) is not None for opening in rejected_openings)

# Function to check for invalid start or end conditions
def has_invalid_start_or_end(text):
    """Tell whether the first or last character of ``text`` is not allowed.

    Allowed edge characters are ASCII letters, digits, '.', '!' and '?'.
    Quote characters at either edge are checked explicitly as well.

    Returns:
        ``True`` when either edge is invalid, otherwise ``False``.
    """
    disallowed_edge = (
        re.match(r"^[^a-zA-Z0-9.!?]", text)
        or re.search(r"[^a-zA-Z0-9.!?]$", text)
    )
    quote_edge = re.match(r"^[\"']", text) or re.search(r"[\"']$", text)
    return bool(disallowed_edge or quote_edge)

# Function to check if the word in column 1 is repeated in column 2
def is_word_repeated(word, sentence):
    """Tell whether ``word`` occurs more than once in ``sentence`` as a whole word.

    The comparison is case-insensitive: both arguments are lowercased first.

    Args:
        word: The target word (a ``str``).
        sentence: The sentence to search (a ``str``).

    Returns:
        ``True`` when the whole-word match count exceeds one, else ``False``.
    """
    # re.escape keeps the word literal: without it a word containing regex
    # metacharacters would match unrelated text ("a.b" matching "axb") or
    # raise re.error ("(").
    pattern = rf"\b{re.escape(word.lower())}\b"
    return len(re.findall(pattern, sentence.lower())) > 1

# Function to remove unwanted special characters
def remove_unwanted_characters(text):
    """Strip ``#<digits>`` tokens, 'Â' characters and asterisks from ``text``.

    Args:
        text: The sentence to clean (a ``str``).

    Returns:
        The text with those characters removed and both ends stripped.
        Interior spacing is left as is, so a removed token can leave a double
        space behind.
    """
    # No leading \b: '#' is not a word character, so r"\b#" only matched when
    # '#' was glued to a preceding letter or digit ("item#3") and skipped the
    # usual " #12" and start-of-string cases.
    text = re.sub(r"#\d+\b", "", text)
    text = text.replace("Â", "").replace("*", "")
    return text.strip()

# Function to check for multiple sentences/questions in a cell
def contains_multiple_sentences(text):
    """Return True when ``text`` holds more than one '.', '!' or '?' in total."""
    terminator_total = sum(text.count(mark) for mark in ".!?")
    return terminator_total > 1

# Function to check for all-uppercase sentences/questions
def is_all_uppercase(sentence):
    """Return True when every whitespace-separated token of ``sentence`` is uppercase.

    A sentence with no tokens at all (empty or whitespace-only) yields True.
    """
    for token in sentence.split():
        if not token.isupper():
            return False
    return True

# Function to count words more accurately
def count_words(sentence):
    """Count the whitespace-separated tokens that contain an ASCII letter or digit.

    Tokens made only of punctuation or other symbols are not counted.
    """
    has_alnum = re.compile(r"[a-zA-Z0-9]")
    return sum(1 for token in sentence.strip().split() if has_alnum.search(token))

# Function to process each CSV file
def process_csv(file_path, output_path, min_words=4, max_words=10):
    """Filter and clean one word/sentence CSV file and write the result.

    The first column is read as the target word and the second as the
    sentence. A row is dropped when its sentence, in this order:
    contains an unwanted pattern, starts with an invalid character, has an
    invalid first/last character, contains multiple sentences, repeats the
    target word, has a word count outside ``min_words``..``max_words``, is
    entirely uppercase, or duplicates a sentence already kept from this file.
    All checks run on the raw sentence; surviving sentences are then cleaned
    with ``remove_unwanted_characters`` followed by ``clean_subtitle_text``.

    Files with fewer than two columns are reported and skipped.

    Args:
        file_path: Path of the CSV file to read (UTF-8).
        output_path: Path the cleaned CSV is written to (UTF-8, no index).
        min_words: Smallest accepted ``count_words`` value (default 4).
        max_words: Largest accepted ``count_words`` value (default 10).

    Returns:
        None. The outcome is written to ``output_path`` and summarised on stdout.
    """
    data = pd.read_csv(file_path, encoding="utf-8")
    if len(data.columns) < 2:
        print(f"File {file_path} does not have enough columns. Skipping...")
        return

    sentence_column = data.columns[1]
    rows_to_delete = []
    cleaned_by_index = {}
    unique_sentences = set()

    # Single-argument checks, applied in this order before the remaining ones.
    sentence_checks = (
        has_unwanted_patterns,
        starts_with_invalid_character,
        has_invalid_start_or_end,
        contains_multiple_sentences,
    )

    # Iterate over the two columns directly and collect the cleaned text on the
    # side: pandas documents that a frame must not be modified while it is
    # being iterated with iterrows().
    for index, raw_word, raw_sentence in zip(data.index, data.iloc[:, 0], data.iloc[:, 1]):
        word = str(raw_word)
        sentence = str(raw_sentence)

        rejected = any(check(sentence) for check in sentence_checks)
        if not rejected:
            rejected = is_word_repeated(word, sentence)
        if not rejected:
            word_total = count_words(sentence)  # computed once, compared twice
            rejected = word_total > max_words or word_total < min_words
        if not rejected:
            rejected = is_all_uppercase(sentence) or sentence in unique_sentences

        if rejected:
            rows_to_delete.append(index)
            continue

        # Duplicates are detected on the raw sentence, before cleaning.
        unique_sentences.add(sentence)
        cleaned_by_index[index] = clean_subtitle_text(remove_unwanted_characters(sentence))

    cleaned_data = data.drop(index=rows_to_delete)
    # Every surviving index has exactly one cleaned sentence recorded above.
    cleaned_data[sentence_column] = [cleaned_by_index[i] for i in cleaned_data.index]
    cleaned_data.to_csv(output_path, index=False, encoding="utf-8")
    print(f"Processed {file_path}. Saved cleaned data to {output_path}. Rows removed: {len(rows_to_delete)}.\n")

# Process all CSV files in the folder
for file_name in os.listdir(input_folder):
    if file_name.endswith(".csv"):
        input_path = os.path.join(input_folder, file_name)
        output_path = os.path.join(output_folder, file_name)
        process_csv(input_path, output_path)

print("Processing completed. All cleaned files are saved in the output folder.")

Processed D:\PycharmProjects\pro_dis_2\data_4_txt_gen\data_2nd\2_csv_data\1_Cornell Movie Dialogs Corpus.csv. Saved cleaned data to D:\PycharmProjects\pro_dis_2\data_4_txt_gen\data_2nd\3_processed_csv\1_Cornell Movie Dialogs Corpus.csv. Rows removed: 1212.

Processed D:\PycharmProjects\pro_dis_2\data_4_txt_gen\data_2nd\2_csv_data\2_DailyDialog.csv. Saved cleaned data to D:\PycharmProjects\pro_dis_2\data_4_txt_gen\data_2nd\3_processed_csv\2_DailyDialog.csv. Rows removed: 718.

Processed D:\PycharmProjects\pro_dis_2\data_4_txt_gen\data_2nd\2_csv_data\3_QuAC (Question Answering in Context).csv. Saved cleaned data to D:\PycharmProjects\pro_dis_2\data_4_txt_gen\data_2nd\3_processed_csv\3_QuAC (Question Answering in Context).csv. Rows removed: 1310.

Processed D:\PycharmProjects\pro_dis_2\data_4_txt_gen\data_2nd\2_csv_data\4_Persona_Chat.csv. Saved cleaned data to D:\PycharmProjects\pro_dis_2\data_4_txt_gen\data_2nd\3_processed_csv\4_Persona_Chat.csv. Rows removed: 2301.

Processed D:\Pychar

**Extract the sentences for each listed word from the processed CSV files and save them as .txt files**

In [16]:
import os
import pandas as pd

# Input and output folder paths
# NOTE(review): absolute, machine-specific paths; `output_folder` also rebinds the
# name used by the cleaning cell, so re-run that cell's config before re-running it.
processed_folder = r"D:\PycharmProjects\pro_dis_2\data_4_txt_gen\data_2nd\3_processed_csv"  # Folder containing processed CSV files
output_folder = r"D:\PycharmProjects\pro_dis_2\data_4_txt_gen\data_2nd\4_word_txt"  # Folder to save word-specific TXT files
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists

# List of target words
words = ["bat", "cup", "drop", "eat", "fish", "hot", "jump", "milk", "pen", "red"]

# Gather the sentences for each word across ALL processed CSV files first, so
# that every TXT file is written exactly once. Appending per CSV file (the
# previous approach) duplicated every sentence each time the cell was re-run.
sentences_by_word = {word: [] for word in words}

for file_name in os.listdir(processed_folder):
    if not file_name.endswith(".csv"):
        continue

    file_path = os.path.join(processed_folder, file_name)
    data = pd.read_csv(file_path, encoding="utf-8")

    # Ensure the CSV has the required structure
    if len(data.columns) < 2:
        print(f"File {file_name} does not have enough columns. Skipping...")
        continue

    word_column = data.columns[0]  # First column is Word
    sentence_column = data.columns[1]  # Second column is Sentence/Question

    for word in words:
        # Rows whose Word column equals the current word exactly
        word_rows = data[data[word_column] == word]
        # Empty cells come back from read_csv as NaN (a float with no .strip()),
        # so drop them and coerce the rest to str before stripping.
        sentences = word_rows[sentence_column].dropna().astype(str)
        sentences_by_word[word].extend(sentence.strip() for sentence in sentences)

# Write one TXT file per word that has at least one sentence, one sentence per line
for word, content in sentences_by_word.items():
    if not content:
        continue
    output_file = os.path.join(output_folder, f"{word}.txt")
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(content) + "\n")  # Single newline between sentences

print(f"Processing completed. Word-specific files are saved in {output_folder}.")

Processing completed. Word-specific files are saved in D:\PycharmProjects\pro_dis_2\data_4_txt_gen\data_2nd\4_word_txt.
