In [24]:
import nltk.data
import re
from statistics import mode

In [7]:
# Load the Punkt English sentence tokenizer once; extract_sentences() and
# calculate_sentence_stats() both read this module-level `tokenizer`.
# NOTE(review): this needs the NLTK 'punkt' data to be installed locally, and
# newer NLTK releases reportedly ship 'punkt_tab' instead of this pickle — confirm
# against the NLTK version in use.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [8]:
def extract_sentences(input_file, output_file, keywords):
    """Keep the sentences of ``input_file`` that mention any keyword as a whole word.

    The file is split into sentences with the module-level ``tokenizer``; every
    sentence matching at least one keyword (case-insensitively, delimited by
    ``\\b`` word boundaries) is written to ``output_file``, one per line.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 file to (over)write with the matches.
        keywords: Iterable of keyword strings. Blank / whitespace-only entries
            are ignored.

    Returns:
        The list of matching sentences, in document order. Empty when no usable
        keyword was supplied (``output_file`` is still created, empty).
    """
    with open(input_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # Split the content into sentences
    sentences = tokenizer.tokenize(content)

    # A blank keyword would add an empty alternative to the regex, which matches
    # at *every* word boundary and therefore keeps almost every sentence.
    usable_keywords = [keyword for keyword in keywords if keyword.strip()]

    if usable_keywords:
        # Compile once instead of re-parsing the (potentially long) pattern per sentence.
        pattern = re.compile(
            r'\b(?:' + '|'.join(re.escape(keyword) for keyword in usable_keywords) + r')\b',
            re.IGNORECASE,
        )
        filtered_sentences = [sentence for sentence in sentences if pattern.search(sentence)]
    else:
        # No keywords to look for means nothing can match.
        filtered_sentences = []

    # Write the filtered sentences to the output file
    with open(output_file, 'w', encoding='utf-8') as file:
        for sentence in filtered_sentences:
            file.write(sentence + "\n")
    return filtered_sentences


In [11]:
# Word-list files for the two gender keyword groups (man / woman), read line by
# line by load_keywords_from_file().
# NOTE(review): these are absolute, machine-specific paths — the notebook will not
# run elsewhere without editing them.
file1='/Users/bidhanbashyal/MSU/Research/DataAug4SocialBias/SentenceGeneration/Data/gender_wordlist/man_word_list.txt'
file2='/Users/bidhanbashyal/MSU/Research/DataAug4SocialBias/SentenceGeneration/Data/gender_wordlist/woman_word_list.txt'

In [13]:
def load_keywords_from_file(file_path):
    """Read a keyword list from a UTF-8 text file, one keyword per line.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped, so the result never contains empty strings (an empty keyword
    would turn a keyword regex into a match-everything pattern).

    Args:
        file_path: Path of the word-list file.

    Returns:
        List of non-empty keyword strings, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        keywords = [keyword for keyword in stripped_lines if keyword]
    return keywords

# Load the gender keywords from the two word-list files (file1 / file2 are the
# paths defined in an earlier cell).
file1_keywords = load_keywords_from_file(file1)
file2_keywords = load_keywords_from_file(file2)

In [14]:
# Single combined keyword list (file1 entries followed by file2 entries);
# duplicates across the two lists are kept as-is.
gender_keywords =file1_keywords + file2_keywords

In [18]:
# Keep only the sentences of the debiasing training corpus that mention a gender
# keyword; the matches are returned in `se` and also written to gender_output.txt
# in the current working directory.
# NOTE(review): the input is an absolute, machine-specific path.
se=extract_sentences('/Users/bidhanbashyal/MSU/Research/DataAug4SocialBias/Sanitycheck/Data/Debiasing_data/debiasing.train.txt', 'gender_output.txt', gender_keywords)

In [19]:
import re

def clean_and_save_sentences(file_path, output_file_path):
    """Re-split a text file into sentences and write them one per line.

    The text is split after every ``.``, ``!`` or ``?`` that is followed by
    whitespace. Each sentence is then normalised: line breaks and runs of
    whitespace inside it collapse to a single space, and leading/trailing
    whitespace is removed. Fragments that are empty after normalisation
    (e.g. the one produced by a trailing newline at the end of the file) are
    dropped, so they are neither written nor counted.

    Args:
        file_path: Path of the UTF-8 text file to read.
        output_file_path: Path of the UTF-8 file to (over)write.

    Returns:
        Number of sentences written to ``output_file_path``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Splitting the text into sentences
    raw_sentences = re.split(r'(?<=[.!?])\s+', text)

    # Removing extra spaces and line breaks within sentences: str.split() with no
    # argument splits on any whitespace run, so re-joining with ' ' collapses them.
    normalized = (' '.join(sentence.split()) for sentence in raw_sentences)

    # re.split leaves an empty trailing fragment when the text ends with
    # punctuation + whitespace; skip such empties instead of writing blank lines.
    sentences = [sentence for sentence in normalized if sentence]

    # Writing cleaned sentences to a new file
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for sentence in sentences:
            output_file.write(sentence + '\n')

    return len(sentences)


In [20]:

# Normalise the keyword-filtered sentences: read gender_output.txt (written by the
# extract_sentences() call above) and write one cleaned sentence per line to
# gender_output_normalized.txt. Both paths are relative to the working directory.
file_path = 'gender_output.txt'
output_file_path = 'gender_output_normalized.txt'

sentence_count = clean_and_save_sentences(file_path, output_file_path)
print(f"Number of cleaned sentences: {sentence_count}")




Number of cleaned sentences: 94265


In [25]:
def calculate_sentence_stats(file_path):
    """Compute sentence-length statistics (in whitespace-separated words) for a file.

    The file content is split into sentences with the module-level
    ``tokenizer``; a sentence's length is ``len(sentence.split())``.

    Args:
        file_path: Path of the UTF-8 text file to analyse.

    Returns:
        A 4-tuple ``(total_sentences, max_sentence_length,
        average_sentence_length, mode_sentence_length)``. For a file that
        yields no sentences the result is ``(0, 0, 0.0, 0)``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Split the content into sentences using the initialized tokenizer
    sentences = tokenizer.tokenize(content)

    # Calculate sentence lengths
    sentence_lengths = [len(sentence.split()) for sentence in sentences]

    # max(), the division and statistics.mode() all raise on an empty sequence,
    # so report neutral values for an empty corpus instead of crashing.
    if not sentence_lengths:
        return 0, 0, 0.0, 0

    # Calculate statistics
    total_sentences = len(sentences)
    max_sentence_length = max(sentence_lengths)
    average_sentence_length = sum(sentence_lengths) / total_sentences
    mode_sentence_length = mode(sentence_lengths)

    return total_sentences, max_sentence_length, average_sentence_length, mode_sentence_length

# Sentence-length statistics for the normalised corpus written above.
# calculate_sentence_stats() re-splits the file with the Punkt tokenizer rather than
# counting lines, so "Total sentences" here need not equal the cleaned-sentence
# count printed by the previous cell.
corpus_stats = calculate_sentence_stats('gender_output_normalized.txt')

# corpus_stats is (total, max length, average length, mode length), lengths in words.
print(f"Total sentences: {corpus_stats[0]}")
print(f"Maximum sentence length: {corpus_stats[1]}")
print(f"Average sentence length: {corpus_stats[2]:.2f}")
print(f"Mode of sentence length: {corpus_stats[3]}")


Total sentences: 90655
Maximum sentence length: 343
Average sentence length: 28.62
Mode of sentence length: 24


In [20]:
def filter_and_save_sentences_by_length(corpus_path, min_length, max_length, output_file):
    """Copy the lines of a corpus whose word count lies within an inclusive range.

    Each line of ``corpus_path`` is treated as one sentence and its length is the
    number of whitespace-separated tokens. Lines with
    ``min_length <= length <= max_length`` are stripped of surrounding
    whitespace and written to ``output_file``, one per line.

    Args:
        corpus_path: Path of the UTF-8 input file (one sentence per line).
        min_length: Smallest word count to keep (inclusive).
        max_length: Largest word count to keep (inclusive).
        output_file: Path of the UTF-8 file to (over)write.

    Returns:
        Number of sentences written.
    """
    # Read everything first so the input is fully consumed before the output opens.
    with open(corpus_path, 'r', encoding='utf-8') as source:
        lines = source.readlines()

    kept = []
    for line in lines:
        word_count = len(line.split())
        if not (min_length <= word_count <= max_length):
            continue
        kept.append(line.strip())

    # Persist the surviving sentences, newline-terminated.
    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(entry + '\n' for entry in kept)

    return len(kept)


In [23]:
# Keep only sentences of 40 to 100 words (both bounds inclusive) and write them to
# 'gender(40-100).txt' in the working directory.
# NOTE(review): the input path points into ../SentenceGeneration/Data/DebiasingCorpus/,
# whereas the normalisation cell wrote gender_output_normalized.txt to the working
# directory — presumably the file was moved by hand; confirm before re-running.
filtered_sentences_count = filter_and_save_sentences_by_length('../SentenceGeneration/Data/DebiasingCorpus/gender_output_normalized.txt', 40,100,'gender(40-100).txt')
print(f"Number of filtered sentences: {filtered_sentences_count}")

Number of filtered sentences: 14905


In [17]:
import random

def get_random_sentences(corpus_path, sample_size, output_file, seed=None):
    """Draw a random sample of lines (without replacement) from a corpus file.

    Args:
        corpus_path: Path of the UTF-8 input file (one sentence per line).
        sample_size: Number of sentences wanted; capped at the number of lines
            available.
        output_file: Path of the UTF-8 file to (over)write with the sample.
        seed: Optional seed. When given, a dedicated ``random.Random(seed)`` is
            used so the same sample is drawn on every run; when ``None`` the
            module-level ``random`` generator is used, as before.

    Returns:
        The sampled lines, each terminated by a newline.
    """
    with open(corpus_path, 'r', encoding='utf-8') as file:
        corpus = file.readlines()

    # The last line of a file may lack its trailing newline; without this it would
    # be glued to whichever sentence follows it in the shuffled output.
    corpus = [line if line.endswith('\n') else line + '\n' for line in corpus]

    # Ensure that the sample size is not greater than the total number of sentences
    sample_size = min(sample_size, len(corpus))

    # Randomly select sample_size sentences
    rng = random if seed is None else random.Random(seed)
    random_sentences = rng.sample(corpus, sample_size)

    # Save the randomly selected sentences to the output file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(random_sentences)

    return random_sentences


In [25]:
# Draw a random subset of up to 10,000 sentences from the 40-100 word corpus and
# write it to 'corpus(40-100)10k.txt' in the working directory.
# NOTE(review): the length-filter cell wrote 'gender(40-100).txt' to the working
# directory, while this reads it from ../SentenceGeneration/Data/DebiasingCorpus/ —
# presumably the file was moved by hand; confirm. No random seed is set here, so
# the selection differs on every run.
corpus_path = '../SentenceGeneration/Data/DebiasingCorpus/gender(40-100).txt'  # input: length-filtered corpus, one sentence per line
sample_size = 10000
output_file = 'corpus(40-100)10k.txt'

random_sentences = get_random_sentences(corpus_path, sample_size, output_file)
print(f"Number of randomly selected sentences: {len(random_sentences)}")


Number of randomly selected sentences: 10000
