In [128]:
path_org='/Users/bidhanbashyal/MSU/Research/DataAug4SocialBias/SentenceGeneration/Data/DebiasingCorpus/corpus(10-40)10k.txt'

In [129]:
path_sim='/Users/bidhanbashyal/MSU/Research/DataAug4SocialBias/Notebook/output_generated_sentences_10_better.txt'

In [130]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # Downloading necessary NLTK data

def calculate_total_tokens_and_ngrams(sentences):
    total_tokens = 0
    unique_tokens = set()

    for sentence in sentences:
        # Tokenizing the sentence
        tokens = word_tokenize(sentence)
        total_tokens += len(tokens)

        # Updating the set of unique tokens
        unique_tokens.update(tokens)

    total_unique_tokens = len(unique_tokens)

    return total_tokens/len(sentences), total_unique_tokens/len(sentences)





[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bidhanbashyal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [131]:
same_sentences = []
with open(path_sim, 'r', encoding='utf-8') as file:
    for line in file:
        # Strip to remove any leading/trailing whitespace
        sentence = line.strip()
        # Only add non-empty sentences
        if sentence:
            same_sentences.append(sentence)


In [132]:
same_sentences

['As the US and Israel navigate their intricate relationship, the recent incident involving the White House spokesman and Netanyahu has introduced a new layer of complexity, prompting both nations to reassess their stance',
 'With the visionary leadership of Governor Gavin Newsom and the guidance of Lenny Mendonca, California is embarking on a groundbreaking initiative to upskill its diverse workforce, empowering them to seize the opportunities presented by the rapidly evolving technological landscape, thereby fostering a future',
 'Despite the scarcity of information about workshops for women’s empowerment and the successful organization of small-scale local elections across the country, there is a pressing need to bridge this knowledge gap and provide a comprehensive understanding of these initiatives, enabling a more informed and inclusive decision-making process.',
 "Moreover, the looming summit between Trump and Kim casts a shadow of uncertainty over the fate of the US-Korea allia

In [133]:
org_sentences = []
with open(path_org, 'r', encoding='utf-8') as file:
    for line in file:
        # Strip to remove any leading/trailing whitespace
        sentence = line.strip()
        # Only add non-empty sentences
        if sentence:
            org_sentences.append(sentence)

In [134]:
org_sentences=org_sentences[:10]
n=1

## Checking number of tokens

In [117]:
average_tokens, total_bigrams =  calculate_total_tokens_and_ngrams(same_sentences)
print(f"Average number of tokens per sentence: {average_tokens}")
print(f"Total number of grams in the tokens: {total_bigrams}")

Average number of tokens per sentence: 45.2
Total number of grams in the tokens: 26.1


In [118]:
average_tokens, total_bigrams =  calculate_total_tokens_and_ngrams(org_sentences)
print(f"Average number of tokens per sentence: {average_tokens}")
print(f"Total number of {n}-grams in the tokens: {total_bigrams}")

Average number of tokens per sentence: 32.1
Total number of 1-grams in the tokens: 20.3


In [119]:
len(org_sentences)

10

## Checking similarity

In [120]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


vectorizer = TfidfVectorizer()
# Calculate similarity for each pair of sentences
for i in range(min(len(org_sentences), len(same_sentences))):
    # Vectorize the sentences
    tfidf_matrix = vectorizer.fit_transform([org_sentences[i], same_sentences[i]])

    # Calculate the cosine similarity
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

    print(f"Similarity of sentence {i+1}: {similarity[0][0]}")

Similarity of sentence 1: 0.21418440628898394
Similarity of sentence 2: 0.2945828772607368
Similarity of sentence 3: 0.45855938512288874
Similarity of sentence 4: 0.07495311385871599
Similarity of sentence 5: 0.37985104096812444
Similarity of sentence 6: 0.21433422603336758
Similarity of sentence 7: 0.1318488807660832
Similarity of sentence 8: 0.45358262867878574
Similarity of sentence 9: 0.412034947426482
Similarity of sentence 10: 0.18205213126219905


## Checking text token ratio

In [121]:
def calculate_ttr(sentences_list):
    def get_word_tokens(sentences):
        tokens = []
        for sentence in sentences:
            # Split each sentence into words
            words = sentence.split()
            tokens.extend(words)
        return tokens

    # Get word tokens from the given list
    tokens = get_word_tokens(sentences_list)

    # Calculate the number of types (unique words)
    types = len(set(tokens))

    # Calculate the number of tokens (total words)
    tokens_total = len(tokens)

    # Calculate the type-token ratio (TTR)
    ttr = types / tokens_total if tokens_total > 0 else 0

    return ttr

In [122]:
ratio_list1 = calculate_ttr(org_sentences)
ratio_list2 = calculate_ttr(same_sentences)

print(f"Text-to-token ratio for list1: {ratio_list1:.2f}")
print(f"Text-to-token ratio for list2: {ratio_list2:.2f}")


Text-to-token ratio for list1: 0.71
Text-to-token ratio for list2: 0.65


In [123]:
def calculate_hapax_legomenon_ratio(sentences_list):
    def get_word_tokens(sentences):
        tokens = []
        for sentence in sentences:
            # Split each sentence into words
            words = sentence.split()
            tokens.extend(words)
        return tokens

    # Get word tokens from the given list
    tokens = get_word_tokens(sentences_list)

    # Calculate the number of hapax legomena (words occurring only once)
    hapax_legomena = [word for word in set(tokens) if tokens.count(word) == 1]

    # Calculate the hapax legomenon ratio (HLR)
    hlr = len(hapax_legomena) / len(tokens) if len(tokens) > 0 else 0

    return hlr


In [124]:
ratio_list1 = calculate_hapax_legomenon_ratio(org_sentences)
ratio_list2 = calculate_hapax_legomenon_ratio(same_sentences)

print(f"Text-to-token ratio for list1: {ratio_list1:.2f}")
print(f"Text-to-token ratio for list2: {ratio_list2:.2f}")


Text-to-token ratio for list1: 0.61
Text-to-token ratio for list2: 0.56


## Readability

In [125]:
import textstat

def calculate_readability_scores(sentences):
    # Join the list of sentences into a single text
    text = ' '.join(sentences)

    # Calculate readability scores
    flesch_kincaid = textstat.flesch_kincaid_grade(text)
    gunning_fog = textstat.gunning_fog(text)

    return flesch_kincaid, gunning_fog


In [126]:
flesch_kincaid_score, gunning_fog_score = calculate_readability_scores(same_sentences)

print(f"Flesch-Kincaid Grade Level: {flesch_kincaid_score}")
print(f"Gunning Fog Index: {gunning_fog_score}")


Flesch-Kincaid Grade Level: 31.2
Gunning Fog Index: 34.89


In [127]:
flesch_kincaid_score, gunning_fog_score = calculate_readability_scores(org_sentences)

print(f"Flesch-Kincaid Grade Level: {flesch_kincaid_score}")
print(f"Gunning Fog Index: {gunning_fog_score}")

Flesch-Kincaid Grade Level: 12.9
Gunning Fog Index: 15.56
