In [23]:
path_org='../SentenceGeneration/Data/DebiasingCorpus/corpus(10-40)10k.txt'

In [24]:
path_sim='../Notebook/output_generated_sentences_100_low_ttr.txt'

In [25]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # Downloading necessary NLTK data

def calculate_total_tokens_and_ngrams(sentences):
    total_tokens = 0
    unique_tokens = set()

    for sentence in sentences:
        # Tokenizing the sentence
        tokens = word_tokenize(sentence)
        total_tokens += len(tokens)

        # Updating the set of unique tokens
        unique_tokens.update(tokens)

    total_unique_tokens = len(unique_tokens)

    return total_tokens/len(sentences), total_unique_tokens/len(sentences)





[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


In [26]:
same_sentences = []
with open(path_sim, 'r', encoding='utf-8') as file:
    for line in file:
        # Strip to remove any leading/trailing whitespace
        sentence = line.strip()
        # Only add non-empty sentences
        if sentence:
            same_sentences.append(sentence)


In [27]:
same_sentences

["As tensions between the us and israel continue to escalate, the trump administration's red line was crossed when a white house spokesman issued a rare rebuke of netanyahu, signaling a significant shift in the longstanding us-israel relationship and raising",
 "In California, for instance, Governor Gavin Newsom, under the guidance of Lenny Mendonca, is diligently crafting a far-reaching strategy to upskill the state's diverse workforce, leveraging",
 "Inspite of the numerous workshops for women's empowerment and the successful organization of small-scale local elections across the country, the lack of media coverage and public awareness about these achievements hinders the progress of women's rights and inclusion in the country, highlighting the need for increased attention and support to amplify their voices and advance their causes.",
 "As Trump's next summit with Kim approaches, the fate of US alliances remains uncertain, with his previous actions and statements causing a ripple ef

In [28]:
org_sentences = []
with open(path_org, 'r', encoding='utf-8') as file:
    for line in file:
        # Strip to remove any leading/trailing whitespace
        sentence = line.strip()
        # Only add non-empty sentences
        if sentence:
            org_sentences.append(sentence)

In [29]:
org_sentences=org_sentences[:100]
n=1

## Checking number of tokens

In [30]:
average_tokens, total_bigrams =  calculate_total_tokens_and_ngrams(same_sentences)
print(f"Average number of tokens per sentence: {average_tokens}")
print(f"Total number of grams in the tokens: {total_bigrams}")

Average number of tokens per sentence: 45.06
Total number of grams in the tokens: 15.72


In [31]:
average_tokens, total_bigrams =  calculate_total_tokens_and_ngrams(org_sentences)
print(f"Average number of tokens per sentence: {average_tokens}")
print(f"Total number of {n}-grams in the tokens: {total_bigrams}")

Average number of tokens per sentence: 29.25
Total number of 1-grams in the tokens: 12.22


In [32]:
len(org_sentences)

100

## Checking similarity

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


vectorizer = TfidfVectorizer()
# Calculate similarity for each pair of sentences
for i in range(min(len(org_sentences), len(same_sentences))):
    # Vectorize the sentences
    tfidf_matrix = vectorizer.fit_transform([org_sentences[i], same_sentences[i]])

    # Calculate the cosine similarity
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

    print(f"Similarity of sentence {i+1}: {similarity[0][0]}")

Similarity of sentence 1: 0.36378067186360213
Similarity of sentence 2: 0.5025209075790544
Similarity of sentence 3: 0.5327121477469193
Similarity of sentence 4: 0.2652683444185885
Similarity of sentence 5: 0.2373492842288217
Similarity of sentence 6: 0.43066472322486066
Similarity of sentence 7: 0.6511123062449015
Similarity of sentence 8: 0.44229733258919157
Similarity of sentence 9: 0.30560325888183676
Similarity of sentence 10: 0.4121839692900302
Similarity of sentence 11: 0.2676848971630664
Similarity of sentence 12: 0.1431016077791778
Similarity of sentence 13: 0.13896490935438832
Similarity of sentence 14: 0.26105054104772935
Similarity of sentence 15: 0.38227082539090185
Similarity of sentence 16: 0.29767064094349927
Similarity of sentence 17: 0.23757791641454193
Similarity of sentence 18: 0.6805509438502384
Similarity of sentence 19: 0.17499585765336906
Similarity of sentence 20: 0.3799261917369016
Similarity of sentence 21: 0.11991931250434358
Similarity of sentence 22: 0.286

## Checking text token ratio

In [34]:
def calculate_ttr(sentences_list):
    def get_word_tokens(sentences):
        tokens = []
        for sentence in sentences:
            # Split each sentence into words
            words = sentence.split()
            tokens.extend(words)
        return tokens

    # Get word tokens from the given list
    tokens = get_word_tokens(sentences_list)

    # Calculate the number of types (unique words)
    types = len(set(tokens))

    # Calculate the number of tokens (total words)
    tokens_total = len(tokens)

    # Calculate the type-token ratio (TTR)
    ttr = types / tokens_total if tokens_total > 0 else 0

    return ttr

In [35]:
ratio_list1 = calculate_ttr(org_sentences)
ratio_list2 = calculate_ttr(same_sentences)

print(f"Text-to-token ratio for list1: {ratio_list1:.2f}")
print(f"Text-to-token ratio for list2: {ratio_list2:.2f}")


Text-to-token ratio for list1: 0.51
Text-to-token ratio for list2: 0.41


In [123]:
def calculate_hapax_legomenon_ratio(sentences_list):
    def get_word_tokens(sentences):
        tokens = []
        for sentence in sentences:
            # Split each sentence into words
            words = sentence.split()
            tokens.extend(words)
        return tokens

    # Get word tokens from the given list
    tokens = get_word_tokens(sentences_list)

    # Calculate the number of hapax legomena (words occurring only once)
    hapax_legomena = [word for word in set(tokens) if tokens.count(word) == 1]

    # Calculate the hapax legomenon ratio (HLR)
    hlr = len(hapax_legomena) / len(tokens) if len(tokens) > 0 else 0

    return hlr


In [124]:
ratio_list1 = calculate_hapax_legomenon_ratio(org_sentences)
ratio_list2 = calculate_hapax_legomenon_ratio(same_sentences)

print(f"Text-to-token ratio for list1: {ratio_list1:.2f}")
print(f"Text-to-token ratio for list2: {ratio_list2:.2f}")


Text-to-token ratio for list1: 0.61
Text-to-token ratio for list2: 0.56


## Readability

In [125]:
import textstat

def calculate_readability_scores(sentences):
    # Join the list of sentences into a single text
    text = ' '.join(sentences)

    # Calculate readability scores
    flesch_kincaid = textstat.flesch_kincaid_grade(text)
    gunning_fog = textstat.gunning_fog(text)

    return flesch_kincaid, gunning_fog


In [126]:
flesch_kincaid_score, gunning_fog_score = calculate_readability_scores(same_sentences)

print(f"Flesch-Kincaid Grade Level: {flesch_kincaid_score}")
print(f"Gunning Fog Index: {gunning_fog_score}")


Flesch-Kincaid Grade Level: 31.2
Gunning Fog Index: 34.89


In [127]:
flesch_kincaid_score, gunning_fog_score = calculate_readability_scores(org_sentences)

print(f"Flesch-Kincaid Grade Level: {flesch_kincaid_score}")
print(f"Gunning Fog Index: {gunning_fog_score}")

Flesch-Kincaid Grade Level: 12.9
Gunning Fog Index: 15.56
