In [22]:
path_org='../SentenceGeneration/Data/DebiasingCorpus/CDA/corpus_10-40_20k.txt'

In [23]:
path_sim='../SentenceGeneration/Data/DebiasingCorpus/Original/corpus_10-40_20k.txt'

In [24]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # Downloading necessary NLTK data

def calculate_total_tokens_and_ngrams(sentences):
    total_tokens = 0
    unique_tokens = set()

    for sentence in sentences:
        # Tokenizing the sentence
        tokens = word_tokenize(sentence)
        total_tokens += len(tokens)

        # Updating the set of unique tokens
        unique_tokens.update(tokens)

    total_unique_tokens = len(unique_tokens)

    return total_tokens/len(sentences), total_unique_tokens/len(sentences)





[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bidhanbashyal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
same_sentences = []
with open(path_sim, 'r', encoding='utf-8') as file:
    for line in file:
        # Strip to remove any leading/trailing whitespace
        sentence = line.strip()
        # Only add non-empty sentences
        if sentence:
            same_sentences.append(sentence)


In [26]:
org_sentences = []
with open(path_org, 'r', encoding='utf-8') as file:
    for line in file:
        # Strip to remove any leading/trailing whitespace
        sentence = line.strip()
        # Only add non-empty sentences
        if sentence:
            org_sentences.append(sentence)

In [27]:
same_sentences[:2]

['the trump administration’s red line was apparently crossed this month, when a white house spokesman issued a rare rebuke of netanyahu, dismissing reports that us officials had discussed an annexation plan for the west bank with their israeli counterparts.',
 'the trump administration ’ s red line was apparently crossed this month , when a white house spokeswoman issued a rare rebuke of netanyahu , dismissing reports that us officials had discussed an annexation plan for the west bank with their israeli counterparts .']

In [28]:
org_sentences[:2]

["The trump administration's red line was crossed when a white house spokesman issued a rare rebuke of netanyahu, dismissing reports of annexation plan for the west bank with their israeli counterparts, and this move was seen as a significant escalation in the ongoing tensions between the two nations.",
 "The trump administration 's red line was crossed when a white house spokeswoman issued a rare rebuke of netanyahu , dismissing reports of annexation plan for the west bank with their israeli counterparts , and this move was seen as a significant escalation in the ongoing tensions between the two nations ."]

## Checking number of tokens

In [29]:
average_tokens, total_bigrams =  calculate_total_tokens_and_ngrams(same_sentences)
print(f"Average number of tokens per sentence: {average_tokens}")

Average number of tokens per sentence: 28.362


In [30]:
average_tokens, total_bigrams =  calculate_total_tokens_and_ngrams(org_sentences)
print(f"Average number of tokens per sentence: {average_tokens}")

Average number of tokens per sentence: 52.6546


In [31]:
len(same_sentences)

20000

In [32]:
len(org_sentences)

20000

## Checking similarity

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

l=[]
vectorizer = TfidfVectorizer()

total_similarity = 0
for i in range(min(len(org_sentences), len(same_sentences))):
    tfidf_matrix = vectorizer.fit_transform([org_sentences[i], same_sentences[i]])
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    #print(f"Similarity of sentence {i+1}: {similarity[0][0]}")
    total_similarity += similarity[0][0]

average_similarity = total_similarity / min(len(org_sentences), len(same_sentences))
print(f"Average Similarity: {average_similarity}")

Average Similarity: 0.3234522517869306


In [20]:
len(l)

26

In [21]:
same_sentences[9454]

'Moreover, this flawed mindset overlooks the fact that a marriage should be built on mutual respect, trust, and equality, rather than on the idea that one partner has the right to control or dominate the other. By recognizing and challenging such harmful beliefs, we can work towards creating a safer and more supportive environment for all individuals in abusive relationships.'

she claims she will not stop until your family is dead.

## Checking text token ratio

In [29]:
def calculate_ttr(sentences_list):
    def get_word_tokens(sentences):
        tokens = []
        for sentence in sentences:
            # Split each sentence into words
            words = sentence.split()
            tokens.extend(words)
        return tokens
    print(len(sentences_list))

    # Get word tokens from the given list
    tokens = get_word_tokens(sentences_list)

    # Calculate the number of types (unique words)
    types = len(set(tokens))

    # Calculate the number of tokens (total words)
    tokens_total = len(tokens)

    # Calculate the type-token ratio (TTR)
    ttr = types / tokens_total if tokens_total > 0 else 0

    return ttr

In [30]:
ratio_list1 = calculate_ttr(org_sentences)
ratio_list2 = calculate_ttr(same_sentences)

print(f"Text-to-token ratio for list1: {ratio_list1:.2f}")
print(f"Text-to-token ratio for list2: {ratio_list2:.2f}")


10000
10000
Text-to-token ratio for list1: 0.13
Text-to-token ratio for list2: 0.06


In [24]:
def calculate_hapax_legomenon_ratio(sentences_list):
    def get_word_tokens(sentences):
        tokens = []
        for sentence in sentences:
            # Split each sentence into words
            words = sentence.split()
            tokens.extend(words)
        return tokens

    # Get word tokens from the given list
    tokens = get_word_tokens(sentences_list)

    # Calculate the number of hapax legomena (words occurring only once)
    hapax_legomena = [word for word in set(tokens) if tokens.count(word) == 1]

    # Calculate the hapax legomenon ratio (HLR)
    hlr = len(hapax_legomena) / len(tokens) if len(tokens) > 0 else 0

    return hlr


In [25]:
ratio_list1 = calculate_hapax_legomenon_ratio(org_sentences)
ratio_list2 = calculate_hapax_legomenon_ratio(same_sentences)

print(f"Text-to-token ratio for list1: {ratio_list1:.2f}")
print(f"Text-to-token ratio for list2: {ratio_list2:.2f}")


KeyboardInterrupt: 

## Readability

In [26]:
import textstat

def calculate_readability_scores(sentences):
    # Join the list of sentences into a single text
    text = ' '.join(sentences)

    # Calculate readability scores
    flesch_kincaid = textstat.flesch_kincaid_grade(text)
    gunning_fog = textstat.gunning_fog(text)

    return flesch_kincaid, gunning_fog


In [27]:
flesch_kincaid_score, gunning_fog_score = calculate_readability_scores(same_sentences)

print(f"Flesch-Kincaid Grade Level: {flesch_kincaid_score}")
print(f"Gunning Fog Index: {gunning_fog_score}")


Flesch-Kincaid Grade Level: 23.4
Gunning Fog Index: 18.82


In [28]:
flesch_kincaid_score, gunning_fog_score = calculate_readability_scores(org_sentences)

print(f"Flesch-Kincaid Grade Level: {flesch_kincaid_score}")
print(f"Gunning Fog Index: {gunning_fog_score}")

Flesch-Kincaid Grade Level: 12.5
Gunning Fog Index: 10.44
