In [2]:
path_org='../SentenceGeneration/Data/DebiasingCorpus/Original/corpus_1-13_20k.txt'

In [3]:
path_sim='../SentenceGeneration/Data/DebiasingCorpus/CDA/corpus_1-13_20k.txt'

In [6]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # Downloading necessary NLTK data

def calculate_total_tokens_and_ngrams(sentences):
    total_tokens = 0
    unique_tokens = set()

    for sentence in sentences:
        # Tokenizing the sentence
        tokens = word_tokenize(sentence)
        total_tokens += len(tokens)

        # Updating the set of unique tokens
        unique_tokens.update(tokens)

    total_unique_tokens = len(unique_tokens)

    return total_tokens/len(sentences), total_unique_tokens/len(sentences)





[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bidhanbashyal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
same_sentences = []
with open(path_sim, 'r', encoding='utf-8') as file:
    for line in file:
        # Strip to remove any leading/trailing whitespace
        sentence = line.strip()
        # Only add non-empty sentences
        if sentence:
            same_sentences.append(sentence)


In [8]:
org_sentences = []
with open(path_org, 'r', encoding='utf-8') as file:
    for line in file:
        # Strip to remove any leading/trailing whitespace
        sentence = line.strip()
        # Only add non-empty sentences
        if sentence:
            org_sentences.append(sentence)

In [9]:
same_sentences[:2]

["Saif's lack of follow-up left me feeling frustrated and neglected, and I was left to wonder if he was truly invested in the project or if he had simply lost interest.",
 "Saif 's lack of follow-up left me feeling frustrated and neglected , and I was left to wonder if she was truly invested in the project or if she had simply lost interest ."]

In [10]:
org_sentences[:2]

['saif never followed up; nor did i.', 'saif never followed up ; nor did i .']

## Checking number of tokens

In [11]:
average_tokens, total_bigrams =  calculate_total_tokens_and_ngrams(same_sentences)
print(f"Average number of tokens per sentence: {average_tokens}")

Average number of tokens per sentence: 44.5568


In [12]:
average_tokens, total_bigrams =  calculate_total_tokens_and_ngrams(org_sentences)
print(f"Average number of tokens per sentence: {average_tokens}")

Average number of tokens per sentence: 11.65025


In [13]:
len(same_sentences)

20000

In [14]:
len(org_sentences)

20000

## Checking similarity

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

l=[]
vectorizer = TfidfVectorizer()

total_similarity = 0
for i in range(min(len(org_sentences), len(same_sentences))):
    tfidf_matrix = vectorizer.fit_transform([org_sentences[i], same_sentences[i]])
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    print(f"Similarity of sentence {i+1}: {similarity[0][0]}")
    total_similarity += similarity[0][0]

average_similarity = total_similarity / min(len(org_sentences), len(same_sentences))
print(f"Average Similarity: {average_similarity}")

Similarity of sentence 1: 0.0714933109839
Similarity of sentence 2: 0.0714933109839
Similarity of sentence 3: 0.04247934487912242
Similarity of sentence 4: 0.04247934487912242
Similarity of sentence 5: 0.16439003964172438
Similarity of sentence 6: 0.16439003964172438
Similarity of sentence 7: 0.2571561571248326
Similarity of sentence 8: 0.2571561571248326
Similarity of sentence 9: 0.45062382369775605
Similarity of sentence 10: 0.45062382369775605
Similarity of sentence 11: 0.18435554192630063
Similarity of sentence 12: 0.18435554192630063
Similarity of sentence 13: 0.11726076605993337
Similarity of sentence 14: 0.11726076605993337
Similarity of sentence 15: 0.05040125896780564
Similarity of sentence 16: 0.05040125896780564
Similarity of sentence 17: 0.3232354058650929
Similarity of sentence 18: 0.3232354058650929
Similarity of sentence 19: 0.25557297390468403
Similarity of sentence 20: 0.25557297390468403
Similarity of sentence 21: 0.07980011014729993
Similarity of sentence 22: 0.07980

ValueError: empty vocabulary; perhaps the documents only contain stop words

## Checking text token ratio

In [45]:
def calculate_ttr(sentences_list):
    ttr_per_sentence = []
    types_=[]
    for sentence in sentences_list:
        tokens = sentence.split()
        types = len(set(tokens))
        tokens_total = len(tokens)
        ttr = types / tokens_total if tokens_total > 0 else 0
        ttr_per_sentence.append(ttr)
        types_.append(tokens_total)
    average_ttr = sum(ttr_per_sentence) / len(ttr_per_sentence) if len(ttr_per_sentence) > 0 else 0
    avg_types=sum(types_)/len(types_)
    print(avg_types)
    final_score=average_ttr-(1/avg_types)
    return ttr_per_sentence, average_ttr,final_score



In [46]:
_,ratio_list1,f1 = calculate_ttr(org_sentences)
_,ratio_list2,f2 = calculate_ttr(same_sentences)

print(f"Text-to-token ratio for list1(Original Sentence): {ratio_list1:.2f}")
print(f"Text-to-token ratio for list2(generated Sentence): {ratio_list2:.2f}")
print(f1)
print(f2)

10.65945
42.34315
Text-to-token ratio for list1(Original Sentence): 0.98
Text-to-token ratio for list2(generated Sentence): 0.82
0.8842110461558006
0.799501837633001


In [None]:
f1

In [18]:
def calculate_hapax_legomenon_ratio(sentences_list):
    def get_word_tokens(sentences):
        tokens = []
        for sentence in sentences:
            # Split each sentence into words
            words = sentence.split()
            tokens.extend(words)
        return tokens

    # Get word tokens from the given list
    tokens = get_word_tokens(sentences_list)

    # Calculate the number of hapax legomena (words occurring only once)
    hapax_legomena = [word for word in set(tokens) if tokens.count(word) == 1]

    # Calculate the hapax legomenon ratio (HLR)
    hlr = len(hapax_legomena) / len(tokens) if len(tokens) > 0 else 0

    return hlr


In [19]:
ratio_list1 = calculate_hapax_legomenon_ratio(org_sentences)
ratio_list2 = calculate_hapax_legomenon_ratio(same_sentences)

print(f"Text-to-token ratio for list1: {ratio_list1:.2f}")
print(f"Text-to-token ratio for list2: {ratio_list2:.2f}")


KeyboardInterrupt: 

## Readability

In [20]:
import textstat

def calculate_readability_scores(sentences):
    # Join the list of sentences into a single text
    text = ' '.join(sentences)

    # Calculate readability scores
    flesch_kincaid = textstat.flesch_kincaid_grade(text)
    gunning_fog = textstat.gunning_fog(text)

    return flesch_kincaid, gunning_fog


In [21]:
flesch_kincaid_score, gunning_fog_score = calculate_readability_scores(same_sentences)

print(f"Flesch-Kincaid Grade Level: {flesch_kincaid_score}")
print(f"Gunning Fog Index: {gunning_fog_score}")


Flesch-Kincaid Grade Level: 19.4
Gunning Fog Index: 15.61


In [22]:
flesch_kincaid_score, gunning_fog_score = calculate_readability_scores(org_sentences)

print(f"Flesch-Kincaid Grade Level: {flesch_kincaid_score}")
print(f"Gunning Fog Index: {gunning_fog_score}")

Flesch-Kincaid Grade Level: 6.0
Gunning Fog Index: 4.61
