In [13]:
import os
import pandas as pd
import nltk
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
import string

In [14]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Analysis of Readability

All the related fields such as complex words,syllable count etc are calculated in the functions of format "func_name_1"

In [31]:
def count_syllables_1(word):
    vowels = 'aeiouy'
    count = 0

    for letter in word:
        if letter.lower() in vowels:
            count += 1

    if count > 1 and word.endswith(('es', 'ed')):
        count -= 1

    return count


In [32]:
def calculate_metrics_1(text):
    words = word_tokenize(text)
    sentences = sent_tokenize(text)

    num_words = len(words)
    num_sentences = len(sentences)

    complex_words = [word for word in words if count_syllables_1(word) > 2]
    num_complex_words = len(complex_words)

    syllable_count_per_word = sum(count_syllables_1(word) for word in words) / num_words

    average_sentence_length = num_words / num_sentences
    percentage_complex_words = num_complex_words / num_words * 100
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

    return {
        'Average Sentence Length': average_sentence_length,
        'Percentage of Complex Words': percentage_complex_words,
        'Fog Index': fog_index,
        'Count of Complex Words': num_complex_words,
        'Syllable Count Per Word': syllable_count_per_word
    }

In [33]:
def analyze_text_files_1(folder_path):
    result_data = []

    for file_name in os.listdir(folder_path):
        if file_name.endswith('.txt'):
            file_path = os.path.join(folder_path, file_name)

            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                metrics = calculate_metrics_1(text)
                result_data.append({'File': file_name, **metrics})

    result_df = pd.DataFrame(result_data)
    return result_df

Personal Pronoun Counts,Average Word Length calculated in functions of format "func_name_2"

In [50]:
def count_personal_pronouns_2(text):
    personal_pronouns = ["I", "we", "my", "ours", "us","We","My","Ours","Us"]
    pronoun_counts = {pronoun: text.count(pronoun) for pronoun in personal_pronouns}
    pronoun_counts["us"] -= text.lower().count("us")
    
    return pronoun_counts


In [51]:
def calculate_metrics_2(text):
    words = word_tokenize(text)

    personal_pronoun_counts = count_personal_pronouns_2(text)

    total_characters = sum(len(word) for word in words)
    total_words = len(words)
    average_word_length = total_characters / total_words if total_words > 0 else 0

    
    
    return {
        'Personal Pronoun Counts': personal_pronoun_counts,
        'Average Word Length': average_word_length
    }


In [52]:
def analyze_text_files_2(folder_path):
    result_data = []

    for file_name in os.listdir(folder_path):
        if file_name.endswith('.txt'):
            file_path = os.path.join(folder_path, file_name)

            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                metrics = calculate_metrics_2(text)
                result_data.append({'File': file_name, **metrics})

    result_df = pd.DataFrame(result_data)
    
    
    return result_df

Calculating clean words

In [54]:
def calculate_cleaned_word_count_3(text):
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    cleaned_words = [word.lower() for word in words if word.lower() not in stop_words and word not in string.punctuation]
    word_count = len(cleaned_words)

    return word_count

In [46]:


def analyze_text_files_3(folder_path):
    result_data = []

    for file_name in os.listdir(folder_path):
        if file_name.endswith('.txt'):
            file_path = os.path.join(folder_path, file_name)

            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                word_count = calculate_cleaned_word_count_3(text)
                
                result_data.append({'File': file_name, 'Cleaned Word Count': word_count})

    result_df = pd.DataFrame(result_data)
    return result_df



In [39]:
folder_path = 'output'

In [40]:
result_dataframe_1 = analyze_text_files_1(folder_path)
result_dataframe_1

Unnamed: 0,File,Average Sentence Length,Percentage of Complex Words,Fog Index,Count of Complex Words,Syllable Count Per Word
0,blackassign_blackassign0001.txt,22.323944,20.694006,17.207180,328,1.663722
1,blackassign_blackassign0002.txt,23.728395,27.627471,20.542347,531,1.855359
2,blackassign_blackassign0003.txt,24.032787,32.810368,22.737262,481,1.995225
3,blackassign_blackassign0004.txt,26.035714,31.618656,23.061748,461,1.955418
4,blackassign_blackassign0005.txt,22.977273,27.299703,20.110790,276,1.838773
...,...,...,...,...,...,...
93,blackassign_blackassign0096.txt,27.018519,25.222755,20.896510,368,1.823852
94,blackassign_blackassign0097.txt,33.767442,21.556474,22.129566,313,1.645317
95,blackassign_blackassign0098.txt,46.777778,27.315914,29.637477,115,1.698337
96,blackassign_blackassign0099.txt,25.848485,23.329426,19.671164,199,1.691676


In [53]:
result_dataframe_2 = analyze_text_files_2(folder_path)
result_dataframe_2

Unnamed: 0,File,Personal Pronoun Counts,Average Word Length
0,blackassign_blackassign0001.txt,"{'I': 78, 'we': 10, 'my': 10, 'ours': 0, 'us':...",4.361514
1,blackassign_blackassign0002.txt,"{'I': 41, 'we': 18, 'my': 6, 'ours': 0, 'us': ...",4.907388
2,blackassign_blackassign0003.txt,"{'I': 23, 'we': 22, 'my': 2, 'ours': 1, 'us': ...",5.352660
3,blackassign_blackassign0004.txt,"{'I': 19, 'we': 9, 'my': 5, 'ours': 1, 'us': 0...",5.204390
4,blackassign_blackassign0005.txt,"{'I': 17, 'we': 10, 'my': 2, 'ours': 0, 'us': ...",4.974283
...,...,...,...
93,blackassign_blackassign0096.txt,"{'I': 21, 'we': 12, 'my': 5, 'ours': 0, 'us': ...",4.904044
94,blackassign_blackassign0097.txt,"{'I': 18, 'we': 16, 'my': 3, 'ours': 0, 'us': ...",4.369835
95,blackassign_blackassign0098.txt,"{'I': 15, 'we': 2, 'my': 4, 'ours': 0, 'us': 0...",4.684086
96,blackassign_blackassign0099.txt,"{'I': 27, 'we': 5, 'my': 2, 'ours': 0, 'us': 0...",4.452521


In [56]:
result_dataframe_3 = analyze_text_files_3(folder_path)
result_dataframe_3

Unnamed: 0,File,Cleaned Word Count
0,blackassign_blackassign0001.txt,794
1,blackassign_blackassign0002.txt,1044
2,blackassign_blackassign0003.txt,826
3,blackassign_blackassign0004.txt,825
4,blackassign_blackassign0005.txt,566
...,...,...
93,blackassign_blackassign0096.txt,803
94,blackassign_blackassign0097.txt,684
95,blackassign_blackassign0098.txt,249
96,blackassign_blackassign0099.txt,466


In [58]:

common_column = 'File'
merged_df = result_dataframe_1.merge(result_dataframe_2, on=common_column, how='outer')
final_merged_df = merged_df.merge(result_dataframe_3, on=common_column, how='outer')

# Assuming final_merged_df is your DataFrame
final_merged_df.to_csv('file_2.csv', index=False)

