In [None]:
import os
import nltk 
import pandas as pd 
from nltk.tokenize import word_tokenize
import re

In [None]:
# Download the tokenizer data used by nltk.sent_tokenize and word_tokenize below.
# Newer NLTK releases load the 'punkt_tab' resource instead of the pickled 'punkt'
# models, so request both to keep the notebook runnable across NLTK versions.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)

In [None]:
# Load the output sheet; the metric cells below add columns to `df` and write it
# back to this same path.
csv_file_path = 'OutputDataStructure.csv'
df = pd.read_csv(csv_file_path)


Word count for each article

In [None]:
def count_words_in_file(file_path):
    """Return the number of whitespace-separated tokens in a UTF-8 text file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        int: Length of ``str.split()`` applied to the whole file content.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return len(contents.split())

In [None]:
def count_words_in_directory(directory_path):
    """Map every ``.txt`` file name in a directory to its word count.

    Args:
        directory_path: Directory whose entries are listed with ``os.listdir``.

    Returns:
        dict: ``{file name: word count}`` for each entry ending in ``.txt``,
        counted with ``count_words_in_file``.
    """
    return {
        entry: count_words_in_file(os.path.join(directory_path, entry))
        for entry in os.listdir(directory_path)
        if entry.endswith('.txt')
    }

In [None]:
# Count the words of every .txt file in the 'cleaned_output' directory.
directory_path = 'cleaned_output'
file_word_counts = count_words_in_directory(directory_path)

# Print one "<file>: <n> words" line per counted file.
for filename, word_count in file_word_counts.items():       # To print dictionary items i.e key : value pair
    print(f'{filename}: {word_count} words')

In [None]:
# Copy each article's word count into its row, matching on "<URL_ID>.txt";
# rows without a counted file are left untouched.
for index, row in df.iterrows():
    filename = f"{row['URL_ID']}.txt" 
    if filename in file_word_counts:
        df.loc[index, 'WORD COUNT'] = file_word_counts[filename]

# Write the updated frame back over the CSV it was loaded from.
df.to_csv(csv_file_path, index=False)

Word count for each article in the `output` directory

In [None]:
# Repeat the word count for the 'output' directory.
# Note: this rebinds both `directory_path` and `file_word_counts`.
directory_path = 'output'
file_word_counts = count_words_in_directory(directory_path)

# Print one "<file>: <n> words" line per counted file.
for filename, total_words in file_word_counts.items():
    print(f'{filename}: {total_words} words')

Average word length

In [None]:
def calculate_average_word_length(file_path):
    """Return the mean character length of the whitespace-separated words in a file.

    Args:
        file_path: Path of a UTF-8 text file.

    Returns:
        float | int: Total characters across all words divided by the number of
        words, or ``0`` when the file contains no words.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        tokens = handle.read().split()
    if not tokens:
        # No words at all: report 0 rather than dividing by zero.
        return 0
    char_total = sum(map(len, tokens))
    return char_total / len(tokens)

In [None]:
def process_files_in_directory(directory_path):
    """Map every ``.txt`` file name in a directory to its average word length.

    Args:
        directory_path: Directory whose entries are listed with ``os.listdir``.

    Returns:
        dict: ``{file name: average word length}`` computed with
        ``calculate_average_word_length``.
    """
    return {
        entry: calculate_average_word_length(os.path.join(directory_path, entry))
        for entry in os.listdir(directory_path)
        if entry.endswith('.txt')
    }

In [None]:
# Compute the average word length of every .txt file in 'cleaned_output'.
directory_path = 'cleaned_output' 
file_averages = process_files_in_directory(directory_path)

# Print each file's average, rounded to two decimals.
for filename, average_word_length in file_averages.items():
    print(f"{filename}: average word length: {average_word_length:.2f}")

In [None]:
# Copy each article's average word length into its row, matching on "<URL_ID>.txt";
# rows without a processed file are left untouched.
for index, row in df.iterrows():
    filename = f"{row['URL_ID']}.txt"  
    if filename in file_averages:
        df.loc[index, 'AVG WORD LENGTH'] = file_averages[filename]

# Write the updated frame back over the CSV it was loaded from.
df.to_csv(csv_file_path, index=False)

Total sentences for each file

In [None]:
def count_sentences(file_path):
    """Return the number of sentences NLTK finds in a text file.

    The file is decoded as UTF-8 with undecodable bytes ignored, then split
    with ``nltk.sent_tokenize``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        int: Number of sentences returned by the tokenizer.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        contents = handle.read()
    return len(nltk.sent_tokenize(contents))

In [None]:
def count_sentences_in_directory(directory_path):
    """Map every ``.txt`` file name in a directory to its sentence count.

    Args:
        directory_path: Directory whose entries are listed with ``os.listdir``.

    Returns:
        dict: ``{file name: sentence count}`` computed with ``count_sentences``.
    """
    return {
        entry: count_sentences(os.path.join(directory_path, entry))
        for entry in os.listdir(directory_path)
        if entry.endswith('.txt')
    }



In [None]:
# Count the sentences of every .txt file in the 'output' directory.
directory_path = 'output'
sentences_per_file = count_sentences_in_directory(directory_path)

# Print one line per file with its sentence count.
for filename, num_sentences in sentences_per_file.items():
    print(f'{filename}: Number of Sentences: {num_sentences}')


Load directories

In [None]:
# Directory names and the paths of the two sentiment word lists.
cleaned_text_dir = 'cleaned_output'
master_dict_dir = 'MasterDictionary'
extracted_text_dir = 'output'
# Both word lists live inside the master dictionary directory.
positive_words_file = os.path.join(master_dict_dir, 'positive-words.txt')
negative_words_file = os.path.join(master_dict_dir, 'negative-words.txt')

In [None]:
def load_words(file_path):
    """Read a one-entry-per-line word list into a set.

    Each line is stripped of surrounding whitespace and lower-cased. The file
    is decoded as UTF-8 with undecodable bytes ignored. Blank lines contribute
    the empty string.

    Args:
        file_path: Path of the word-list file.

    Returns:
        set[str]: The distinct normalized lines of the file.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        return {raw_line.strip().lower() for raw_line in handle}

In [None]:
# Load both word lists into the module-level sets that the positive/negative
# score cells below read.
positive_words = load_words(positive_words_file)
negative_words = load_words(negative_words_file)

In [None]:
# Report how many distinct entries each word set contains.
print(f"Total positive words: {len(positive_words)}")
print(f"Total negative words: {len(negative_words)}")

Tokenize the text

In [None]:
def calculate_sentiment(text, positive_words, negative_words):      # Can remove positive_words, negative_words from parameters.
    """Lower-case ``text`` and tokenize it with NLTK's ``word_tokenize``.

    NOTE(review): the token list is neither used nor returned, so this function
    always returns ``None`` and never consults ``positive_words`` or
    ``negative_words``. It looks like an unfinished stub; confirm whether it
    should be completed or removed.
    """
    tokens = word_tokenize(text.lower())

Positive score

In [None]:
def calculate_positive_score(file_path, positive_words):
    """Count the tokens of a file that appear in ``positive_words``.

    The file is read as UTF-8, lower-cased and tokenized with NLTK's
    ``word_tokenize``. Every occurrence counts, so repeated words add up.

    Args:
        file_path: Path of the text file to score.
        positive_words: Container of positive words, tested with ``in``.

    Returns:
        int: Number of tokens found in ``positive_words``.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        lowered = handle.read().lower()
    hits = [tok for tok in word_tokenize(lowered) if tok in positive_words]
    return len(hits)

In [None]:
def process_files_in_directory(directory_path, positive_words_file):
    """Map every ``.txt`` file name in a directory to its positive score.

    Args:
        directory_path: Directory whose entries are listed with ``os.listdir``.
        positive_words_file: Accepted but not read by this function; scoring
            uses the module-level ``positive_words`` set instead.

    Returns:
        dict: ``{file name: positive score}`` computed with
        ``calculate_positive_score``.
    """
    scores = {}
    for entry in os.listdir(directory_path):
        if not entry.endswith('.txt'):
            continue
        scores[entry] = calculate_positive_score(
            os.path.join(directory_path, entry), positive_words
        )
    return scores

In [None]:
# Score every .txt file in 'cleaned_output' against the positive word list.
directory_path = 'cleaned_output'
# Keep `positive_words_file` pointing at the real file inside MasterDictionary.
# Rebinding it to the bare file name would break any later re-run of the cell
# that loads the word list from this variable.
positive_words_file = os.path.join('MasterDictionary', 'positive-words.txt')
file_positive_scores = process_files_in_directory(directory_path, positive_words_file)

# Print one line per file with its positive score.
for filename, positive_score in file_positive_scores.items():
    print(f"{filename}: Positive score = {positive_score}")

In [None]:
# Copy each article's positive score into its row, matching on "<URL_ID>.txt";
# rows without a scored file are left untouched.
for index, row in df.iterrows():
    filename = f"{row['URL_ID']}.txt" 
    if filename in file_positive_scores:
        df.loc[index, 'POSITIVE SCORE'] = file_positive_scores[filename]

# Write the updated frame back over the CSV it was loaded from.
df.to_csv(csv_file_path, index=False)

Negative score

In [None]:
def calculate_negative_score(file_path, negative_words):
    """Return minus the number of tokens of a file found in ``negative_words``.

    The file is read as UTF-8, lower-cased and tokenized with NLTK's
    ``word_tokenize``. Each matching token contributes -1, so the result is
    zero or negative.

    Args:
        file_path: Path of the text file to score.
        negative_words: Container of negative words, tested with ``in``.

    Returns:
        int: The negated count of matching tokens.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        lowered = handle.read().lower()
    match_count = 0
    for tok in word_tokenize(lowered):
        if tok in negative_words:
            match_count += 1
    return -match_count

In [None]:
def process_files_in_directory(directory_path, negative_words_file):
    """Map every ``.txt`` file name in a directory to its negative score.

    ``calculate_negative_score`` returns a non-positive number; it is
    multiplied by -1 here so the stored score is non-negative.

    Args:
        directory_path: Directory whose entries are listed with ``os.listdir``.
        negative_words_file: Accepted but not read by this function; scoring
            uses the module-level ``negative_words`` set instead.

    Returns:
        dict: ``{file name: negative score}``.
    """
    scores = {}
    for entry in os.listdir(directory_path):
        if not entry.endswith('.txt'):
            continue
        raw_score = calculate_negative_score(
            os.path.join(directory_path, entry), negative_words
        )
        scores[entry] = -1 * raw_score
    return scores

In [None]:
# Score every .txt file in 'cleaned_output' against the negative word list.
directory_path = 'cleaned_output'
# Keep `negative_words_file` pointing at the real file inside MasterDictionary.
# Rebinding it to the bare file name would break any later re-run of the cell
# that loads the word list from this variable.
negative_words_file = os.path.join('MasterDictionary', 'negative-words.txt')
file_negative_scores = process_files_in_directory(directory_path, negative_words_file)

# Print one line per file with its negative score.
for filename, negative_score in file_negative_scores.items():
    print(f"{filename}: Negative score = {negative_score}")

In [None]:
# Copy each article's negative score into its row, matching on "<URL_ID>.txt";
# rows without a scored file are left untouched.
for index, row in df.iterrows():
    filename = f"{row['URL_ID']}.txt" 
    if filename in file_negative_scores:
        df.loc[index, 'NEGATIVE SCORE'] = file_negative_scores[filename]

# Write the updated frame back over the CSV it was loaded from.
df.to_csv(csv_file_path, index=False)

Polarity score

In [None]:
def load_words(word_dict_path):
    """Read a one-entry-per-line word list into a set.

    Each line is stripped of surrounding whitespace; no case folding is
    applied. The file is decoded as UTF-8 with undecodable bytes ignored.

    Args:
        word_dict_path: Path of the word-list file.

    Returns:
        set[str]: The distinct stripped lines of the file.
    """
    entries = set()
    with open(word_dict_path, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw_line in handle:
            entries.add(raw_line.strip())
    return entries

In [None]:
def calculate_scores(file_path, positive_words_, negative_words_):
    """Count positive and negative word occurrences in a text file.

    The file is decoded as UTF-8 (undecodable bytes ignored), lower-cased and
    split into tokens with the regex ``\\b\\w+\\b``.

    Args:
        file_path: Path of the text file to score.
        positive_words_: Container of positive words, tested with ``in``.
        negative_words_: Container of negative words, tested with ``in``.

    Returns:
        tuple[int, int]: ``(positive count, negative count)``.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        lowered = handle.read().lower()
    pos_hits = 0
    neg_hits = 0
    for token in re.findall(r'\b\w+\b', lowered):
        if token in positive_words_:
            pos_hits += 1
        if token in negative_words_:
            neg_hits += 1
    return pos_hits, neg_hits

In [None]:
def calculate_polarity_score(positive_score_ , negative_score_):
    """Return ``(positive - negative) / (positive + negative + 0.000001)``.

    The small constant in the denominator keeps the division defined when both
    scores are zero.

    Args:
        positive_score_: Count of positive words.
        negative_score_: Count of negative words.

    Returns:
        float: The polarity score.
    """
    difference = positive_score_ - negative_score_
    return difference / ((positive_score_ + negative_score_) + 0.000001)


In [None]:
def process_files_in_directory(directory_path, positive_dict_path, negative_dict_path):
    """Map every ``.txt`` file name in a directory to its polarity score.

    Both word lists are loaded once with ``load_words``; each file is then
    scored with ``calculate_scores`` and ``calculate_polarity_score``.

    Args:
        directory_path: Directory whose entries are listed with ``os.listdir``.
        positive_dict_path: Path of the positive word list.
        negative_dict_path: Path of the negative word list.

    Returns:
        dict: ``{file name: polarity score}``.
    """
    pos_lexicon = load_words(positive_dict_path)
    neg_lexicon = load_words(negative_dict_path)

    polarity_by_file = {}
    for entry in os.listdir(directory_path):
        if not entry.endswith('.txt'):
            continue
        pos_hits, neg_hits = calculate_scores(
            os.path.join(directory_path, entry), pos_lexicon, neg_lexicon
        )
        polarity_by_file[entry] = calculate_polarity_score(pos_hits, neg_hits)
    return polarity_by_file

In [None]:
# Compute the polarity score of every .txt file in 'cleaned_output'.
directory_path = 'cleaned_output'
# Build the word-list paths with os.path.join so they resolve on every OS.
# A hard-coded backslash is only a path separator on Windows.
positive_dict_path = os.path.join('MasterDictionary', 'positive-words.txt')
negative_dict_path = os.path.join('MasterDictionary', 'negative-words.txt')

file_polarity_scores = process_files_in_directory(directory_path, positive_dict_path, negative_dict_path)

# Print each file's polarity score, rounded to two decimals.
for filename, polarity_score in file_polarity_scores.items():
    print(f"{filename}: Polarity score = {polarity_score:.2f}")

In [None]:
# Copy each article's polarity score into its row, matching on "<URL_ID>.txt";
# rows without a scored file are left untouched.
for index, row in df.iterrows():
    filename = f"{row['URL_ID']}.txt" 
    if filename in file_polarity_scores:
        df.loc[index, 'POLARITY SCORE'] = file_polarity_scores[filename]

# Write the updated frame back over the CSV it was loaded from.
df.to_csv(csv_file_path, index=False)

Subjectivity score

In [None]:
def load_word_list(file_path):
    """Read a one-entry-per-line word list into a set.

    Each line is stripped of surrounding whitespace; no case folding is
    applied. The file is decoded as UTF-8 with undecodable bytes ignored.

    Args:
        file_path: Path of the word-list file.

    Returns:
        set[str]: The distinct stripped lines of the file.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        return {raw_line.strip() for raw_line in handle}

In [None]:
def get_positive_score(file_path, positive_words):
    """Count the tokens of a file that appear in ``positive_words``.

    The file is decoded as UTF-8 (undecodable bytes ignored), lower-cased and
    split into tokens with the regex ``\\b\\w+\\b``.

    Args:
        file_path: Path of the text file to score.
        positive_words: Container of positive words, tested with ``in``.

    Returns:
        int: Number of matching tokens.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        lowered = handle.read().lower()
    matches = [tok for tok in re.findall(r'\b\w+\b', lowered) if tok in positive_words]
    return len(matches)

def get_negative_score(file_path, negative_words):
    """Count the tokens of a file that appear in ``negative_words``.

    The file is decoded as UTF-8 (undecodable bytes ignored), lower-cased and
    split into tokens with the regex ``\\b\\w+\\b``.

    Args:
        file_path: Path of the text file to score.
        negative_words: Container of negative words, tested with ``in``.

    Returns:
        int: Number of matching tokens.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        lowered = handle.read().lower()
    matches = [tok for tok in re.findall(r'\b\w+\b', lowered) if tok in negative_words]
    return len(matches)

def get_total_words(file_path):
    """Return how many ``\\b\\w+\\b`` regex tokens a text file contains.

    The file is decoded as UTF-8 with undecodable bytes ignored; its case is
    left unchanged.

    Args:
        file_path: Path of the text file to read.

    Returns:
        int: Number of regex matches.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        contents = handle.read()
    return len(re.findall(r'\b\w+\b', contents))

In [None]:
def calculate_subjectivity_score(positive_score, negative_score, total_words):
    """Return ``(positive + negative) / (total_words + 0.000001)``.

    The small constant keeps the division defined when ``total_words`` is 0.

    Args:
        positive_score: Count of positive words.
        negative_score: Count of negative words.
        total_words: Total number of words in the text.

    Returns:
        float: The subjectivity score.
    """
    matched = positive_score + negative_score
    return matched / (total_words + 0.000001)

In [None]:
def process_files_in_directory(directory_path, positive_dict_path, negative_dict_path):
    """Map every ``.txt`` file name in a directory to its subjectivity score.

    Both word lists are loaded once with ``load_word_list``. Each file is then
    scored from its positive count, negative count and total word count.

    Args:
        directory_path: Directory whose entries are listed with ``os.listdir``.
        positive_dict_path: Path of the positive word list.
        negative_dict_path: Path of the negative word list.

    Returns:
        dict: ``{file name: subjectivity score}``.
    """
    pos_lexicon = load_word_list(positive_dict_path)
    neg_lexicon = load_word_list(negative_dict_path)

    subjectivity_by_file = {}
    for entry in os.listdir(directory_path):
        if not entry.endswith('.txt'):
            continue
        target = os.path.join(directory_path, entry)
        pos_hits = get_positive_score(target, pos_lexicon)
        neg_hits = get_negative_score(target, neg_lexicon)
        word_total = get_total_words(target)
        subjectivity_by_file[entry] = calculate_subjectivity_score(pos_hits, neg_hits, word_total)

    return subjectivity_by_file

In [None]:
# Compute the subjectivity score of every .txt file in 'cleaned_output'.
directory_path = 'cleaned_output'
# Build the word-list paths with os.path.join so they resolve on every OS.
# A hard-coded backslash is only a path separator on Windows.
positive_dict_path = os.path.join('MasterDictionary', 'positive-words.txt')
negative_dict_path = os.path.join('MasterDictionary', 'negative-words.txt')

file_subjectivity_scores = process_files_in_directory(directory_path, positive_dict_path, negative_dict_path)

# Print each file's subjectivity score with six decimals.
for filename, subjectivity_score in file_subjectivity_scores.items():
    print(f"{filename}: Subjectivity Score = {subjectivity_score:.6f}")


In [None]:
# Copy each article's subjectivity score into its row, matching on "<URL_ID>.txt";
# rows without a scored file are left untouched.
for index, row in df.iterrows():
    filename = f"{row['URL_ID']}.txt" 
    if filename in file_subjectivity_scores:
        df.loc[index, 'SUBJECTIVITY SCORE'] = file_subjectivity_scores[filename]

# Write the updated frame back over the CSV it was loaded from.
df.to_csv(csv_file_path, index=False)

Average number of words per sentence = the total number of words / the total number of sentences


In [None]:
def calculate_average_words_per_sentence(directory_path):
    """Map every ``.txt`` file name in a directory to its words-per-sentence average.

    Words are counted with ``count_words_in_file`` and sentences with
    ``count_sentences``, both on the same file.

    Args:
        directory_path: Directory whose entries are listed with ``os.listdir``.

    Returns:
        dict: ``{file name: word count / sentence count}``. A file with no
        sentences maps to ``0``.
    """
    file_avg_words_per_sentence = {}
    for filename in os.listdir(directory_path):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(directory_path, filename)
        word_count_ = count_words_in_file(file_path)
        num_sentences = count_sentences(file_path)
        # An empty file yields zero sentences; report 0 instead of raising
        # ZeroDivisionError and aborting the whole directory scan.
        file_avg_words_per_sentence[filename] = (
            word_count_ / num_sentences if num_sentences > 0 else 0
        )
    return file_avg_words_per_sentence


In [None]:
# `directory_path` is rebound here, but the call below reads `directory_path2`
# ('output'), so the averages come from the 'output' directory.
directory_path = 'cleaned_output'
directory_path2 = 'output'

file_avg_words_per_sentence = calculate_average_words_per_sentence(directory_path2)

# Print each file's average, rounded to two decimals.
print("\nAverage number of words per sentence:")
for filename, average_length in file_avg_words_per_sentence.items():
    print(f'{filename}: {average_length:.2f} words per sentence')


In [None]:
# Copy each article's words-per-sentence average into its row, matching on
# "<URL_ID>.txt"; rows without a processed file are left untouched.
for index, row in df.iterrows():
    filename = f"{row['URL_ID']}.txt" 
    if filename in file_avg_words_per_sentence:
        df.loc[index, 'AVG NUMBER OF WORDS PER SENTENCE'] = file_avg_words_per_sentence[filename]

# Write the updated frame back over the CSV it was loaded from.
df.to_csv(csv_file_path, index=False)

Average sentence length = the number of words / the number of sentences

In [None]:
def calculate_average_sentence_length(words_directory_path, sentences_directory_path):
    """Compute words-per-sentence using word and sentence counts from two directories.

    Word counts come from ``count_words_in_directory(words_directory_path)``
    and sentence counts from
    ``count_sentences_in_directory(sentences_directory_path)``. The two are
    joined on file name.

    Args:
        words_directory_path: Directory whose files supply the word counts.
        sentences_directory_path: Directory whose files supply the sentence counts.

    Returns:
        dict: ``{file name: word count / sentence count}`` for every file
        present in both directories. A file with no sentences maps to ``0``.
    """
    file_avg_sentence_length = {}
    word_counts = count_words_in_directory(words_directory_path)
    sentence_counts = count_sentences_in_directory(sentences_directory_path)

    for filename, num_sentences in sentence_counts.items():
        word_count = word_counts.get(filename)
        if word_count is None:
            # The file exists only in the sentences directory; there is nothing
            # to divide, so skip it rather than fail on ``None / int``.
            continue
        # Guard against files where the tokenizer found no sentence at all.
        file_avg_sentence_length[filename] = (
            word_count / num_sentences if num_sentences > 0 else 0
        )
    return file_avg_sentence_length


In [None]:
# Word counts are taken from 'cleaned_output', sentence counts from 'output'.
words_directory_path = 'cleaned_output'
sentences_directory_path = 'output'

file_avg_sentence_length = calculate_average_sentence_length(words_directory_path, sentences_directory_path)

# Print each file's average sentence length, rounded to two decimals.
print("\nAverage Sentence Length:")
for filename, average_sentence_length in file_avg_sentence_length.items():
    print(f'{filename}: {average_sentence_length:.2f} words per sentence')

In [None]:
# Copy each article's average sentence length into its row, matching on
# "<URL_ID>.txt"; rows without a processed file are left untouched.
for index, row in df.iterrows():
    filename = f"{row['URL_ID']}.txt" 
    if filename in file_avg_sentence_length:
        df.loc[index, 'AVG SENTENCE LENGTH'] = file_avg_sentence_length[filename]

# Write the updated frame back over the CSV it was loaded from.
df.to_csv(csv_file_path, index=False)

Syllable count


In [None]:
def count_syllables(word):
    """Estimate the number of syllables in a word by counting vowel groups.

    A run of consecutive vowels (``a``, ``e``, ``i``, ``o``, ``u``) counts as
    one syllable. A trailing ``es`` or ``ed`` is excluded from the count, so
    only the ending is discounted rather than the whole word being scored 0.
    Any word that contains a vowel is credited with at least one syllable.

    Args:
        word: The word to analyse; case is ignored.

    Returns:
        int: Estimated syllable count (0 for words without any vowel).
    """
    word = word.lower()
    vowels = "aeiou"
    has_vowel = any(char in vowels for char in word)

    # Discount only the ending itself: "interested" is counted as "interest".
    stem = word[:-2] if word.endswith(('es', 'ed')) else word

    count = 0
    last_char_was_vowel = False
    for char in stem:
        is_vowel = char in vowels
        # Only the first vowel of a consecutive run starts a new syllable.
        if is_vowel and not last_char_was_vowel:
            count += 1
        last_char_was_vowel = is_vowel

    # Short words such as "bed" lose their only vowel with the ending; they
    # still have one syllable.
    if count == 0 and has_vowel:
        return 1
    return count

In [None]:
def count_syllables_in_file(file_path):
    """Return the total syllable count over all words of a UTF-8 text file.

    Words are the matches of the regex ``\\b\\w+\\b``; each is scored with
    ``count_syllables``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        int: Sum of the per-word syllable counts.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    running_total = 0
    for token in re.findall(r'\b\w+\b', contents):
        running_total += count_syllables(token)
    return running_total


In [None]:
def process_files_in_directory(directory_path):
    """Map every ``.txt`` file name in a directory to its total syllable count.

    Args:
        directory_path: Directory whose entries are listed with ``os.listdir``.

    Returns:
        dict: ``{file name: syllable count}`` computed with
        ``count_syllables_in_file``.
    """
    return {
        entry: count_syllables_in_file(os.path.join(directory_path, entry))
        for entry in os.listdir(directory_path)
        if entry.endswith('.txt')
    }

In [None]:
# Count the syllables of every .txt file in 'cleaned_output'.
directory_path = 'cleaned_output'
file_syllable_counts = process_files_in_directory(directory_path)

# Print one line per file with its syllable total.
for filename, syllable_count in file_syllable_counts.items():
    print(f"{filename}: {syllable_count} syllables")

Syllable count per word


In [None]:
def calculate_syllable_per_word(file_path):
    """Return the average number of syllables per word in a text file.

    The file is decoded as UTF-8 with undecodable bytes ignored. Words are the
    matches of the regex ``\\b\\w+\\b``; each is scored with ``count_syllables``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        float | int: Total syllables divided by the number of words, or ``0``
        when the file contains no words.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read()

    words = re.findall(r'\b\w+\b', text)
    total_words_ = len(words)
    if total_words_ == 0:
        # Empty or punctuation-only file: avoid ZeroDivisionError.
        return 0

    total_syllables = sum(count_syllables(word) for word in words)
    return total_syllables / total_words_

In [None]:
def process_files_in_directory(directory_path):
    """Map every ``.txt`` file name in a directory to its syllables-per-word average.

    Args:
        directory_path: Directory whose entries are listed with ``os.listdir``.

    Returns:
        dict: ``{file name: syllables per word}`` computed with
        ``calculate_syllable_per_word``.
    """
    return {
        entry: calculate_syllable_per_word(os.path.join(directory_path, entry))
        for entry in os.listdir(directory_path)
        if entry.endswith('.txt')
    }

In [None]:
# Compute syllables per word for every .txt file in 'cleaned_output'.
directory_path = 'cleaned_output' 
file_syllable_count_per_word = process_files_in_directory(directory_path)

# Print each file's average, rounded to two decimals.
for filename, syllable_per_word in file_syllable_count_per_word.items():
    print(f"{filename}: {syllable_per_word:.2f} syllables")

In [None]:
# Copy each article's syllables-per-word average into its row, matching on
# "<URL_ID>.txt"; rows without a processed file are left untouched.
for index, row in df.iterrows():
    filename = f"{row['URL_ID']}.txt" 
    if filename in file_syllable_count_per_word:
        df.loc[index, 'SYLLABLE PER WORD'] = file_syllable_count_per_word[filename]

# Write the updated frame back over the CSV it was loaded from.
df.to_csv(csv_file_path, index=False)

Complex words count


In [None]:
def count_complex_words_in_file(file_path):
    """Count the words of a UTF-8 text file that have more than two syllables.

    Words are the matches of the regex ``\\b\\w+\\b``; syllables are estimated
    with ``count_syllables``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        int: Number of words whose syllable count exceeds 2.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    complex_tokens = [
        token for token in re.findall(r'\b\w+\b', contents)
        if count_syllables(token) > 2
    ]
    return len(complex_tokens)

In [None]:
def process_files_in_directory(directory_path):
    """Map every ``.txt`` file name in a directory to its complex-word count.

    Args:
        directory_path: Directory whose entries are listed with ``os.listdir``.

    Returns:
        dict: ``{file name: complex word count}`` computed with
        ``count_complex_words_in_file``.
    """
    return {
        entry: count_complex_words_in_file(os.path.join(directory_path, entry))
        for entry in os.listdir(directory_path)
        if entry.endswith('.txt')
    }


In [None]:
# Count the complex words of every .txt file in 'cleaned_output'.
directory_path = 'cleaned_output' 
file_complex_word_counts = process_files_in_directory(directory_path)

# Print one line per file with its complex-word count.
for filename, complex_word_count in file_complex_word_counts.items():
    print(f"{filename}: {complex_word_count} complex words")

In [None]:
# Copy each article's complex-word count into its row, matching on "<URL_ID>.txt";
# rows without a processed file are left untouched.
for index, row in df.iterrows():
    filename = f"{row['URL_ID']}.txt"
    if filename in file_complex_word_counts:
        df.loc[index, 'COMPLEX WORD COUNT'] = file_complex_word_counts[filename]

# Write the updated frame back over the CSV it was loaded from.
df.to_csv(csv_file_path, index=False)

Percentage of complex words

In [None]:
def calculate_percentage_of_complex_words(file_path):
    """Return the percentage of words in a file that have more than two syllables.

    The file is decoded as UTF-8 with undecodable bytes ignored. Words are the
    matches of the regex ``\\b\\w+\\b``; syllables are estimated with
    ``count_syllables``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        float | int: ``complex words / total words * 100``, or ``0`` when the
        file contains no words.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read()

    words = re.findall(r'\b\w+\b', text)
    total_words = len(words)
    if total_words == 0:
        # Empty or punctuation-only file: avoid ZeroDivisionError.
        return 0

    complex_word_count = sum(1 for word in words if count_syllables(word) > 2)
    return (complex_word_count / total_words) * 100

In [None]:
def process_files_in_directory(directory_path):
    """Map every ``.txt`` file name in a directory to its complex-word percentage.

    Args:
        directory_path: Directory whose entries are listed with ``os.listdir``.

    Returns:
        dict: ``{file name: percentage of complex words}`` computed with
        ``calculate_percentage_of_complex_words``.
    """
    return {
        entry: calculate_percentage_of_complex_words(os.path.join(directory_path, entry))
        for entry in os.listdir(directory_path)
        if entry.endswith('.txt')
    }

In [None]:
# Compute the complex-word percentage of every .txt file in 'cleaned_output'.
directory_path = 'cleaned_output'
file_complex_percentages = process_files_in_directory(directory_path)

# Print each file's percentage, rounded to two decimals.
for filename, percentage_of_complex_words in file_complex_percentages.items():
    print(f"{filename}: Percentage of Complex Words = {percentage_of_complex_words:.2f}%")

In [None]:
# Copy each article's complex-word percentage into its row, matching on
# "<URL_ID>.txt"; rows without a processed file are left untouched.
for index, row in df.iterrows():
    filename = f"{row['URL_ID']}.txt"  
    if filename in file_complex_percentages:
        df.loc[index, 'PERCENTAGE OF COMPLEX WORDS'] = file_complex_percentages[filename]

# Write the updated frame back over the CSV it was loaded from.
df.to_csv(csv_file_path, index=False)

Fog index

In [None]:
def calculate_fog_index(words_directory_path, sentences_directory_path):
    """Compute ``0.4 * (average sentence length + percentage of complex words)`` per file.

    Average sentence lengths come from ``calculate_average_sentence_length``.
    The complex-word percentage is computed on the file of the same name
    inside ``words_directory_path``.

    Args:
        words_directory_path: Directory supplying word counts and the text
            scanned for complex words.
        sentences_directory_path: Directory supplying sentence counts.

    Returns:
        dict: ``{file name: fog index}``.
    """
    sentence_lengths = calculate_average_sentence_length(words_directory_path, sentences_directory_path)

    fog_by_file = {}
    for name, sentence_length in sentence_lengths.items():
        complex_pct = calculate_percentage_of_complex_words(
            os.path.join(words_directory_path, name)
        )
        fog_by_file[name] = 0.4 * (sentence_length + complex_pct)

    return fog_by_file

# Word-level inputs come from 'cleaned_output', sentence counts from 'output'.
words_directory_path = 'cleaned_output'
sentences_directory_path = 'output'
fog_indices = calculate_fog_index(words_directory_path, sentences_directory_path)

# Print each file's fog index, rounded to two decimals.
for filename, fog_index in fog_indices.items():
    print(f'Fog Index for {filename}: {fog_index:.2f}')


In [None]:
# Copy each article's fog index into its row, matching on "<URL_ID>.txt";
# rows without a computed index are left untouched.
for index, row in df.iterrows():
    filename = f"{row['URL_ID']}.txt" 
    if filename in fog_indices:
        df.loc[index, 'FOG INDEX'] = fog_indices[filename]

# Write the updated frame back over the CSV it was loaded from.
df.to_csv(csv_file_path, index=False)

Personal pronoun count for each file in the `output` directory

In [None]:
def count_personal_pronouns(text):
    """Count occurrences of the pronouns I, we, my, ours and us in ``text``.

    Matching is case-insensitive and whole-word. The all-caps token ``US`` is
    treated as the country abbreviation and excluded. The decision is made per
    occurrence, so a genuine "us" is still counted in a text that also
    mentions "US".

    Args:
        text: The text to scan.

    Returns:
        int: Number of pronoun occurrences.
    """
    pronoun_pattern = r'\b(I|we|my|ours|us)\b'
    matches = re.findall(pronoun_pattern, text, re.IGNORECASE)
    # Drop only the matches that are literally "US"; keep "us" and "Us".
    return sum(1 for match in matches if match != 'US')

In [None]:
def calculate_personal_pronouns(file_path):
    """Return the personal-pronoun count of a text file.

    The file is decoded as UTF-8 with undecodable bytes ignored and passed to
    ``count_personal_pronouns``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        int: Number of personal pronouns found.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        contents = handle.read()
    return count_personal_pronouns(contents)

In [None]:
def process_files_in_directory(directory_path):
    """Map every ``.txt`` file name in a directory to its personal-pronoun count.

    Args:
        directory_path: Directory whose entries are listed with ``os.listdir``.

    Returns:
        dict: ``{file name: pronoun count}`` computed with
        ``calculate_personal_pronouns``.
    """
    return {
        entry: calculate_personal_pronouns(os.path.join(directory_path, entry))
        for entry in os.listdir(directory_path)
        if entry.endswith('.txt')
    }

In [None]:
# Count personal pronouns in every .txt file of the 'output' directory.
directory_path = 'output' 
file_personal_pronoun_count = process_files_in_directory(directory_path)

# Print one line per file with its pronoun total.
for filename, total_pronouns in file_personal_pronoun_count.items():
    print(f"{filename}: Total Personal Pronouns = {total_pronouns}")

In [None]:
# Copy each article's pronoun count into its row, matching on "<URL_ID>.txt";
# rows without a processed file are left untouched.
for index, row in df.iterrows():
    filename = f"{row['URL_ID']}.txt" 
    if filename in file_personal_pronoun_count:
        df.loc[index, 'PERSONAL PRONOUNS'] = file_personal_pronoun_count[filename]

# Write the updated frame back over the CSV it was loaded from.
df.to_csv(csv_file_path, index=False)

In [None]:
# Display the frame with the metric columns written by the cells above.
df