In [None]:
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
import string
from syllables import estimate as count
import pandas as pd
import re
import os

# Download NLTK's 'punkt' package before any tokenizing happens; the
# word_tokenize / sent_tokenize helpers imported above are expected to need it
# (NLTK reports the package as up to date when it is already installed).
nltk.download('punkt')

def load_stopwords_from_file(file_path):
    """Return the distinct whitespace-separated tokens found in a text file.

    Args:
        file_path: Path of the word-list file to read.

    Returns:
        A set containing every token in the file; duplicates collapse and the
        original letter case is kept.
    """
    with open(file_path, 'r') as handle:
        raw_text = handle.read()
    tokens = raw_text.split()
    return set(tokens)

# Stop words are read from a file expected in the current working directory.
stopwords_file = 'StopWords.txt'
all_stopwords = load_stopwords_from_file(stopwords_file)

# Positive sentiment lexicon: every whitespace-separated token in the file.
with open('positive-words.txt') as pos_file:
    positive_text = pos_file.read()
pos_words = set(positive_text.split())

# Negative sentiment lexicon, read the same way.
with open('negative-words.txt') as neg_file:
    negative_text = neg_file.read()
neg_words = set(negative_text.split())

def count_vowels(word):
    """Count how many characters of *word* are ASCII vowels (either case).

    Args:
        word: The string to inspect.

    Returns:
        The number of characters that are one of a, e, i, o, u in upper or
        lower case; 0 for an empty string.
    """
    vowels = "AEIOUaeiou"
    total = 0
    for letter in word:
        if letter in vowels:
            total += 1
    return total

# Personal pronouns counted per article (case-insensitive). Compiled once
# instead of on every loop iteration.
# NOTE(review): because of IGNORECASE the token "US" (e.g. the country) also
# matches "us" -- confirm whether that should be excluded.
PERSONAL_PRONOUNS_PATTERN = re.compile(r'\b(I|me|my|you|your|he|him|his|she|her|it|its|we|us|our|they|them|their)\b', re.IGNORECASE)

# Seconds to wait on the server before giving up on a URL; without a timeout
# a single stalled connection would block the whole run.
REQUEST_TIMEOUT = 30


def _default_row():
    """Return an output row with empty title/content and all metrics at 0."""
    return {
        'Title': '',
        'Content': '',
        'Positive Score': 0,
        'Negative Score': 0,
        'Polarity Score': 0,
        'Subjectivity Score': 0,
        'Average Sentence Length': 0,
        'Percentage of Complex Words': 0,
        'Fog Index': 0,
        'Average Number of Words Per Sentence': 0,
        'Complex Word Count': 0,
        'Word Count': 0,
        'Syllable Count Per Word': 0,
        'Personal Pronouns': 0,
        'Average Word Length': 0,
    }


def _safe_divide(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is 0."""
    return numerator / denominator if denominator else 0


def _is_punctuation(token):
    """Return True when every character of *token* is ASCII punctuation."""
    return all(char in string.punctuation for char in token)


def _fetch_article(url):
    """Download *url* and return ``(title, paragraphs)`` of its <article>.

    Raises:
        requests.RequestException: on network errors or HTTP error statuses.
        ValueError: when the page contains no <article> element.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    article = soup.find('article')
    if article is None:
        raise ValueError("No <article> element found in the page")
    heading = article.find('h1')
    # A missing heading should not discard an otherwise usable article.
    title = heading.text.strip() if heading is not None else ''
    paras = [p.text.strip() for p in article.find_all('p')]
    return title, paras


def _analyse_article(title, paras):
    """Compute the sentiment and readability metrics for one article.

    Args:
        title: Article title, stored in the row unchanged.
        paras: List of paragraph strings making up the article body.

    Returns:
        A dict with the same keys (and key order) as ``_default_row()``.
        Ratios whose denominator would be zero are reported as 0.
    """
    content = ' '.join(paras)

    # Lower-cased tokens with stop words and pure-punctuation tokens removed.
    words = [
        token.lower()
        for para in paras
        for token in word_tokenize(para)
        if token.lower() not in all_stopwords and not _is_punctuation(token)
    ]
    word_count = len(words)

    positive_score = 0
    negative_score = 0
    for word in words:
        # A word present in both lexicons counts as positive only.
        if word in pos_words:
            positive_score += 1
        elif word in neg_words:
            negative_score += 1

    # The small constant keeps both ratios defined when the denominator is 0.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    sentences = sent_tokenize(content)
    sentence_count = len(sentences)
    # Sentence length is measured on the raw tokens (stop words included).
    total_words = sum(len(word_tokenize(sentence)) for sentence in sentences)
    avg_sentence_length = _safe_divide(total_words, sentence_count)

    # A word is treated as complex when it contains more than two vowels.
    complex_words = [word for word in words if count_vowels(word) > 2]
    complex_word_count = len(complex_words)
    percentage_complex_words = _safe_divide(complex_word_count, word_count)
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    avg_num_of_words_per_sentence = _safe_divide(word_count, sentence_count)
    syllable_count_per_word = _safe_divide(sum(count(word) for word in words), word_count)
    avg_word_length = _safe_divide(sum(len(word) for word in words), word_count)

    # Pronouns are counted on the raw text: the cleaned word list has had its
    # stop words removed, which would discard the pronouns being counted.
    personal_pronouns = len(PERSONAL_PRONOUNS_PATTERN.findall(content))

    return {
        'Title': title,
        'Content': content,
        'Positive Score': positive_score,
        'Negative Score': negative_score,
        'Polarity Score': polarity_score,
        'Subjectivity Score': subjectivity_score,
        'Average Sentence Length': avg_sentence_length,
        'Percentage of Complex Words': percentage_complex_words,
        'Fog Index': fog_index,
        'Average Number of Words Per Sentence': avg_num_of_words_per_sentence,
        'Complex Word Count': complex_word_count,
        'Word Count': word_count,
        'Syllable Count Per Word': syllable_count_per_word,
        'Personal Pronouns': personal_pronouns,
        'Average Word Length': avg_word_length,
    }


data = []

# One URL per line; blank lines are ignored so they do not become requests.
with open('url.txt', 'r') as url_file:
    url_list = [line.strip() for line in url_file if line.strip()]

for url in url_list:
    # Every URL yields exactly one output row; a failed URL keeps the defaults.
    row = _default_row()
    try:
        title, paras = _fetch_article(url)
        row = _analyse_article(title, paras)
    except Exception as e:
        # Best-effort batch run: report the failure and continue with the
        # next URL instead of aborting the whole job.
        print(f"An error occurred while processing URL: {url}")
        print(e)

    data.append(row)

df = pd.DataFrame(data)
df.to_excel('Output.xlsx', index=False)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ayan0\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


An error occurred while processing URL: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
404 Client Error: Not Found for url: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
An error occurred while processing URL: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/
404 Client Error: Not Found for url: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/
