In [8]:
import openpyxl
import requests
import os
import re
import pandas as pd
from bs4 import BeautifulSoup
import nltk
nltk.download('punkt')
nltk.download('cmudict')
from nltk.corpus import cmudict
from nltk.tokenize import sent_tokenize
import string

d = cmudict.dict()

def load_words_from_file(file_path):
    with open(file_path, 'r') as file:
        words = re.split(r'\s|\|', file.read())
    return words

def count_positive_words(content, positive_words):
    words = content.split()
    pc = sum(1 for word in words if word.lower() in positive_words)
    return pc


def count_negative_words(content, negative_words):
    words = content.split()
    nc = sum(1 for word in words if word.lower() in negative_words)
    return nc
def count_syllables(word):
    word = word.lower()
    
    if word in d:
      
        phonemes = d[word][0]
       
        syllable_count = sum(1 for phoneme in phonemes if phoneme[-1].isdigit())
        return syllable_count
    return 0


def gunning_fox_index(content):
    words = nltk.word_tokenize(content)
   
    total_syllables = sum(count_syllables(word) for word in words)
    sentences = sent_tokenize(content)
    total_words = len(words)
    total_sentences = len(sentences)
    avgsl = total_words / total_sentences
    sypw=total_syllables/total_words

    cmpxwc = sum(1 for word in words if count_syllables(word) > 2)
    prcntcw= cmpxwc/total_words * 100

    avgnwps = total_words / total_sentences
    percentage_complex_words = (cmpxwc / total_words) * 100
    fogi = 0.4 * (avgnwps + percentage_complex_words)

    return fogi, total_words, cmpxwc, avgnwps, sypw, avgsl,prcntcw


def count_personal_pronouns(content):
    personal_pronouns = ['I', 'we','We' 'my','My', 'ours','Ours', 'us']
    words = nltk.word_tokenize(content)
    prsnlpc = sum(1 for word in words if word.lower() in personal_pronouns)
    return prsnlpc


def loadsw(folder_path):
    stop_words = set()
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r') as file:
            words = file.read().splitlines()
            stop_words.update(words)
    return stop_words

def extract_c(url):
    response = requests.get(url)
    f = BeautifulSoup(response.content, 'html.parser')
    te = f.find('h1', {'class': 'entry-title'})
    title = te.text.strip() if te else 'N/A'
    ce = f.find('div', {'class': 'td-post-content tagdiv-type'})
    cp = ce.find_all('p')if ce else []
    content=""
    for p in cp:
        para = str(p.text.strip())
        content+= para + "\n"
    final = title + "\n"+ content
    return final
data_array = []
wb = openpyxl.load_workbook("Input.xlsx")
sheet = wb["Sheet1"]
url=[]
for r in sheet.iter_rows(values_only=True):
    data_array.append(list(r))
for i in data_array:
    if i[1]=="URL":
        continue
    url.append(i[1])

stop_words_folder = "StopWords"  
stop_words = loadsw(stop_words_folder)

master_dict_folder = "MasterDictionary"
positive_words = load_words_from_file(os.path.join(master_dict_folder, "positive-words.txt"))
negative_words = load_words_from_file(os.path.join(master_dict_folder, "negative-words.txt"))

data=[]
for u in url:
    content = extract_c(u)

    
    words = content.split()

    
    cleaned_words = [word for word in words if word.lower() not in stop_words]

    
    cleaned_content = " ".join(cleaned_words)

    
    pc = count_positive_words(cleaned_content, positive_words)
    nc = count_negative_words(cleaned_content, negative_words)

    
    total_words_after_cleaning = len(cleaned_words) + 0.000001
    pls = (pc - nc) / (pc + nc + 0.000001)
    subs = (pc + nc) / total_words_after_cleaning

    
    fogi, wc, cmpxwc, avgnwps, sypw, avgsl,prcntcw = gunning_fox_index(cleaned_content)
    prsnlpc = count_personal_pronouns(cleaned_content)
    avgwl = sum(len(word) for word in cleaned_words) / len(cleaned_words)

    
    data.append((u, pc, nc, pls, subs,avgsl,prcntcw ,fogi,avgnwps,
                 cmpxwc, wc, sypw, prsnlpc, avgwl))

output_data_structure_file = "Output Data Structure.xlsx"  
output_data_structure_df = pd.read_excel(output_data_structure_file)


column_names = output_data_structure_df.columns[1:].tolist()


data_with_url_id = []


for url_id, row in zip(output_data_structure_df["URL_ID"], data):
    data_with_url_id.append([url_id] + list(row))


result_df = pd.DataFrame(data_with_url_id, columns=["URL_ID"] + column_names)


result_file = "Output.xlsx"  
result_df.to_excel(result_file, index=False)

[nltk_data] Downloading package punkt to C:\Users\AMAN
[nltk_data]     DIXIT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package cmudict to C:\Users\AMAN
[nltk_data]     DIXIT\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [3]:
print(data[1])

('https://insights.blackcoffer.com/what-if-the-creation-is-taking-over-the-creator/', 47, 28, 0.25333332995555563, 0.11363636346418733, 13.189896225339263, 11.240506329113924, 193, 888, 0, 6.984848484848484)
