In [92]:
## importing all dependincies
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os
import spacy
# make sure to have spacy english nlp model downloaded if not run "python -m spacy download en_core_web_sm" to download
import codecs
from textblob import TextBlob as tb

In [93]:
## setting input variable to the location of input File provided in EXCEL format
input_filepath = "./files/Input.xlsx"
## reading excel files using pandas in to a dataframe
df = pd.read_excel(input_filepath)

In [4]:
##  make a function that scrapes given url and saves that perticular url's content in a text file on the system.
def scrap_url(url):
    try:
        res = requests.get(url)
        res.raise_for_status()  # Raise an exception for non-200 status codes

        html = res.text
        soup = BeautifulSoup(html, "lxml")

        title = soup.find("h1", class_="entry-title")
        if title:
            title = title.text.strip()

        content_div = soup.find("div", class_="td-post-content") or soup.find("div", class_="td-main-content")
        if not content_div:
            page = soup.find("div", class_="wpb_wrapper")
            if page:
                title = page.find("h1", class_="tdb-title-text")
                if title:
                    title = title.text.strip()
                content_div = page.find("div", class_="tdb-block-inner")

        paragraphs = ""
        if content_div:
            for pre in content_div.find_all("pre"):  # Remove all <pre> tags
                pre.decompose()
            paragraphs = content_div.get_text(separator=" ", strip=True)

        if title and paragraphs:
            url_id = df["URL_ID"][row]
            print(url_id)
            with open(url_id, "w") as tosave:
                tosave.write(title + " " + paragraphs)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {url} ({e})")
    except Exception as e:
        print(f"Unexpected error: {e}")

In [None]:
for row in range(len(df)):
    url = df['URL'][row]
    scrap_url(row)

In [90]:
## a function to get all stop words provided along the assignment in python list
def get_stop_words():
    stop_words = []
    filenames = ["StopWords_Auditor.txt", "StopWords_Currencies.txt", "StopWords_DatesandNumbers.txt", "StopWords_GenericLong.txt", "StopWords_Generic.txt", "StopWords_Geographic.txt", "StopWords_Names.txt"]
    for filename in filenames:
        try:
            with open(f"./files/StopWords/{filename}", "r") as file:
                contents = file.read()
                if "|" in contents:
                    stop_words.append(contents.lower().replace("|","\n") .split(sep="\n"))
                else:
                    stop_words.append(contents.lower().split(sep="\n"))
        except UnicodeDecodeError:
            with open(f"./files/StopWords/{filename}", "r", encoding="iso-8859-1") as file:
                contents = file.read()
                if "|" in contents:
                    stop_words.append(contents.lower().replace("|","\n") .split(sep="\n"))
                else:
                    stop_words.append(contents.lower().split(sep="\n"))
    flat_list = sum(stop_words, [])
    flat_list = [item.strip() for item in flat_list]
    return flat_list


stop_words = get_stop_words()

In [7]:
## a function to get all negative words provided along the assignment in python list
def get_negative_words():
    negative_words = []
    with open("./files/MasterDictionary/negative-words.txt","r", encoding="cp1252") as temp:
        t = temp.read()
        negative_words = t.splitlines()
        negative_words = [item.strip() for item in negative_words]
    return negative_words

negative_words  = get_negative_words()


In [91]:
## a function to get all postive words provided along the assignment in python list

def get_positive_words():
    positive_words = []
    with open("./files/MasterDictionary/positive-words.txt","r") as temp:
        t = temp.read()
        positive_words = t.splitlines()
        positive_words = [item.strip() for item in positive_words]
    return positive_words

positive_words  = get_positive_words()

In [None]:
new_negatives = []
new_positives = []

for element in negative_words:
    if element not in stop_words:
        new_negatives.append(element)

for element in positive_words:
    if element not in stop_words:
        new_positives.append(element)

In [10]:
## get the paragraphs text from the sraped text file with "url_id" as name
def get_scrapped_paragraphs(row):
    paragraphs = ""
    url_id = df["URL_ID"][row]
    try:
        with open(f"./{url_id}","r") as file:
           paragraphs = file.read()
    except FileNotFoundError:
        pass
    return paragraphs

In [12]:
def func_positive_score(texts):
    return sum(word in texts.lower().split() for word in new_positives)


def func_negative_score(texts):
    return sum(word in texts.lower().split() for word in new_negatives)


def func_polarity_score(texts):
    texts = tb(texts)
    return texts.sentiment[0]


def func_subjectivity_score(texts):
    texts = tb(texts)
    return texts.sentiment[1]


def func_avg_sentence_length(texts):
  sentences = texts.sentences
  total_words = sum(len(sentence.words) for sentence in sentences)
  return total_words / len(sentences) if len(sentences) else 0


def func_count_syllables(word):
  vowels = "aeiouy"
  word = word.lower()
  syllable_count = 0
  for i in range(len(word)):
      if word[i] in vowels:
          syllable_count += 1
          if i != len(word) - 1 and word[i + 1] in vowels and word[i] != "e":
              syllable_count -= 1
  return syllable_count


def func_complex_word_percentage(texts):
  words = texts.words
  total_words = len(words)
  complex_words = sum(func_count_syllables(word) >= 3 for word in words)
  return (complex_words / (total_words)) * 100


def func_fog_index(texts):
  avg_sentence_len = func_avg_sentence_length(texts)
  percentage_complex_words = func_complex_word_percentage(texts)
  return 0.4 * (avg_sentence_len + percentage_complex_words)


def func_avg_words_per_sentence(text):
  sentences = texts.sentences
  return sum(len(sentence.words) for sentence in sentences) / len(sentences) if len(sentences) else 0


def func_complex_word_count(texts):
  words = texts.words
  return sum(func_count_syllables(word) >= 3 for word in words)


def func_word_count(texts):
  return len(texts.words)


def func_syllable_per_word(texts):
  words = texts.words
  total_syllables = sum(func_count_syllables(word) for word in words)
  return total_syllables / len(words) if len(words) else 0


def func_avg_word_length(texts):
  words = texts.words
  return (sum(len(word) for word in words)) / len(words) if len(words) else 0


def func_personal_pronouns(texts):
    nlp = spacy.load("en_core_web_sm")  # Load the English NLP model
    doc = nlp(texts)
    personal_pronouns = ["I", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them", "myself", "yourself", "himself", "herself", "ourselves", "yourselves", "themselves"]
    pronoun_count = 0
    for token in doc:
        if token.text.lower() in personal_pronouns and not token.ent_type_:  # Check for personal pronoun and no named entity
            pronoun_count += 1
    return pronoun_count

In [41]:

# TODO: remove stopword form positive and negative word list




In [81]:
output_filepath = "./output.xlsx"
ouput_df = pd.read_excel(output_filepath)
columns = list(ouput_df.columns)

blackassign0001
https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
blackassign0002
https://insights.blackcoffer.com/rising-it-cities-and-their-impact-on-the-economy-environment-infrastructure-and-city-life-in-future/
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
blackassign0003
https://insights.blackcoffer.com/internet-demands-evolution-communication-impact-and-2035s-alternative-pathways/
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
blackassign0004
https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-in-upcoming-future/
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
blackassign0005
https://insights.blackcoffer.com/ott-platform-and-its-impact-on-the-entertainment-industry-in-future/
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
blackassign0006
https://insights.blackcoffer.com/the-rise-of-the-ott-platform-and-i

['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']


In [83]:
for row in range(len(df)):
    paragraphs = get_scrapped_paragraphs(row)

    if paragraphs:
        print(f"calculating scores for {df['URL_ID'][row]} :")
        paras = " ".join(word for word in paragraphs.split() if word not in stop_words)
        texts = tb(paragraphs)
        positive_score = func_positive_score(paras)
        negative_score = func_negative_score(paras)
        polarity = func_polarity_score(paras)
        subjectivity = func_subjectivity_score(paras)
        avg_sentence_length = func_avg_sentence_length(texts)
        syllable_count = func_count_syllables(texts)
        complex_word_percentage = func_complex_word_percentage(texts)
        avg_word_length = func_avg_word_length(texts)
        personal_pronouns = func_personal_pronouns(paragraphs)
        syllable_per_word = func_syllable_per_word(texts)
        word_count = func_word_count(texts)
        complex_word_count = func_complex_word_count(texts)
        avg_words_per_sentence = func_avg_words_per_sentence(texts)
        fog_index = func_fog_index(texts)
        scores_dict = {'URL_ID': ouput_df['URL_ID'][row] , 'URL': ouput_df['URL'][row], 'POSITIVE SCORE': positive_score, 'NEGATIVE SCORE': negative_score, 'POLARITY SCORE': polarity, 'SUBJECTIVITY SCORE': subjectivity, 'AVG SENTENCE LENGTH': avg_sentence_length, 'PERCENTAGE OF COMPLEX WORDS': complex_word_percentage , 'FOG INDEX':fog_index , 'AVG NUMBER OF WORDS PER SENTENCE':avg_words_per_sentence , 'COMPLEX WORD COUNT':complex_word_count , 'WORD COUNT':word_count , 'SYLLABLE PER WORD':syllable_per_word , 'PERSONAL PRONOUNS':personal_pronouns , 'AVG WORD LENGTH': avg_word_length}
        for key,value in scores_dict.items():
            ouput_df.at[row, key] = value


for blackassign0001 values are:
for blackassign0002 values are:
for blackassign0003 values are:
for blackassign0004 values are:
for blackassign0005 values are:
for blackassign0006 values are:
for blackassign0007 values are:
for blackassign0008 values are:
for blackassign0009 values are:
for blackassign0010 values are:
for blackassign0011 values are:
for blackassign0012 values are:
for blackassign0013 values are:
for blackassign0015 values are:
for blackassign0016 values are:
for blackassign0017 values are:
for blackassign0018 values are:
for blackassign0019 values are:
for blackassign0021 values are:
for blackassign0022 values are:
for blackassign0023 values are:
for blackassign0024 values are:
for blackassign0025 values are:
for blackassign0026 values are:
for blackassign0027 values are:
for blackassign0028 values are:
for blackassign0030 values are:
for blackassign0031 values are:
for blackassign0032 values are:
for blackassign0033 values are:
for blackassign0034 values are:
for blac

In [89]:
try:
    with pd.ExcelWriter('my_outpul.xlsx') as writer:
        ouput_df.to_excel(writer,sheet_name='Sheet1')
except Exception as e:
  
    raise e

In [88]:
ouput_df
# for row in range(98,100):
#     url = df['URL'][row]
#     print(url)
#     scrap_url(url)

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,13.0,5.0,0.242714,0.603379,15.769231,16.504065,12.909318,15.769231,203.0,1230.0,1.710569,56.0,4.555285
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,34.0,19.0,0.077790,0.457205,18.962025,28.571429,19.013382,18.962025,428.0,1498.0,2.000000,22.0,5.389853
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,23.0,13.0,0.066737,0.396218,19.196429,38.139535,22.934385,19.196429,410.0,1075.0,2.255814,17.0,6.086512
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,23.0,52.0,0.004925,0.380975,20.803922,35.815269,22.647676,20.803922,380.0,1061.0,2.177191,13.0,5.918944
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,15.0,7.0,0.059091,0.493485,17.666667,28.011611,18.271311,17.666667,193.0,689.0,1.956459,12.0,5.509434
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...,17.0,33.0,0.034505,0.345489,22.600000,26.283186,19.553274,22.600000,297.0,1130.0,1.894690,5.0,5.230973
96,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...,14.0,26.0,0.038921,0.429680,28.921053,21.201092,20.048858,28.921053,233.0,1099.0,1.696087,27.0,4.623294
97,blackassign0098,https://insights.blackcoffer.com/contribution-...,3.0,2.0,0.107535,0.406875,18.500000,28.255528,18.702211,18.500000,115.0,407.0,1.889435,4.0,5.353808
98,blackassign0099,https://insights.blackcoffer.com/how-covid-19-...,,,,,,,,,,,,,
