In [11]:
import os
import random
import re
import warnings

import pandas as pd
import requests
import syllables
from bs4 import BeautifulSoup
from nltk.corpus import stopwords as nltk_sw

warnings.filterwarnings("ignore")

In [12]:
df = pd.read_excel("Input.xlsx")
urls = list(df["URL"])
urls[:10], len(urls)

(['https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/',
  'https://insights.blackcoffer.com/what-if-the-creation-is-taking-over-the-creator/',
  'https://insights.blackcoffer.com/what-jobs-will-robots-take-from-humans-in-the-future/',
  'https://insights.blackcoffer.com/will-machine-replace-the-human-in-the-future-of-work/',
  'https://insights.blackcoffer.com/will-ai-replace-us-or-work-with-us/',
  'https://insights.blackcoffer.com/man-and-machines-together-machines-are-more-diligent-than-humans-blackcoffe/',
  'https://insights.blackcoffer.com/in-future-or-in-upcoming-years-humans-and-machines-are-going-to-work-together-in-every-field-of-work/',
  'https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/',
  'https://insights.blackcoffer.com/how-machine-learning-will-affect-your-business/',
  'https://insights.blackcoffer.com/deep-learning-impact-on-areas-of-e-learning/'],
 114)

In [13]:
def fetch_web_data(url):
    class_ = ["td-post-content tagdiv-type", "tdb-block-inner td-fix-index"]
    doc = requests.get(url)
    soup = BeautifulSoup(doc.content, "html.parser")
    title = soup.find("h1")
    article = soup.find_all("div", {"class": class_[0]})
    if article:
        res = " "
        for tag in article:
            res += tag.text.strip()
    else:
        article = soup.find_all("div", {"class": class_[1]})
        res = " "
        for tag in article:
            res += tag.text.strip()
    try:
        start = res.index("Introduction")
        stop = res.index("Blackcoffer Insights")
    except:
        start = 0
        stop = -1
    return title.text + "\n" + res[start:stop]


fetch_web_data(random.choice(urls))[:500]

'Lessons from the past: Some key learnings relevant to the coronavirus crisis\n “The more you know about the past, the better prepared you are for the future.”Theodore RooseveltAs we speak, the world finds itself engulfed in one of its worst crises in recent times. The global COVID-19 pandemic has caused never-seen-before disruption in both public and economic life. Not only have factories shut down or supply chains abruptly stopped or millions of workers stranded, but festivals suspended, familie'

In [14]:
def get_stop_words():
    StopWords_notNames = []
    for file in os.listdir("StopWords/"):
        if file != "StopWords_Names.txt":
            corpus = open(f"StopWords/{file}", "r").read().strip(" ").split("\n")
            res = []
            for txt in corpus:
                if "|" in txt:
                    res.extend(txt.replace(" | ", ",").replace(" ", "").split(","))
            if res != []:
                StopWords_notNames.extend(res)

    StopWords_Names = []
    for file in os.listdir("StopWords/"):
        if file == "StopWords_Names.txt":
            corpus = open(f"StopWords/{file}", "r").read().strip(" ").split("\n")
            for txt in corpus:
                if "|" in txt:
                    res = txt.replace(" | ", ",").replace(" ", "").split(",")
                    if res != None:
                        StopWords_Names.append(res[0])

    stop_words = []
    for file in os.listdir("StopWords/"):
        corpus = open(f"StopWords/{file}", "r").read().strip().split("\n")
        res = []
        for txt in corpus:
            if "|" in txt:
                txt = txt.replace(txt, txt.split("|")[0])
                res.append(txt.strip())
        if res != []:
            stop_words.extend(res)
        stop_words.extend([txt for txt in corpus if "|" not in txt])

    stop_words.extend(StopWords_notNames)
    stop_words.extend(StopWords_Names)
    return stop_words


get_stop_words()[:10]

['ERNST',
 'YOUNG',
 'DELOITTE',
 'TOUCHE',
 'KPMG',
 'PRICEWATERHOUSECOOPERS',
 'PRICEWATERHOUSE',
 'COOPERS',
 'AFGHANI',
 'ARIARY']

In [15]:
def clean_stop_words(text, personalwords=True):
    stop_words=get_stop_words()
    if personalwords == True:
        stop_words.extend(nltk_sw.words("english"))
    words = text.split()
    cleaned_words = [word for word in words if word.lower() not in stop_words]
    cleaned_text = " ".join(cleaned_words)
    cleaned_text = " ".join(re.findall("[a-zA-Z.]+", cleaned_text))
    return cleaned_text


clean_stop_words(fetch_web_data(random.choice(urls)))[:500]

'Estimating impact COVID world work COVID unprecedented pandemic Can be possibility great leaders bill gates action rising year . corollary pandemic prodigious analysis Department Economic Social Affairs DESA stated COVID pandemic disrupting global supply chains international trade turn shrink global economy percent reversal previous forecast . percent growth. . million Americans filed unemployment claims economic downturn expected worst recession Great Depression stated IMF. India facing biggest'

In [16]:
def get_scores(text):
    def get_subjectivity_score(text):
        num_words = len(text.split())
        unique_words = len(set(text.split()))
        subjectivity_score = unique_words / num_words
        return subjectivity_score

    def get_polarity_score(text):
        positive_words = (
            open("MasterDictionary/positive-words.txt", "r").read().split("\n")
        )
        negative_words = (
            open("MasterDictionary/negative-words.txt", "r").read().split("\n")
        )

        positive_count = 0
        negative_count = 0

        for word in text.split():
            if word.lower() in positive_words:
                positive_count += 1
            elif word.lower() in negative_words:
                negative_count += 1

        polarity_score = (positive_count - negative_count) / (
            positive_count + negative_count + 1
        )
        return polarity_score, positive_count, negative_count

    subjectivity_score = get_subjectivity_score(text)
    polarity_score, positive_count, negative_count = get_polarity_score(text)

    return positive_count, negative_count, polarity_score, subjectivity_score


get_scores(clean_stop_words(fetch_web_data(random.choice(urls))))

(36, 54, -0.1978021978021978, 0.754653130287648)

In [27]:
def Analysis_of_readability(fetched_article):
    sentences = fetched_article.replace(" ", "").split(".")
    tokens = fetched_article.split(" ")
    total_num_of_sentences = len(sentences)
    total_num_of_words = len(tokens)

    num_complex_words = 0
    for token in sentences:
        if syllables.estimate(token) > 2:
            num_complex_words += 1

    Average_Sentence_Length = total_num_of_words / total_num_of_sentences
    Percentage_of_Complex_words = num_complex_words / total_num_of_words
    Fog_Index = 0.4 * (Average_Sentence_Length + Percentage_of_Complex_words)

    Average_Number_of_Words_Per_Sentence = total_num_of_words / total_num_of_sentences

    total_syllables = sum(syllables.estimate(word) for word in sentences)
    SYLLABLE_PER_WORD = total_syllables / total_num_of_words
    SYLLABLE_PER_WORD

    return (
        num_complex_words,
        Average_Sentence_Length,
        Percentage_of_Complex_words,
        Fog_Index,
        Average_Number_of_Words_Per_Sentence,
        SYLLABLE_PER_WORD,
    )


Analysis_of_readability(clean_stop_words(fetch_web_data(random.choice(urls))))

(51,
 10.076923076923077,
 0.09732824427480916,
 4.069700528479155,
 10.076923076923077,
 2.2118320610687023)

In [18]:
def get_personal_pronouns(tokens):
    personal_pronouns = [
        "I",
        "me",
        "my",
        "mine",
        "you",
        "your",
        "yours",
        "he",
        "him",
        "his",
        "she",
        "her",
        "hers",
        "it",
        "its",
        "we",
        "us",
        "our",
        "ours",
        "they",
        "them",
        "their",
        "theirs",
    ]
    num_personal_pronouns = sum(
        [1 for word in tokens if word.lower() in personal_pronouns]
    )

    total_chars = sum(len(word) for word in tokens)
    avg_word_length = total_chars / len(tokens)

    return num_personal_pronouns, avg_word_length


corpus = clean_stop_words(fetch_web_data(random.choice(urls)),personalwords=False)

res = re.findall("[A-Za-z]+", corpus)
get_personal_pronouns(res)

(4, 6.531578947368421)

In [19]:
id_r = []
url_r = []
pos_score_r = []
neg_score_r = []
Polarity_Score_r = []
Polarity_Score_r = []
Subjectivity_Score_r = []
Average_Sentence_Length_r = []
Percentage_of_Complex_words_r = []
Fog_Index_r = []
Average_Number_of_Words_Per_Sentence_r = []
num_complex_words_r = []
total_num_of_words_r = []
SYLLABLE_PER_WORD_r = []
num_personal_pronouns_r = []
avg_word_length_r = []


# Iterating through URLS
for n in range(len(urls)):
    try:
        # fetch_web_data
        fetched_article = fetch_web_data(urls[n])
    except:
        print(f"Page {urls[n]} Not Found....!")
        continue
    index = df.iloc[n]
    id_ = index[0]
    url_ = index[1]

    # clean_stop_words
    tokens = clean_stop_words(fetched_article)
    total_num_of_words = len(tokens)

    pos_score, neg_score, Polarity_Score, Subjectivity_Score = get_scores(tokens)

    (
        num_complex_words,
        Average_Sentence_Length,
        Percentage_of_Complex_words,
        Fog_Index,
        Average_Number_of_Words_Per_Sentence,
        SYLLABLE_PER_WORD,
    ) = Analysis_of_readability(fetched_article)

    tmp=clean_stop_words(fetched_article,personalwords=False)
    res = re.findall("[A-Za-z]+", tmp)
    num_personal_pronouns, avg_word_length = get_personal_pronouns(res)

    # Appending obtained variables into respective lists
    id_r.append(id_)
    url_r.append(url_)
    pos_score_r.append(pos_score)
    neg_score_r.append(neg_score)
    Polarity_Score_r.append(Polarity_Score)
    Subjectivity_Score_r.append(Subjectivity_Score)
    Average_Sentence_Length_r.append(Average_Sentence_Length)
    Percentage_of_Complex_words_r.append(Percentage_of_Complex_words)
    Fog_Index_r.append(Fog_Index)
    Average_Number_of_Words_Per_Sentence_r.append(Average_Number_of_Words_Per_Sentence)
    num_complex_words_r.append(num_complex_words)
    total_num_of_words_r.append(total_num_of_words)
    SYLLABLE_PER_WORD_r.append(SYLLABLE_PER_WORD)
    num_personal_pronouns_r.append(num_personal_pronouns)
    avg_word_length_r.append(avg_word_length)


output = {
    "URL_ID": id_r,
    "POSITIVE SCORE": pos_score_r,
    "NEGATIVE SCORE": neg_score_r,
    "POLARITY SCORE": Polarity_Score_r,
    "SUBJECTIVITY SCORE": Subjectivity_Score_r,
    "AVG SENTENCE LENGTH": Average_Sentence_Length_r,
    "PERCENTAGE OF COMPLEX WORDS": Percentage_of_Complex_words_r,
    "FOG INDEX": Fog_Index_r,
    "AVG NUMBER OF WORDS PER SENTENCE": Average_Number_of_Words_Per_Sentence_r,
    " COMPLEX WORD COUNT": num_complex_words_r,
    "WORD COUNT ": total_num_of_words_r,
    "SYLLABLE PER WORD ": SYLLABLE_PER_WORD_r,
    "PERSONAL PRONOUNS": num_personal_pronouns_r,
    "AVG WORD LENGTH": avg_word_length_r,
}

output_df = pd.DataFrame(output).set_index("URL_ID")
output_df

Page https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/ Not Found....!
Page https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/ Not Found....!
Page https://insights.blackcoffer.com/ensuring-growth-through-insurance-technology/ Not Found....!


Unnamed: 0_level_0,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
URL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
37,66,29,0.385417,0.676136,22.474359,0.043925,9.007313,22.474359,77,8877,1.954934,3,7.204651
38,55,32,0.261364,0.682997,20.085714,0.047653,8.053347,20.085714,67,5208,1.645092,17,6.315341
39,62,33,0.302083,0.694725,19.658824,0.050269,7.883637,19.658824,84,7651,1.912029,5,7.353524
40,61,19,0.518519,0.680108,19.626506,0.050952,7.870983,19.626506,83,5859,1.686924,2,6.626316
41,58,21,0.462500,0.723042,20.890244,0.047869,8.375245,20.890244,82,7035,1.751313,9,6.855530
...,...,...,...,...,...,...,...,...,...,...,...,...,...
146,22,28,-0.117647,0.805501,16.370370,0.058824,6.571678,16.370370,52,4136,1.843891,3,6.848659
147,38,14,0.452830,0.635120,18.554217,0.051948,7.442466,18.554217,80,6706,1.706494,20,6.614499
148,27,43,-0.225352,0.610248,15.671233,0.057692,6.291570,15.671233,66,5100,1.799825,9,6.746154
149,35,7,0.651163,0.770026,22.064516,0.045322,8.843935,22.064516,31,3509,2.067251,5,7.810606


In [28]:
output_df.to_excel("Output Data Structure.xlsx")