In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from pyvirtualdisplay import Display
from bs4 import BeautifulSoup

import time
import json

# Data Extraction

In [2]:
inputFile = pd.read_excel('Input.xlsx')

inputFile

Unnamed: 0,URL_ID,URL
0,37,https://insights.blackcoffer.com/ai-in-healthc...
1,38,https://insights.blackcoffer.com/what-if-the-c...
2,39,https://insights.blackcoffer.com/what-jobs-wil...
3,40,https://insights.blackcoffer.com/will-machine-...
4,41,https://insights.blackcoffer.com/will-ai-repla...
...,...,...
109,146,https://insights.blackcoffer.com/blockchain-fo...
110,147,https://insights.blackcoffer.com/the-future-of...
111,148,https://insights.blackcoffer.com/big-data-anal...
112,149,https://insights.blackcoffer.com/business-anal...


In [3]:
inputFile.iloc[1]

URL_ID                                                   38
URL       https://insights.blackcoffer.com/what-if-the-c...
Name: 1, dtype: object

In [4]:
inputFile.iloc[0]

URL_ID                                                   37
URL       https://insights.blackcoffer.com/ai-in-healthc...
Name: 0, dtype: object

In [5]:
url = inputFile.iloc[1]['URL']

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.implicitly_wait(10)

driver.get(url)
time.sleep(5)

page = driver.page_source
driver.quit()

print('Done')

[WDM] - Downloading: 100%|█████████████████| 6.83M/6.83M [00:00<00:00, 13.6MB/s]
  driver = webdriver.Chrome(ChromeDriverManager().install())


Done


In [6]:
soup = BeautifulSoup(page, 'html.parser')

In [7]:
article_title = soup.find_all('h1', class_='entry-title')[0].text

In [8]:
article_title

'What if the Creation is Taking Over the Creator?'

In [9]:
article_div = soup.find_all('div', class_='td-ss-main-content')

In [10]:
article_paras = []

for d in article_div:
    article_paras.extend(d.find_all('p'))
    
article_paras = [x.text for x in article_paras]

In [11]:
article_paras

['Human minds, a fascination in itself carrying the potential of tinkering nature with the pixie dust intelligence, creating and solving the mysteries and wonders with anything but admiration. However, no matter how captivating a human mind can be, it could sometimes be appalled. It could be the hunger or maybe the desire to want more, to go beyond and unravel the limitations, or maybe something like pure greed. Humans have never stopped and always keep evolving when it comes to intelligence and this is what makes them the supreme.',
 'Intelligence calls out for supremacy and so, what if there was to evolve something that opposed a challenge to the very human minds, to their capabilities while making them question their own importance among themselves? Artificial Intelligence came as a revolution, havoc when it first came to the light. The concept of making machines does work on their own, like granting machines –The Intelligence.',
 'The idea of making machines work like humans came b

In [12]:
article_text = ' '.join(article_paras)

article_text

'Human minds, a fascination in itself carrying the potential of tinkering nature with the pixie dust intelligence, creating and solving the mysteries and wonders with anything but admiration. However, no matter how captivating a human mind can be, it could sometimes be appalled. It could be the hunger or maybe the desire to want more, to go beyond and unravel the limitations, or maybe something like pure greed. Humans have never stopped and always keep evolving when it comes to intelligence and this is what makes them the supreme. Intelligence calls out for supremacy and so, what if there was to evolve something that opposed a challenge to the very human minds, to their capabilities while making them question their own importance among themselves? Artificial Intelligence came as a revolution, havoc when it first came to the light. The concept of making machines does work on their own, like granting machines –The Intelligence. The idea of making machines work like humans came back in th

In [13]:
with open("tempFile.txt","w") as f:
    f.writelines(article_title + '\n\n')
    f.writelines('\n'.join(article_paras))

# Data Analysis

### Clean using Stop Words List

In [25]:
with open("StopWords/StopWords_Auditor.txt", "r", encoding="utf-8-sig") as file:
    words = [line.split()[0].lower() for line in file]

In [26]:
words

['ernst',
 'young',
 'deloitte',
 'touche',
 'kpmg',
 'pricewaterhousecoopers',
 'pricewaterhouse',
 'coopers']

In [29]:
import os

directory = "./StopWords" 

stop_words = []

for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        filepath = os.path.join(directory, filename)
        with open(filepath, "r", encoding="utf-8-sig") as file:
            words = [line.split()[0].lower() for line in file]
            stop_words.extend(words)

len(stop_words)

14107

In [53]:
# article_text

In [54]:
text = article_text.lower()

In [55]:
# text

In [56]:
# the string '\xa0' is not a part of the original text... so removing it
# also removing the punctuations

import spacy
import string

def preprocess_text(text):
    nlp = spacy.load("en_core_web_sm")

    # Remove '\xa0' from the text
    text = text.replace('\xa0', '')

    doc = nlp(text)
    punctuations = string.punctuation
    tokens = [token for token in doc if not token.is_punct]
    clean_text = " ".join([token.text for token in tokens])

    return clean_text

text = preprocess_text(text)

In [57]:
text

'human minds a fascination in itself carrying the potential of tinkering nature with the pixie dust intelligence creating and solving the mysteries and wonders with anything but admiration however no matter how captivating a human mind can be it could sometimes be appalled it could be the hunger or maybe the desire to want more to go beyond and unravel the limitations or maybe something like pure greed humans have never stopped and always keep evolving when it comes to intelligence and this is what makes them the supreme intelligence calls out for supremacy and so what if there was to evolve something that opposed a challenge to the very human minds to their capabilities while making them question their own importance among themselves artificial intelligence came as a revolution havoc when it first came to the light the concept of making machines does work on their own like granting machines the intelligence the idea of making machines work like humans came back in the 19s back then pe

In [58]:
def remove_stop_words(text, stop_words):
    words = text.split()
    filtered_words = []

    for word in words:
        if word not in stop_words:
            filtered_words.append(word)
    filtered_text = " ".join(filtered_words)
    
    return filtered_text

text = remove_stop_words(text, stop_words)

In [59]:
text

'human minds fascination carrying potential tinkering nature pixie dust intelligence creating solving mysteries wonders admiration matter captivating human mind appalled hunger unravel limitations pure greed humans stopped evolving intelligence makes supreme intelligence calls supremacy evolve opposed challenge human minds capabilities making question importance artificial intelligence revolution havoc concept making machines work granting machines intelligence idea making machines work humans 19s people n’t thing making living thing work tasks mention surpass humans skills facts 1997 greatest chess kasparov defeated chess game machine top skilled human lost mere machine created ’ve defeated betterment skills granted supremacy machines tools equipment helped unskilled mind intelligence creates skilled work perfection precision initially time passed humans drawn puzzle lot changed human research deeper deeper result machines evolved present machines growing develops improves part indust

In [87]:
# Tokenize the text

import nltk
nltk.download('punkt')

def tokenize_text(text):
    tokens = nltk.word_tokenize(text)
    return tokens

tokens = tokenize_text(text)

[nltk_data] Downloading package punkt to /home/ashis-
[nltk_data]     solomon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [88]:
tokens

['human',
 'minds',
 'fascination',
 'carrying',
 'potential',
 'tinkering',
 'nature',
 'pixie',
 'dust',
 'intelligence',
 'creating',
 'solving',
 'mysteries',
 'wonders',
 'admiration',
 'matter',
 'captivating',
 'human',
 'mind',
 'appalled',
 'hunger',
 'unravel',
 'limitations',
 'pure',
 'greed',
 'humans',
 'stopped',
 'evolving',
 'intelligence',
 'makes',
 'supreme',
 'intelligence',
 'calls',
 'supremacy',
 'evolve',
 'opposed',
 'challenge',
 'human',
 'minds',
 'capabilities',
 'making',
 'question',
 'importance',
 'artificial',
 'intelligence',
 'revolution',
 'havoc',
 'concept',
 'making',
 'machines',
 'work',
 'granting',
 'machines',
 'intelligence',
 'idea',
 'making',
 'machines',
 'work',
 'humans',
 '19s',
 'people',
 'n',
 '’',
 't',
 'thing',
 'making',
 'living',
 'thing',
 'work',
 'tasks',
 'mention',
 'surpass',
 'humans',
 'skills',
 'facts',
 '1997',
 'greatest',
 'chess',
 'kasparov',
 'defeated',
 'chess',
 'game',
 'machine',
 'top',
 'skilled',
 'h

### Creating a dictionary of Positive and Negative words

In [67]:
with open('MasterDictionary/positive-words.txt') as f:
    positive_list = f.read().splitlines()
    
with open('MasterDictionary/negative-words.txt') as f:
    negative_list = f.read().splitlines()

In [68]:
def remove_stop_words_fromList(temp_list, stop_words):
    filtered_list = []
    for element in temp_list:
        if element not in stop_words:
            filtered_list.append(element)
    return filtered_list

positive_list = remove_stop_words_fromList(positive_list, stop_words)
negative_list = remove_stop_words_fromList(negative_list, stop_words)

In [69]:
positive_list

['a+',
 'abound',
 'abounds',
 'abundance',
 'abundant',
 'accessable',
 'accessible',
 'acclaim',
 'acclaimed',
 'acclamation',
 'accolade',
 'accolades',
 'accommodative',
 'accomodative',
 'accomplish',
 'accomplished',
 'accomplishment',
 'accomplishments',
 'accurate',
 'accurately',
 'achievable',
 'achievement',
 'achievements',
 'achievible',
 'acumen',
 'adaptable',
 'adaptive',
 'adequate',
 'adjustable',
 'admirable',
 'admirably',
 'admiration',
 'admire',
 'admirer',
 'admiring',
 'admiringly',
 'adorable',
 'adore',
 'adored',
 'adorer',
 'adoring',
 'adoringly',
 'adroit',
 'adroitly',
 'adulate',
 'adulation',
 'adulatory',
 'advanced',
 'advantage',
 'advantageous',
 'advantageously',
 'advantages',
 'adventuresome',
 'adventurous',
 'advocate',
 'advocated',
 'advocates',
 'affability',
 'affable',
 'affably',
 'affectation',
 'affection',
 'affectionate',
 'affinity',
 'affirm',
 'affirmation',
 'affirmative',
 'affluence',
 'affluent',
 'afford',
 'affordable',
 'af

In [70]:
negative_list

['2-faced',
 '2-faces',
 'abnormal',
 'abolish',
 'abominable',
 'abominably',
 'abominate',
 'abomination',
 'abort',
 'aborted',
 'aborts',
 'abrade',
 'abrasive',
 'abrupt',
 'abruptly',
 'abscond',
 'absence',
 'absent-minded',
 'absentee',
 'absurd',
 'absurdity',
 'absurdly',
 'absurdness',
 'abuse',
 'abused',
 'abuses',
 'abusive',
 'abysmal',
 'abysmally',
 'abyss',
 'accidental',
 'accost',
 'accursed',
 'accusation',
 'accusations',
 'accuse',
 'accuses',
 'accusing',
 'accusingly',
 'acerbate',
 'acerbic',
 'acerbically',
 'ache',
 'ached',
 'aches',
 'achey',
 'aching',
 'acrid',
 'acridly',
 'acridness',
 'acrimonious',
 'acrimoniously',
 'acrimony',
 'adamant',
 'adamantly',
 'addict',
 'addicted',
 'addicting',
 'addicts',
 'admonish',
 'admonisher',
 'admonishingly',
 'admonishment',
 'admonition',
 'adulterate',
 'adulterated',
 'adulteration',
 'adulterier',
 'adversarial',
 'adversary',
 'adverse',
 'adversity',
 'afflict',
 'affliction',
 'afflictive',
 'affront',


### Extracting Derived variables

In [72]:
# The text is already tokenized

In [89]:
tokens

['human',
 'minds',
 'fascination',
 'carrying',
 'potential',
 'tinkering',
 'nature',
 'pixie',
 'dust',
 'intelligence',
 'creating',
 'solving',
 'mysteries',
 'wonders',
 'admiration',
 'matter',
 'captivating',
 'human',
 'mind',
 'appalled',
 'hunger',
 'unravel',
 'limitations',
 'pure',
 'greed',
 'humans',
 'stopped',
 'evolving',
 'intelligence',
 'makes',
 'supreme',
 'intelligence',
 'calls',
 'supremacy',
 'evolve',
 'opposed',
 'challenge',
 'human',
 'minds',
 'capabilities',
 'making',
 'question',
 'importance',
 'artificial',
 'intelligence',
 'revolution',
 'havoc',
 'concept',
 'making',
 'machines',
 'work',
 'granting',
 'machines',
 'intelligence',
 'idea',
 'making',
 'machines',
 'work',
 'humans',
 '19s',
 'people',
 'n',
 '’',
 't',
 'thing',
 'making',
 'living',
 'thing',
 'work',
 'tasks',
 'mention',
 'surpass',
 'humans',
 'skills',
 'facts',
 '1997',
 'greatest',
 'chess',
 'kasparov',
 'defeated',
 'chess',
 'game',
 'machine',
 'top',
 'skilled',
 'h

In [92]:
def get_sentiment_scores(tokens):

    # Calculate positive and negative scores
    pos_score = sum([1 for token in tokens if token in positive_list])
    neg_score = sum([-1 for token in tokens if token in negative_list]) * -1
    
    # Calculate polarity score
    polarity_score = (pos_score - neg_score) / ((pos_score + neg_score) + 0.000001)
    
    # Calculate the subjectivity score
    total_words = len(tokens)
    subjectivity_score = (pos_score + neg_score) / (total_words + 0.000001)
    
    # Return scores as a dictionary
    scores = {
        'positive_score': pos_score,
        'negative_score': neg_score,
        'polarity_score': polarity_score,
        'subjectivity_score': subjectivity_score
    }
    return scores


In [143]:
sentiment_scores = get_sentiment_scores(tokens)

In [144]:
sentiment_scores

{'positive_score': 58,
 'negative_score': 37,
 'polarity_score': 0.22105262925207758,
 'subjectivity_score': 0.16183986343809223}

### Analysis of Readability

In [112]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

def calculate_readability(text):
    sentences = sent_tokenize(text)
    words = word_tokenize(text.lower())
    words = [word for word in words if word not in stop_words]

    # Calculate average sentence length
    avg_sentence_length = len(words) / len(sentences)

    # Calculate percentage of complex words
    complex_words = [word for word in words if len(word) > 2 and len(set(word)) >= 2]
    pct_complex_words = len(complex_words) / len(words)

    # Calculate Fog Index
    fog_index = 0.4 * (avg_sentence_length + pct_complex_words)

    # Return dictionary of results
    results = {'avg_sentence_length': avg_sentence_length,
               'pct_complex_words': pct_complex_words,
               'fog_index': fog_index}

    return results


In [138]:
readability_vars_1 = calculate_readability(article_text)

In [139]:
readability_vars_1

{'avg_sentence_length': 9.632911392405063,
 'pct_complex_words': 0.7135348226018396,
 'fog_index': 4.1385784860027615}

### Average Number of Words Per Sentence

In [114]:
def average_words_per_sentence(text):
    sentences = nltk.sent_tokenize(text)
    words = nltk.word_tokenize(text)
    avg_words_per_sentence = len(words) / len(sentences)
    return avg_words_per_sentence

average_words_per_sentence(article_text)

20.620253164556964

### Complex Word Count

In [120]:
def complex_word_count(text):
    words = nltk.word_tokenize(text.lower())
    syllable_counts = syllable_count_per_word(text)
    complex_word_count = 0
    for i in range(len(words)):
        if syllable_counts[i] > 2:
            complex_word_count += 1
    return complex_word_count

complex_word_count(article_text)

184

### Word Count

In [115]:
from nltk.corpus import stopwords
import string

def word_count(text):
    words = nltk.word_tokenize(text.lower())
    words = [word for word in words if word not in stopwords.words('english')]
    words = [word for word in words if word not in string.punctuation]
    return len(words)

word_count(article_text)

768

### Syllable Count

In [133]:
def syllable_count_per_word(text):
    words = nltk.word_tokenize(text.lower())
    syllable_counts = []
    for word in words:
        vowel_count = 0
        prev_char = ''
        for char in word:
            if char in 'aeiouy' and prev_char not in 'aeiouy':
                vowel_count += 1
            prev_char = char
        if word.endswith('es') or word.endswith('ed'):
            vowel_count -= 1
        if vowel_count == 0:
            vowel_count = 1
        syllable_counts.append(vowel_count)
    return syllable_counts

def avg_syllable_count_per_word(text):
    words = nltk.word_tokenize(text.lower())
    syllable_counts = []
    for word in words:
        vowel_count = 0
        prev_char = ''
        for char in word:
            if char in 'aeiouy' and prev_char not in 'aeiouy':
                vowel_count += 1
            prev_char = char
        if word.endswith('es') or word.endswith('ed'):
            vowel_count -= 1
        if vowel_count == 0:
            vowel_count = 1
        syllable_counts.append(vowel_count)
    return sum(syllable_counts)/len(syllable_counts)


syl_list = syllable_count_per_word(article_text)

In [126]:
sum(syl_list)/len(syl_list)

1.452088452088452

### Personal Pronouns

In [127]:

def count_personal_pronouns(text):
    pattern = r"\b(I|we|my|ours|us)\b"
    matches = re.findall(pattern, text)

    # Remove matches that are the country name "US"
    matches = [match for match in matches if match != 'US']
 
    count = len(matches)
    return count

In [128]:
count_personal_pronouns(article_text)

6

### Avg Word Length

In [129]:
def calculate_avg_word_length(text):
    words = re.findall(r'\b\w+\b', text)
    sum_char_lengths = sum(len(word) for word in words)
    total_words = len(words)
    avg_word_length = sum_char_lengths / total_words

    return avg_word_length

In [130]:
calculate_avg_word_length(article_text)

4.716376306620209

### Readability Vars

In [141]:
readability_vars_2 = {
    'average_words_per_sentence': average_words_per_sentence(article_text),
    'complex_word_count': complex_word_count(article_text),
    'word_count': word_count(article_text),
    'avg_syllable_count_per_word': avg_syllable_count_per_word(article_text),
    'count_personal_pronouns': count_personal_pronouns(article_text),
    'calculate_avg_word_length': calculate_avg_word_length(article_text)
}

In [142]:
readability_vars_2

{'average_words_per_sentence': 20.620253164556964,
 'complex_word_count': 184,
 'word_count': 768,
 'avg_syllable_count_per_word': 1.452088452088452,
 'count_personal_pronouns': 6,
 'calculate_avg_word_length': 4.716376306620209}

In [145]:
sentiment_scores

{'positive_score': 58,
 'negative_score': 37,
 'polarity_score': 0.22105262925207758,
 'subjectivity_score': 0.16183986343809223}

In [148]:
readability_vars_1

{'avg_sentence_length': 9.632911392405063,
 'pct_complex_words': 0.7135348226018396,
 'fog_index': 4.1385784860027615}

In [149]:
readability_vars_2

{'average_words_per_sentence': 20.620253164556964,
 'complex_word_count': 184,
 'word_count': 768,
 'avg_syllable_count_per_word': 1.452088452088452,
 'count_personal_pronouns': 6,
 'calculate_avg_word_length': 4.716376306620209}

### Result List

In [150]:
res_list = [
    sentiment_scores['positive_score'],
    sentiment_scores['negative_score'],
    sentiment_scores['polarity_score'],
    sentiment_scores['subjectivity_score'],
    readability_vars_1['avg_sentence_length'],
    readability_vars_1['pct_complex_words'],
    readability_vars_1['fog_index'],
    readability_vars_2['average_words_per_sentence'],
    readability_vars_2['complex_word_count'],
    readability_vars_2['word_count'],
    readability_vars_2['avg_syllable_count_per_word'],
    readability_vars_2['count_personal_pronouns'],
    readability_vars_2['calculate_avg_word_length']
]

In [151]:
res_list

[58,
 37,
 0.22105262925207758,
 0.16183986343809223,
 9.632911392405063,
 0.7135348226018396,
 4.1385784860027615,
 20.620253164556964,
 184,
 768,
 1.452088452088452,
 6,
 4.716376306620209]

In [152]:
# In the next notebook [NB_2], I will make the process modular and iterate through the urls.