# Web Scraping

In [None]:
import requests
import html5lib
import bs4
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import chardet
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
nltk.download('punkt')
import string
from nltk.corpus import cmudict
nltk.download('cmudict')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
input = pd.read_excel('/content/drive/MyDrive/Blackcoffer/Input.xlsx')

In [None]:
URL = "https://insights.blackcoffer.com/online-gaming-adolescent-online-gaming-effects-demotivated-depression-musculoskeletal-and-psychosomatic-symptoms/"
raw_html = requests.get(URL)

soup = BeautifulSoup(raw_html.content, 'html5lib')

In [None]:
# removing unnecessary tags - script/style/footer/head
for i in soup.find_all(['script','style','footer','head']):
    i.decompose()

In [None]:
# Gather the text of every direct child of <body>, in document order
l = [node.get_text() for node in soup.find('body').contents]

# Glue the pieces together with '.' so they form one string
parsed_txt = '.'.join(l)
parsed_txt



In [None]:
# Collapse every run of whitespace (spaces, tabs, newlines) into a single space.
# The pattern is a raw string so `\s` reaches the regex engine untouched instead
# of being parsed as an (invalid) Python string escape.
cleaned_txt = re.sub(r'\s+', ' ', parsed_txt)
cleaned_txt.strip()



In [None]:
header_split_pattern = 'By Ajay Bidyarthy -'
footer_split_pattern = 'Blackcoffer Insights'

In [None]:
# Split at endpoints of the main body text
# endpoint 1 : By Ajay Bidyarthy -   # Before this all text is header
# endpoint 2 : Blackcoffer Insights  # After this all text is footer

header_split_text = re.split(header_split_pattern,cleaned_txt)
body_text_without_header = header_split_text[-1]

footer_split_text = re.split(footer_split_pattern,body_text_without_header)
main_text = footer_split_text[0].strip() # main body text without header and footer (Article Title and Text)

In [None]:
main_text



In [None]:
# removing date and views which are
# at the beginning of article text
article_text = re.sub(r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}\s+\d+','',main_text)
article_text



In [None]:
# extracting title through h1
article_title = soup.find('h1').text
article_title

'Online gaming: Adolescent online gaming effects demotivated, depression, musculoskeletal, and psychosomatic symptoms.'

In [None]:
# Whole article text
whole_text = article_title+article_text

In [None]:
whole_text



In [None]:
# Specify the file path
file_path = "example.txt"

# Open the file in write mode ('w')
with open(file_path, 'w') as file:
    # Write the text to the file
    file.write(whole_text)



---



---



In [None]:
output_folder = '/content/drive/MyDrive/Blackcoffer/Extracted Text/'

In [None]:
header_split_pattern = 'By Ajay Bidyarthy -'
footer_split_pattern = 'Blackcoffer Insights'

# Scrape every row of the input sheet and write "<URL_ID>.txt"
# (title followed by article text) into output_folder.
for i in range(len(input)):
    file_name = input['URL_ID'][i]
    URL = input['URL'][i]

    raw_html = requests.get(URL)
    if raw_html.status_code == 404:
        print(f"{URL} with {file_name} URL_ID doesn't exist")
        continue

    soup = BeautifulSoup(raw_html.content, 'html5lib')
    # removing unnecessary tags - script/style/footer/head
    # (named `tag` so the row index `i` of the outer loop is not clobbered)
    for tag in soup.find_all(['script','style','footer','head']):
        tag.decompose()

    l = []
    # collect the text of every direct child of <body>
    for text in soup.find('body').contents:
        txt = text.get_text()
        l.append(txt)

    parsed_txt = '.'.join(l) # list to string

    # remove extra white spaces \s \n \t (raw string: `\s` is a regex escape,
    # not a Python string escape) and keep the stripped result
    cleaned_txt = re.sub(r'\s+',' ',parsed_txt)
    cleaned_txt = cleaned_txt.strip()

    # Split at endpoints of the main body text
    # endpoint 1 : By Ajay Bidyarthy -   # Before this all text is header
    # endpoint 2 : Blackcoffer Insights  # After this all text is footer

    header_split_text = re.split(header_split_pattern,cleaned_txt)
    body_text_without_header = header_split_text[-1]

    footer_split_text = re.split(footer_split_pattern,body_text_without_header)
    main_text = footer_split_text[0].strip() # main body text without header and footer (Article Text)

    # removing date and views which are at the
    # beginning of article text right after the title
    article_text = re.sub(r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}\s+\d+','',main_text)

    # extracting title through h1
    article_title = soup.find('h1').text

    punctuation_title = article_title[-1]

    # Whole article text: make sure the title ends a sentence before the body starts
    if punctuation_title == '.' or punctuation_title == '?' or punctuation_title == '!':
        whole_text = article_title + article_text
    else:
        whole_text = article_title + '.' + article_text

    # Specify the file path
    file_path = os.path.join(output_folder,f"{file_name}.txt")

    # Write as UTF-8 so the result does not depend on the platform default encoding
    with open(file_path, 'w', encoding='utf-8') as file:
        # Write the text to the file
        file.write(whole_text)

https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/ with blackassign0036 URL_ID doesn't exist
https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/ with blackassign0049 URL_ID doesn't exist


In [None]:
headers_not_trimmed = [14,20,29,43,92,99,100]

In [None]:
header_split_pattern = 'By Ajay Bidyarthy'
footer_split_pattern = 'Blackcoffer Insights'

# Re-scrape the articles whose header was not removed by the common pattern.
# The entries of headers_not_trimmed are 1-based row numbers, hence `i-1`.
for i in headers_not_trimmed:
    file_name = input['URL_ID'][i-1]
    URL = input['URL'][i-1]

    print(file_name)

    raw_html = requests.get(URL)
    if raw_html.status_code == 404:
        print(f"{URL} with {file_name} URL_ID doesn't exist")
        continue

    soup = BeautifulSoup(raw_html.content, 'html5lib')
    # removing unnecessary tags - script/style/footer/head
    # (named `tag` so the row number `i` of the outer loop is not clobbered)
    for tag in soup.find_all(['script','style','footer','head']):
        tag.decompose()

    l = []
    # collect the text of every direct child of <body>
    for text in soup.find('body').contents:
        txt = text.get_text()
        l.append(txt)

    parsed_txt = '.'.join(l) # list to string

    # remove extra white spaces \s \n \t (raw string: `\s` is a regex escape,
    # not a Python string escape) and keep the stripped result
    cleaned_txt = re.sub(r'\s+',' ',parsed_txt)
    cleaned_txt = cleaned_txt.strip()

    # Split at endpoints of the main body text
    # endpoint 1 : By Ajay Bidyarthy     # Before this all text is header
    # endpoint 2 : Blackcoffer Insights  # After this all text is footer

    header_split_text = re.split(header_split_pattern,cleaned_txt)
    body_text_without_header = header_split_text[-1]

    footer_split_text = re.split(footer_split_pattern,body_text_without_header)
    main_text = footer_split_text[0].strip() # main body text without header and footer (Article Text)

    # removing date, views and social media handles which
    # are at the beginning of article text right after the title
    article_text = re.sub(r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}\s+\d+\s+\d+\s+Share FacebookTwitterPinterestWhatsApp','',main_text)

    # extracting title through h1
    article_title = soup.find('h1').text

    punctuation_title = article_title[-1]

    # Whole article text: make sure the title ends a sentence before the body starts
    if punctuation_title == '.' or punctuation_title == '?' or punctuation_title == '!':
        whole_text = article_title + article_text
    else:
        whole_text = article_title + '.' + article_text

    # Specify the file path
    file_path = os.path.join(output_folder,f"{file_name}.txt")

    # Write as UTF-8 so the result does not depend on the platform default encoding
    with open(file_path, 'w', encoding='utf-8') as file:
        # Write the text to the file
        file.write(whole_text)

blackassign0014
blackassign0020
blackassign0029
blackassign0043
blackassign0092
blackassign0099
blackassign0100


In [None]:
footers_not_trimmed = [46,47,93]

In [None]:
header_split_pattern = 'By Ajay Bidyarthy -'
footer_split_pattern = 'RELATED ARTICLESMORE'

# Re-scrape the articles whose footer was not removed by the common pattern.
# The entries of footers_not_trimmed are 1-based row numbers, hence `i-1`.
for i in footers_not_trimmed:
    file_name = input['URL_ID'][i-1]
    URL = input['URL'][i-1]

    print(file_name)

    raw_html = requests.get(URL)
    if raw_html.status_code == 404:
        print(f"{URL} with {file_name} URL_ID doesn't exist")
        continue

    soup = BeautifulSoup(raw_html.content, 'html5lib')
    # removing unnecessary tags - script/style/footer/head
    # (named `tag` so the row number `i` of the outer loop is not clobbered)
    for tag in soup.find_all(['script','style','footer','head']):
        tag.decompose()

    l = []
    # collect the text of every direct child of <body>
    for text in soup.find('body').contents:
        txt = text.get_text()
        l.append(txt)

    parsed_txt = '.'.join(l) # list to string

    # remove extra white spaces \s \n \t (raw string: `\s` is a regex escape,
    # not a Python string escape) and keep the stripped result
    cleaned_txt = re.sub(r'\s+',' ',parsed_txt)
    cleaned_txt = cleaned_txt.strip()

    # Split at endpoints of the main body text
    # endpoint 1 : By Ajay Bidyarthy -   # Before this all text is header
    # endpoint 2 : RELATED ARTICLESMORE  # After this all text is footer

    header_split_text = re.split(header_split_pattern,cleaned_txt)
    body_text_without_header = header_split_text[-1]

    footer_split_text = re.split(footer_split_pattern,body_text_without_header)
    main_text = footer_split_text[0].strip() # main body text without header and footer (Article Text)

    # removing date and views which are at the
    # beginning of article text right after the title
    article_text = re.sub(r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}\s+\d+','',main_text)

    # extracting title through h1
    article_title = soup.find('h1').text

    punctuation_title = article_title[-1]

    # Whole article text: make sure the title ends a sentence before the body starts
    if punctuation_title == '.' or punctuation_title == '?' or punctuation_title == '!':
        whole_text = article_title + article_text
    else:
        whole_text = article_title + '.' + article_text

    # Specify the file path
    file_path = os.path.join(output_folder,f"{file_name}.txt")

    # Write as UTF-8 so the result does not depend on the platform default encoding
    with open(file_path, 'w', encoding='utf-8') as file:
        # Write the text to the file
        file.write(whole_text)

blackassign0046
blackassign0047
blackassign0093


In [None]:
headers_and_footers_not_trimmed = [83,84]

In [None]:
header_split_pattern = 'Share FacebookTwitterPinterestWhatsApp'
footer_split_pattern = 'Share FacebookTwitterPinterestWhatsApp'

# Re-scrape the articles where neither header nor footer matched the common
# pattern. The entries of the list are 1-based row numbers, hence `i-1`.
for i in headers_and_footers_not_trimmed:
    file_name = input['URL_ID'][i-1]
    URL = input['URL'][i-1]

    print(file_name)

    raw_html = requests.get(URL)
    if raw_html.status_code == 404:
        print(f"{URL} with {file_name} URL_ID doesn't exist")
        continue

    soup = BeautifulSoup(raw_html.content, 'html5lib')
    # removing unnecessary tags - script/style/footer/head
    # (named `tag` so the row number `i` of the outer loop is not clobbered)
    for tag in soup.find_all(['script','style','footer','head']):
        tag.decompose()

    l = []
    # collect the text of every direct child of <body>
    for text in soup.find('body').contents:
        txt = text.get_text()
        l.append(txt)

    parsed_txt = '.'.join(l) # list to string

    # remove extra white spaces \s \n \t (raw string: `\s` is a regex escape,
    # not a Python string escape) and keep the stripped result
    cleaned_txt = re.sub(r'\s+',' ',parsed_txt)
    cleaned_txt = cleaned_txt.strip()

    # Split at endpoints of the main body text
    # endpoint 1 : Share FacebookTwitterPinterestWhatsApp  # Before this all text is header
    # endpoint 2 : Share FacebookTwitterPinterestWhatsApp  # After this all text is footer

    header_split_text = re.split(header_split_pattern,cleaned_txt)
    # the same marker is used for both ends, so the body is the piece
    # right after its first occurrence (index 1), not the last piece
    body_text_without_header = header_split_text[1]

    footer_split_text = re.split(footer_split_pattern,body_text_without_header)
    main_text = footer_split_text[0].strip() # main body text without header and footer (Article Text)

    # no text to remove at the beginning of the article, so the body is used
    # as is (the former re.sub with an empty pattern was a no-op)
    article_text = main_text

    # extracting title through h1
    article_title = soup.find('h1').text

    punctuation_title = article_title[-1]

    # Whole article text: make sure the title ends a sentence before the body starts
    if punctuation_title == '.' or punctuation_title == '?' or punctuation_title == '!':
        whole_text = article_title + article_text
    else:
        whole_text = article_title + '.' + article_text

    # Specify the file path
    file_path = os.path.join(output_folder,f"{file_name}.txt")

    # Write as UTF-8 so the result does not depend on the platform default encoding
    with open(file_path, 'w', encoding='utf-8') as file:
        # Write the text to the file
        file.write(whole_text)

blackassign0083
blackassign0084




---



---



---



In [None]:
def Extract_Text(header_split_pattern,footer_split_pattern,begin_clean_pattern,file_index_list,input,code):
    """Scrape the listed articles and save each one as "<URL_ID>.txt".

    For every requested row the page is downloaded, script/style/footer/head
    tags are dropped, the remaining body text is cut down to the part between
    the header and footer markers, and the <h1> title plus that text is
    written into the module-level ``output_folder``.

    Parameters
    ----------
    header_split_pattern : str
        Regular expression passed to ``re.split``; text before it is header.
    footer_split_pattern : str
        Regular expression passed to ``re.split``; text after it is footer.
    begin_clean_pattern : str
        Regular expression whose matches are deleted from the article text.
    file_index_list : iterable of int
        1-based row numbers into ``input`` (row ``i`` is read at ``i-1``).
    input : table indexable as ``input['URL_ID'][k]`` and ``input['URL'][k]``
        The sheet of URL ids and URLs.
    code : int
        1 -> keep the piece right after the first header marker (used when the
        same marker delimits header and footer); anything else -> keep the
        piece after the last header marker.

    Returns
    -------
    None. URLs answering 404 are reported with ``print`` and skipped.
    """

    for i in file_index_list:
        file_name = input['URL_ID'][i-1]
        URL = input['URL'][i-1]

        print(file_name)

        raw_html = requests.get(URL)
        if raw_html.status_code == 404:
            print(f"{URL} with {file_name} URL_ID doesn't exist")
            continue

        soup = BeautifulSoup(raw_html.content, 'html5lib')
        # removing unnecessary tags - script/style/footer/head
        # (named `tag` so the row number `i` of the outer loop is not clobbered)
        for tag in soup.find_all(['script','style','footer','head']):
            tag.decompose()

        # text of every direct child of <body>, joined into one string
        l = []
        for text in soup.find('body').contents:
            txt = text.get_text()
            l.append(txt)

        parsed_txt = '.'.join(l) # list to string

        # remove extra white spaces \s \n \t (raw string: `\s` is a regex
        # escape, not a Python string escape) and keep the stripped result
        cleaned_txt = re.sub(r'\s+',' ',parsed_txt)
        cleaned_txt = cleaned_txt.strip()

        # Split at endpoints of the main body text
        # endpoint 1 : header_split_pattern  # Before this all text is header
        # endpoint 2 : footer_split_pattern  # After this all text is footer

        header_split_text = re.split(header_split_pattern,cleaned_txt)
        if code == 1: # Both header & footer diff from common pattern
            body_text_without_header = header_split_text[1]
        else:
            body_text_without_header = header_split_text[-1]

        footer_split_text = re.split(footer_split_pattern,body_text_without_header)
        main_text = footer_split_text[0].strip() # main body text without header and footer (Article Text)

        # removing date and views which are at the
        # beginning of article text right after the title
        article_text = re.sub(begin_clean_pattern,'',main_text)

        # extracting title through h1; a page without <h1> yields an empty title
        # instead of raising AttributeError
        title_tag = soup.find('h1')
        article_title = title_tag.text if title_tag is not None else ''

        # Whole article text: make sure a non-empty title ends a sentence
        # before the body starts (an empty title is simply omitted)
        if not article_title or article_title.endswith(('.', '?', '!')):
            whole_text = article_title + article_text
        else:
            whole_text = article_title + '.' + article_text

        # Specify the file path
        file_path = os.path.join(output_folder,f"{file_name}.txt")

        # Write as UTF-8 so the result does not depend on the platform default encoding
        with open(file_path, 'w', encoding='utf-8') as file:
            # Write the text to the file
            file.write(whole_text)

In [None]:
# code == 0 : General pattern/Header different from general/Footer different from general
# code == 1 : Both Header and Footer different from general

In [None]:
# General or common pattern

# Every row of the sheet, numbered from 1
file_indx_list = list(range(1, len(input) + 1))
# Markers that delimit the article body on most pages
header_split_pattern = 'By Ajay Bidyarthy -'
footer_split_pattern = 'Blackcoffer Insights'
# "<Month> <day>, <year> <views>" prefix that precedes the article text
begin_clean_pattern = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}\s+\d+'

In [None]:
Extract_Text(header_split_pattern,footer_split_pattern,begin_clean_pattern,file_indx_list,input,0)

blackassign0001
blackassign0002
blackassign0003
blackassign0004
blackassign0005
blackassign0006
blackassign0007
blackassign0008
blackassign0009
blackassign0010
blackassign0011
blackassign0012
blackassign0013
blackassign0014
blackassign0015
blackassign0016
blackassign0017
blackassign0018
blackassign0019
blackassign0020
blackassign0021
blackassign0022
blackassign0023
blackassign0024
blackassign0025
blackassign0026
blackassign0027
blackassign0028
blackassign0029
blackassign0030
blackassign0031
blackassign0032
blackassign0033
blackassign0034
blackassign0035
blackassign0036
https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/ with blackassign0036 URL_ID doesn't exist
blackassign0037
blackassign0038
blackassign0039
blackassign0040
blackassign0041
blackassign0042
blackassign0043
blackassign0044
blackassign0045
blackassign0046
blackassign0047
blackassign0048
blackassign0049
https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-fut

In [None]:
# For the URLs with URL_ID's - 14,20,29,43,92,99,100
# Headers are different from common pattern

headers_not_trimmed = [14,20,29,43,92,99,100]
header_split_pattern = 'By Ajay Bidyarthy'
footer_split_pattern = 'Blackcoffer Insights'
begin_clean_pattern = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}\s+\d+\s+\d+\s+Share FacebookTwitterPinterestWhatsApp'

In [None]:
Extract_Text(header_split_pattern,footer_split_pattern,begin_clean_pattern,headers_not_trimmed,input,0)

blackassign0014
blackassign0020
blackassign0029
blackassign0043
blackassign0092
blackassign0099
blackassign0100


In [None]:
# For the URLs with URL_ID's - 46,47,93
# Footers are different from common pattern

footers_not_trimmed = [46,47,93]
header_split_pattern = 'By Ajay Bidyarthy -'
footer_split_pattern = 'RELATED ARTICLESMORE'
begin_clean_pattern = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}\s+\d+'

In [None]:
Extract_Text(header_split_pattern,footer_split_pattern,begin_clean_pattern,footers_not_trimmed,input,0)

blackassign0046
blackassign0047
blackassign0093


In [None]:
# For the URLs with URL_ID's - 83,84
# Both Headers & Footers are different from common pattern

headers_and_footers_not_trimmed = [83,84]
header_split_pattern = 'Share FacebookTwitterPinterestWhatsApp'
footer_split_pattern = 'Share FacebookTwitterPinterestWhatsApp'
begin_clean_pattern = r''

In [None]:
Extract_Text(header_split_pattern,footer_split_pattern,begin_clean_pattern,headers_and_footers_not_trimmed,input,1)

blackassign0083
blackassign0084




---



---



# Sentiment Analysis

In [None]:
MasterDictionary = '/content/drive/MyDrive/Blackcoffer/MasterDictionary/'
StopWords_path = '/content/drive/MyDrive/Blackcoffer/StopWords/'

In [None]:
os.listdir(MasterDictionary)

['negative-words.txt', 'positive-words.txt']

In [None]:
positive_words_list_path = os.path.join(MasterDictionary,'positive-words.txt')
negative_words_list_path = os.path.join(MasterDictionary,'negative-words.txt')

In [None]:
positive = set()
negative = set()

In [None]:
def make_set(file_path,set_name):
    """Add every line of ``file_path`` (lower-cased, newline removed) to ``set_name``.

    The file encoding is guessed from the raw bytes with chardet before the
    file is re-opened as text. ``set_name`` is modified in place.
    """
    # First pass: raw bytes, only to guess the encoding
    with open(file_path, 'rb') as raw_file:
        detected = chardet.detect(raw_file.read())
    encoding = detected['encoding']

    # Second pass: decode line by line
    with open(file_path, 'r', encoding=encoding) as text_file:
        for line in text_file:
            set_name.add(line.lower().replace('\n', ''))

In [None]:
make_set(positive_words_list_path,positive)

In [None]:
make_set(negative_words_list_path,negative)

In [None]:
def find_stopWords(stopwords_path):
    """Build one lower-cased stop-word set from every file in ``stopwords_path``.

    Each line contributes the text before the first '|' (if any), stripped of
    surrounding whitespace. The encoding of each file is guessed with chardet.

    Parameters
    ----------
    stopwords_path : str
        Directory containing the stop-word list files.

    Returns
    -------
    set of str
    """
    set_stopWords = set()

    for stop_word_file in os.listdir(stopwords_path):
        stop_word_file_path = os.path.join(stopwords_path,stop_word_file)

        # os.listdir also returns sub-directories (for example the
        # .ipynb_checkpoints folder Jupyter creates); open() would fail on them
        if not os.path.isfile(stop_word_file_path):
            continue

        with open(stop_word_file_path, 'rb') as f:
            encoding = chardet.detect(f.read())['encoding']

        with open(stop_word_file_path, 'r', encoding=encoding) as f:
            for word_newline in f:
                # keep only the part before '|'; a plain str.split avoids the
                # invalid '\|' string escape, and strip() already removes the
                # trailing newline
                word = word_newline.split('|')[0].strip().lower()
                set_stopWords.add(word)

    return set_stopWords

In [None]:
set_stopWords = find_stopWords(StopWords_path)

In [None]:
# Positive words that are not also stop words
positive_dictionary = positive - set_stopWords

In [None]:
# Negative words that are not also stop words
negative_dictionary = negative - set_stopWords

In [None]:
# Guess the encoding of one extracted article from its raw bytes ...
with open('/content/drive/MyDrive/Blackcoffer/Extracted Text/blackassign0037.txt', 'rb') as f:
    detection = chardet.detect(f.read())
encoding = detection['encoding']

# ... then read the same file as text with that encoding
with open('/content/drive/MyDrive/Blackcoffer/Extracted Text/blackassign0037.txt','r',encoding=encoding) as file:
    text = file.read()

Word and Sentence Tokens

In [None]:
def word_tokenize(text):
    """Split ``text`` into word tokens, dropping punctuation and whitespace.

    The text is cut at every run of the characters . , ! & ? ; : - ( ) "
    or whitespace; empty pieces are discarded.
    """
    separators = r"[.,!&?;:\-\s()\"]+"
    # filter(None, ...) drops the empty strings left at the edges of the split
    return list(filter(None, re.split(separators, text)))

In [None]:
word_tokens = word_tokenize(text)
sent_tokens = sent_tokenize(text, language='english')
sent_count = len(sent_tokens)
word_tokens_count = len(word_tokens)

In [None]:
word_tokens_count,sent_count

(732, 36)

Positive and Negative score

In [None]:
def pos_and_neg_score(tokens_list):
    """Count tokens found in the positive and in the negative dictionary.

    Lookup is case-insensitive. A token present in both dictionaries counts
    as positive only. Returns ``(positive_score, negative_score)``, both >= 0.
    """
    positive_hits = 0
    negative_hits = 0
    for token in tokens_list:
        lowered = token.lower()
        if lowered in positive_dictionary:
            positive_hits += 1
        elif lowered in negative_dictionary:
            negative_hits += 1
    return positive_hits, negative_hits

In [None]:
positive_score,negative_score = pos_and_neg_score(word_tokens)

In [None]:
positive_score,negative_score

(32, 13)

Polarity score

In [None]:
def polarity_score(positive_score,negative_score):
    """Return (positive - negative) / (positive + negative + 0.000001)."""
    difference = positive_score - negative_score
    # the small constant keeps the denominator non-zero when both scores are 0
    total = (positive_score + negative_score) + 0.000001
    return difference / total

In [None]:
polarity_score(positive_score,negative_score)

0.4222222128395064

Word count (cleaned words)

In [None]:
def clean_words(word_tokens):
    """Return how many tokens remain after removing NLTK English stop words.

    Punctuation is expected to be removed already by the tokenizer. The NLTK
    stop-word list is lower-case, so tokens are lower-cased for the comparison;
    otherwise capitalised stop words such as "The" or "It" would be counted.
    """
    from nltk.corpus import stopwords
    # a set gives constant-time membership tests instead of scanning a list
    stopwords_nltk = set(stopwords.words('english'))

    # Punctuations already removed
    # Stopword removal using NLTK (case-insensitive)
    words_cleaned = [w for w in word_tokens if w.lower() not in stopwords_nltk]

    words_count = len(words_cleaned)
    return words_count

In [None]:
words_count = clean_words(word_tokens)
words_count

446

Subjectivity score

In [None]:
def subjectivity_score(positive_score,negative_score,words_count):
    """Return (positive + negative) / (words_count + 0.000001)."""
    sentiment_words = positive_score + negative_score
    # the small constant keeps the denominator non-zero for an empty text
    return sentiment_words / (words_count + 0.000001)

In [None]:
subjectivity_score(positive_score,negative_score,words_count)

0.10089686076032094

Average sentence length / Average number of words per sentence

In [None]:
def average_sentence_length(word_tokens_count,sent_count):
    """Return the average number of words per sentence.

    Returns 0.0 when ``sent_count`` is 0 (e.g. an empty extracted file)
    instead of raising ZeroDivisionError.
    """
    if sent_count == 0:
        return 0.0
    average_sent_length = word_tokens_count/sent_count
    return average_sent_length

In [None]:
avg_sentence_length = average_sentence_length(word_tokens_count,sent_count)
avg_no_of_words_per_sentence = average_sentence_length(word_tokens_count,sent_count)

In [None]:
avg_sentence_length

20.333333333333332

Syllable count & Complex words

In [None]:
syllable_dict = cmudict.dict()

In [None]:
def count_syllables(word):
    """Return the syllable count of ``word`` from the pronouncing dictionary.

    The lookup uses the lower-cased word; words missing from
    ``syllable_dict`` yield 0. Only the first listed pronunciation is used,
    and a syllable is any phoneme whose last character is a digit.
    """
    pronunciations = syllable_dict.get(word.lower())
    if pronunciations is None:
        return 0
    first_pronunciation = pronunciations[0]
    return sum(1 for phoneme in first_pronunciation if phoneme[-1].isdigit())

def is_complex(word):
    """Return True when ``word`` has more than two syllables."""
    return count_syllables(word) > 2

def count_complex_words(words):
    """Return ``(total_syllables, number_of_complex_words)`` for ``words``."""
    # syllables summed over the lower-cased form of every word
    total_syllable = sum(count_syllables(word.lower()) for word in words)
    # words for which is_complex() is true
    num_complex_words = sum(1 for word in words if is_complex(word))
    return total_syllable,num_complex_words

In [None]:
total_syllable_count,complex_word_count = count_complex_words(word_tokens)
total_syllable_count,complex_word_count

(1078, 87)

Percentage of Complex words

In [None]:
def percent_complex_words(complex_word_count,word_tokens_count):
    """Return the share of complex words as a fraction (complex / total words).

    Returns 0.0 when ``word_tokens_count`` is 0 (e.g. an empty extracted
    file) instead of raising ZeroDivisionError.
    """
    if word_tokens_count == 0:
        return 0.0
    percent_of_complex_words = complex_word_count/word_tokens_count
    return percent_of_complex_words

In [None]:
percent_of_complex_words = percent_complex_words(complex_word_count,word_tokens_count)
percent_of_complex_words

0.11885245901639344

Fog index

In [None]:
def fog_index(average_sentence_length,percent_of_complex_words):
    """Return 0.4 * (average sentence length + share of complex words)."""
    readability_sum = average_sentence_length + percent_of_complex_words
    return 0.4*(readability_sum)

In [None]:
fog_index(avg_sentence_length,percent_of_complex_words)

8.18087431693989

Personal Pronouns

In [None]:
def count_personal_pronouns(word_tokens):
    """Count occurrences of the pronouns I, we, my, ours and us in the tokens.

    Matching is case-insensitive and whole-word, with one exception: the
    exact upper-case form "US" is treated as the country abbreviation and is
    not counted. (The former look-behind did not achieve this, because with
    IGNORECASE the alternative ``us`` still matched "US".)
    """
    pronoun_pattern = re.compile(r"\b(I|we|my|ours|us)\b", flags=re.IGNORECASE)

    total_personal_pronouns = 0
    for word in word_tokens:
        # findall returns the matched text of the single group for each hit
        matches = [m for m in pronoun_pattern.findall(word) if m != "US"]
        total_personal_pronouns += len(matches)
    return total_personal_pronouns

In [None]:
count_personal_pronouns(word_tokens)

2

Average Word Length

In [None]:
def average_word_length(word_tokens):
    """Return the mean number of characters per token.

    Returns 0.0 for an empty token list (e.g. an empty extracted file)
    instead of raising ZeroDivisionError.
    """
    if not word_tokens:
        return 0.0

    word_length = sum(len(word) for word in word_tokens)

    avg_word_length = word_length/len(word_tokens)

    return avg_word_length

In [None]:
average_word_length(word_tokens)

4.796448087431694

In [None]:
Extracted_text_path = '/content/drive/MyDrive/Blackcoffer/Extracted Text/'

In [None]:
syllable_dict = cmudict.dict()

# Compute and print all text metrics for every article that was extracted.
for i in range(len(input)):
    file_name = input['URL_ID'][i]
    URL = input['URL'][i]

    text_file_path = os.path.join(Extracted_text_path,file_name+'.txt')

    # URLs that answered 404 during scraping never got a text file, so the
    # missing file identifies them; this avoids downloading every page again
    # just to decide whether to skip it.
    if not os.path.isfile(text_file_path):
        print(f"{URL} with {file_name} URL_ID doesn't exist")
        continue

    with open(text_file_path, 'rb') as f:
        encoding = chardet.detect(f.read())['encoding']

    # Open the file in read mode ('r')
    with open(text_file_path,'r',encoding=encoding) as file:
        text = file.read()

    # Tokenization of words and sentences
    word_tokens = word_tokenize(text)
    sent_tokens = sent_tokenize(text, language='english')

    # Count of word and sentence tokens
    sent_count = len(sent_tokens)
    word_tokens_count = len(word_tokens)

    # Count of cleaned words (after removal of stopwords and punctuations)
    words_count = clean_words(word_tokens)

    # Positive and Negative Scores
    positive_Score,negative_Score = pos_and_neg_score(word_tokens)

    # Polarity Score
    polarity_Score = round(polarity_score(positive_Score,negative_Score),4)

    # Subjectivity Score
    subjectivity_Score = round(subjectivity_score(positive_Score,negative_Score,words_count),4)

    # Average sentence length
    avg_sentence_length = round(average_sentence_length(word_tokens_count,sent_count),4)

    # Average no of words per sentence (same quantity as the average sentence length)
    avg_no_of_words_per_sentence = round(average_sentence_length(word_tokens_count,sent_count),4)

    # Syllable count and Complex word count
    total_syllable_count, complex_word_count = count_complex_words(word_tokens)

    # Percentage of complex words
    percent_of_complex_words = round(percent_complex_words(complex_word_count,word_tokens_count),4)

    # Fog Index
    fog_Index = round(fog_index(avg_sentence_length,percent_of_complex_words),4)

    # Personal Pronouns
    total_Personal_Pronouns = count_personal_pronouns(word_tokens)

    # Average word length
    avg_word_length = round(average_word_length(word_tokens),4)

    print(f'''File: {file_name+'.txt'},
              Positive Score: {positive_Score}, Negative Score: {negative_Score}, Polarity Score: {polarity_Score}, Subjectivity Score: {subjectivity_Score},
              Average sentence length: {avg_sentence_length}, Percentage of complex words: {percent_of_complex_words}, Fog Index: {fog_Index},
              Average no of words per sentence:{avg_no_of_words_per_sentence}, Complex Word count: {complex_word_count}, Word Count: {words_count},
              Syllable count: {total_syllable_count}, Personal Pronouns: {total_Personal_Pronouns}, Average Word length: {avg_word_length}
            ''')

File: blackassign0001.txt,
              Positive Score: 27, Negative Score: 6, Polarity Score: 0.6364, Subjectivity Score: 0.0445,
              Average sentence length: 15.7564, Percentage of complex words: 0.1229, Fog Index: 6.3517, 
              Average no of words per sentence:15.7564, Complex Word count: 151, Word Count: 742, 
              Syllable count: 1826, Personal Pronouns: 12, Average Word length: 4.5574
            
File: blackassign0002.txt,
              Positive Score: 52, Negative Score: 31, Polarity Score: 0.253, Subjectivity Score: 0.0888,
              Average sentence length: 18.575, Percentage of complex words: 0.2254, Fog Index: 7.5202, 
              Average no of words per sentence:18.575, Complex Word count: 335, Word Count: 935, 
              Syllable count: 2657, Personal Pronouns: 6, Average Word length: 5.426
            
File: blackassign0003.txt,
              Positive Score: 36, Negative Score: 25, Polarity Score: 0.1803, Subjectivity Score: 0.0841,