In [34]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import cmudict
pronouncing_dict = cmudict.dict()
#nltk.download('cmudict')

In [35]:
df = pd.read_excel('Input.xlsx')
df.head()

Unnamed: 0,URL_ID,URL
0,123.0,https://insights.blackcoffer.com/rise-of-telem...
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...
4,432.0,https://insights.blackcoffer.com/rise-of-telem...


In [36]:
df['POSITIVE SCORE'] = 0  
df['NEGATIVE SCORE'] = 0  
df['POLARITY SCORE'] = 0  
df['SUBJECTIVITY SCORE'] = 0 
df['AVG SENTENCE LENGTH'] = 0 
df['PERCENTAGE OF COMPLEX WORDS'] = 0 
df['FOG INDEX'] = 0
df['AVG NUMBER OF WORDS PER SENTENCE'] = 0
df['COMPLEX WORD COUNT'] = 0
df['WORD COUNT'] = 0
df['SYLLABLE PER WORD'] = 0
df['PERSONAL PRONOUNS'] = 0
df['AVG WORD LENGTH'] = 0
df.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,0,0,0,0,0,0,0,0,0,0,0,0,0
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,0,0,0,0,0,0,0,0,0,0,0,0,0
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,0,0,0,0,0,0,0,0,0,0,0,0,0


In [37]:
stopwords_path = 'stopwords'
stopwords_file = os.listdir(stopwords_path)
stop_words = []

for file in stopwords_file:
    if file.endswith('.txt'):
        file_path = os.path.join(stopwords_path, file)
    
        with open(file_path, 'r') as f:
            stop_words.extend(f.read().strip().lower().splitlines())

In [38]:
def createWordDict(filename):
    masterdict_path = 'master_dictionary'
    masterdict_files = os.listdir(masterdict_path)
    words = []
    
    for file in masterdict_files:
        if file == filename:
            file_path = os.path.join(masterdict_path, file)
    
            with open(file_path, 'r') as f:
                master_dictionary = f.read().strip().lower().splitlines()
        
                for word in master_dictionary:
                    if word.lower() not in stop_words:
                        words.append(word.strip())
    return words

In [39]:
positive_words = []
positive_words = createWordDict('positive-words.txt')

In [40]:
len(positive_words)

1907

In [41]:
negative_words = []
negative_words = createWordDict('negative-words.txt')

In [42]:
len(negative_words)

4693

In [43]:
output_directory = 'extracted_articles'
os.makedirs(output_directory, exist_ok=True)

In [44]:
for index,row in df.iterrows():
    url = row['URL']
    url_id = row['URL_ID']
    webpage = requests.get(url)
    if webpage.status_code == 404:
        title = 'Title not found'
        article_content = 'Page not found'
    else:
        soup = BeautifulSoup(webpage.text,'lxml')
        if (soup.find_all('h1')):
            title = soup.find_all('h1')[0].text
        else:
            title = 'Page not found'
            
        for j in soup.find_all('div', class_='td-post-content'):
            for pre in j.find_all('pre'):
                pre.extract() 
                article_content = j.text
            
    output_file = os.path.join(output_directory, f'{url_id}.txt')
    with open(output_file, 'w', encoding='utf-8') as text_file:
        text_file.write(f'{title}\n\n')
        text_file.write(f'{article_content}')

In [45]:
def remove_other_char(content):
    content = re.sub('[^a-zA-Z0-9]',' ',content)
    return content

def remove_url(content):
    return re.sub(r'http:\S+',' ',content)

def remove_stopwords(content):
    clean_data = []
    for i in content.split():
        if i.strip().lower() not in stop_words:
            clean_data.append(i.strip().lower())
    return " ".join(clean_data)

def personalPronouns(content):
    pattern = r'\b(?:I|we|my|ours|us)\b'
    matches = re.findall(pattern, content, flags=re.IGNORECASE)
    personal_pronouns = [match for match in matches if match != 'US']
    pronoun_count = len(personal_pronouns)
    return pronoun_count

def data_cleaning(content):
    content = remove_url(content)
    content = remove_other_char(content)
    content = remove_stopwords(content)
    return content

In [46]:
def calculatePosNegScore(tokens, words, symbol):
    score = 0
    for token in tokens:
        if (symbol == '+'):
            if (token.lower() in words):
                score += 1
        elif (symbol == '-'):
            if (token.lower() in words):
                score -= 1
    return abs(score)

def calculatePolarityScore(pos_score,neg_score):
    polarity_score = round((pos_score - neg_score)/ ((pos_score + neg_score) + 0.000001),2)
    return polarity_score

def calculateSubjectivityScore(pos_score,neg_score,word_count):
    subjectivity_score = round((pos_score + neg_score)/ ((word_count) + 0.000001),2)
    return subjectivity_score


def count_syllables(word):
    if word.lower() in pronouncing_dict:
        return max([len(list(y for y in x if y[-1].isdigit())) for x in pronouncing_dict[word.lower()]])
    else:
        return 1
    
def count_vowels(word):
    exceptions = ["es", "ed"]      
    vowels = "aeiouAEIOU"
    vowel_count = 0
    
    for char in word:
        if char in vowels:
            vowel_count += 1
        
    for exception in exceptions:
        if word.lower().endswith(exception):
            vowel_count = vowel_count - 1
    return vowel_count

In [47]:
output_directory = 'extracted_articles'

article_files = os.listdir(output_directory)

for filename in article_files:
    if filename.endswith('.txt'):
        file_path = os.path.join(output_directory, filename)
        index = df[df.URL_ID == float(filename.rstrip('.txt'))].index[0]
        
        with open(file_path, 'r', encoding='utf-8') as text_file:
            article_text = text_file.read()
            
            sentences = sent_tokenize(article_text)
            df.at[index, 'AVG SENTENCE LENGTH'] = len(sentences)
            df.at[index, 'PERSONAL PRONOUNS'] = personalPronouns(article_text)

            filtered_text = data_cleaning(article_text)
            tokens = word_tokenize(filtered_text)
            
            positive_score = df.at[index,'POSITIVE SCORE'] = calculatePosNegScore(tokens, positive_words, '+')
            negative_score = df.at[index,'NEGATIVE SCORE'] = calculatePosNegScore(tokens, negative_words, '-')
            df.at[index,'POLARITY SCORE'] = calculatePolarityScore(positive_score,negative_score)
            df.at[index,'SUBJECTIVITY SCORE'] = calculateSubjectivityScore(positive_score,negative_score,len(tokens))
            df.at[index,'AVG NUMBER OF WORDS PER SENTENCE'] = round(len(tokens)/len(sentences))
            df.at[index,'WORD COUNT'] = len(tokens)
            complex_word_count = df.at[index,'COMPLEX WORD COUNT'] = sum(1 for word in tokens if count_syllables(word) > 2)
            complex_words_percent = df.at[index,'PERCENTAGE OF COMPLEX WORDS'] = round(complex_word_count/len(tokens),2) 
            df.at[index,'FOG INDEX'] = round(0.4 * (len(sentences) + complex_words_percent),2)
            syllable_counts = df.at[index,'SYLLABLE PER WORD'] = sum([count_vowels(word) for word in tokens])
            
            total_characters = sum(len(word) for word in tokens)
            df.at[index,'AVG WORD LENGTH'] = round(total_characters/len(tokens),2)

In [48]:
df.head(10)

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,79,23,0.55,0.12,79,0.43,31.77,10,350,822,2532,2,7.83
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,38,13,0.49,0.19,24,0.46,9.78,11,127,274,839,3,8.11
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,19,27,-0.17,0.09,68,0.39,27.36,8,204,520,1310,3,7.23
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,34,26,0.13,0.09,59,0.41,23.76,11,257,633,1675,7,7.31
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,34,26,0.13,0.09,59,0.41,23.76,11,257,633,1675,7,7.31
5,2893.8,https://insights.blackcoffer.com/rise-of-chatb...,49,11,0.63,0.11,64,0.42,25.77,9,240,565,1501,5,7.53
6,3355.6,https://insights.blackcoffer.com/rise-of-e-hea...,31,10,0.51,0.08,44,0.42,17.77,12,231,546,1488,1,7.47
7,3817.4,https://insights.blackcoffer.com/how-does-mark...,57,6,0.81,0.08,95,0.47,38.19,9,385,822,2254,5,7.57
8,4279.2,https://insights.blackcoffer.com/how-advertise...,7,0,1.0,0.04,27,0.38,10.95,7,76,198,532,1,7.54
9,4741.0,https://insights.blackcoffer.com/negative-effe...,22,48,-0.37,0.14,49,0.47,19.79,10,238,506,1423,6,7.63


In [49]:
df.tail(10)

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
104,48612.0,https://insights.blackcoffer.com/impact-of-cov...,48,78,-0.24,0.12,65,0.34,26.14,16,358,1065,2364,9,6.44
105,49073.8,https://insights.blackcoffer.com/should-celebr...,53,19,0.47,0.09,64,0.41,25.76,12,330,798,2135,2,7.39
106,49535.6,https://insights.blackcoffer.com/how-prepared-...,16,25,-0.22,0.1,33,0.32,13.33,12,127,396,930,3,6.72
107,49997.4,https://insights.blackcoffer.com/how-will-covi...,41,95,-0.4,0.17,65,0.32,26.13,12,254,792,1907,7,6.83
108,50459.2,https://insights.blackcoffer.com/controversy-a...,31,50,-0.23,0.2,44,0.34,17.74,9,135,398,1015,5,6.97
109,50921.0,https://insights.blackcoffer.com/coronavirus-i...,5,28,-0.7,0.09,29,0.32,11.73,12,110,349,787,1,6.59
110,51382.8,https://insights.blackcoffer.com/coronavirus-i...,21,62,-0.49,0.1,49,0.26,19.7,16,207,802,1903,3,6.74
111,51844.6,https://insights.blackcoffer.com/what-are-the-...,86,31,0.47,0.13,71,0.36,28.54,13,320,889,2231,0,7.02
112,52306.4,https://insights.blackcoffer.com/marketing-dri...,85,32,0.45,0.13,70,0.36,28.14,13,319,887,2228,0,7.03
113,52768.2,https://insights.blackcoffer.com/continued-dem...,86,31,0.47,0.13,70,0.36,28.14,13,320,885,2229,0,7.04
