In [1]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: nltk
Successfully installed nltk-3.9.1


In [60]:
import os
import pandas as pd
import nltk
import textstat
import re

In [47]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

# Creating a set of stop words that is removed from the entire corpus of text
stop_words = set(stopwords.words('english'))

with open('positive-words.txt', 'r') as file:
    pos_words = file.read().splitlines()

with open('negative-words.txt', 'r') as file:
    neg_words = file.read().splitlines()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/anuragprasad/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [48]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [49]:
keys = ['positive', 'negative']
values = [pos_words, neg_words]
master_dict = dict(zip(keys,values))

In [50]:
data = pd.read_csv('Input.xlsx - Sheet1.csv')

In [51]:
content = []
for file in os.listdir('scraped_articles'):

    with open(f'scraped_articles/{file}', 'r') as file:
        article = file.read()
    content.append(article)
data['content'] = content

In [52]:
# This function calculates metrics 1-4:
def metrics_1_4(data):
    for i in range(len(data)):
        text_tokens = word_tokenize(data.iloc[i]['content'])
        filtered_words = [w for w in text_tokens]
        filtered_text = ' '.join(filtered_words)

        pos_score = 0
        neg_score = 0
        for word in filtered_text.split(' '):
            if word in pos_words:
                pos_score += 1
            elif word in neg_words:
                neg_score += 1

        neg_score *= -1
        data.at[i, 'POSITIVE SCORE'] = pos_score
        data.at[i, 'NEGATIVE SCORE'] = neg_score
        
        total_score = pos_score + neg_score
        data.at[i, 'POLARITY SCORE'] = (pos_score - neg_score)/(total_score + 0.000001)
        data.at[i, 'SUBJECTIVITY SCORE'] = total_score/(len(filtered_words) + 0.000001)

In [53]:
# This finds all the attributes related to fog index and metrics 5-10
def metrics_5_10(data):
    for i in range(len(data)):
        article_text = data.iloc[i]['content']
        text_tokens = word_tokenize(article_text)
        text_words = [word for word in text_tokens if word.strip() and word.isalpha()]
        sentences = re.split(r'[.!?]+', article_text)
        sentences = [s for s in sentences if s.strip()]

       # Finding complex words with more than 1 syllable using textstat library
        complex_words = [word for word in text_tokens if textstat.syllable_count(word)>1]
        
        # Average sentence length involved calculating the extra characters, punctuation as well
        avg_sentence_length = len(article_text)/len(sentences)
        complex_word_percent = len(complex_words)/len(text_tokens)
        avg_words_per_sentence = len(text_words)/len(sentences)

        data.at[i,'AVG SENTENCE LENGTH'] = avg_sentence_length
        data.at[i, 'PERCENTAGE OF COMPLEX WORDS'] = complex_word_percent
        data.at[i, 'FOG INDEX'] = 0.4*(avg_sentence_length+complex_word_percent)
        data.at[i, 'AVG NUMBER OF WORDS PER SENTENCE'] = avg_words_per_sentence
        data.at[i, 'COMPLEX WORD COUNT'] = len(complex_words)

        # Count of cleaned text (without stop words):
        filtered_words = [word for word in text_words if word not in stop_words]
        data.at[i, 'WORD COUNT'] = len(filtered_words)

In [54]:
def avg_word_length(words):
    count_char = 0
    for word in words:
        count_char+= len(word)
    return count_char/len(words)

In [55]:
def count_syllables(words):
    count = 0
    for word in words:
        count += max(1, textstat.syllable_count(word))
    return count


In [56]:
# Metrics for 11-13:
def metrics_11_13(data):
    for i in range(len(data)):
        article_text = data.iloc[i]['content']
        text_tokens = word_tokenize(article_text)
        text_words = [word for word in text_tokens if word.strip() and word.isalpha()]
        syllables_in_text = count_syllables(text_words) 

        # Stores the average number of syllables per words
        data.at[i, 'SYLLABLE PER WORD'] = syllables_in_text/len(text_words)


        pronouns = ['I', 'we', 'my', 'ours', 'us']
        pronoun_count = sum(1 for word in text_words if word in pronouns)
        data.at[i, 'PERSONAL PRONOUNS'] = pronoun_count

        # Using the function for avg-word length:
        data.at[i, 'AVG WORD LENGTH'] = avg_word_length(text_words)
        

In [58]:
metrics_1_4(data = data)
metrics_5_10(data = data)
metrics_11_13(data= data)


In [59]:
data

Unnamed: 0,URL_ID,URL,content,POSITIVE_SCORE,NEGATIVE_SCORE,POLARITY_SCORE,SUBJECTIVITY_SCORE,AVG_SENTENCE_LENGTH,PERCENTAGE_COMPLEX_WORDS,FOG_INDEX,AVG_WORDS_PER_SENTENCE,COMPLEX_WORD_COUNT,WORD_COUNT,SYLLABLE_COUNT_PER_WORD,PERSONAL_PRONOUNS,AVERAGE_WORD_LENGTH
0,Netclan20241017,https://insights.blackcoffer.com/ai-and-ml-bas...,ROAS Dashboard for Campaign-Wise Google Ads Bu...,9.0,-7.0,7.999996,0.003289,101.416667,0.435855,40.741009,14.416667,265.0,405.0,1.730250,1.0,5.308285
1,Netclan20241018,https://insights.blackcoffer.com/enhancing-fro...,Analyzing the Impact of Positive Emotions and ...,11.0,-3.0,1.750000,0.014787,142.115385,0.526802,57.056875,16.769231,285.0,339.0,2.174312,2.0,6.442661
2,Netclan20241019,https://insights.blackcoffer.com/roas-dashboar...,Enhancing Front-End Features and Functionality...,12.0,-7.0,3.799999,0.004950,125.340909,0.350495,50.276562,19.318182,354.0,496.0,1.623529,7.0,4.855294
3,Netclan20241020,https://insights.blackcoffer.com/efficient-pro...,Google Local Service Ads Missed Calls and Mess...,0.0,0.0,0.000000,0.000000,90.000000,0.538462,36.215385,12.000000,7.0,11.0,1.916667,0.0,6.333333
4,Netclan20241021,https://insights.blackcoffer.com/development-o...,Splitting of Songs into its Vocals and Instrum...,0.0,0.0,0.000000,0.000000,75.000000,0.545455,30.218182,10.000000,6.0,6.0,1.900000,0.0,6.300000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,Netclan20241159,https://insights.blackcoffer.com/population-an...,Traction Dashboards of Marketing Campaigns and...,2.0,-1.0,2.999997,0.002519,123.200000,0.390428,49.436171,17.650000,155.0,238.0,1.654391,5.0,5.235127
143,Netclan20241160,https://insights.blackcoffer.com/google-lsa-ap...,Google LSA API Data Automation and Dashboardin...,0.0,0.0,0.000000,0.000000,71.000000,0.700000,28.680000,9.000000,7.0,8.0,2.333333,0.0,6.666667
144,Netclan20241161,https://insights.blackcoffer.com/healthcare-da...,Healthcare Data Analysis - Blackcoffer Insight...,10.0,-9.0,18.999981,0.002041,95.482759,0.402041,38.353920,14.758621,197.0,272.0,1.621495,13.0,4.843458
145,Netclan20241162,https://insights.blackcoffer.com/budget-sales-...,Google Local Service Ads (LSA) Data Warehouse ...,4.0,0.0,1.000000,0.013841,181.900000,0.460208,72.944083,25.700000,133.0,196.0,1.797665,0.0,5.494163


In [42]:
data.head()

Unnamed: 0,URL_ID,URL,content,POSITIVE_SCORE,NEGATIVE_SCORE,POLARITY_SCORE,SUBJECTIVITY_SCORE,AVG_SENTENCE_LENGTH,PERCENTAGE_COMPLEX_WORDS,FOG_INDEX,AVG_WORDS_PER_SENTENCE,COMPLEX_WORD_COUNT,WORD_COUNT
0,Netclan20241017,https://insights.blackcoffer.com/ai-and-ml-bas...,ROAS Dashboard for Campaign-Wise Google Ads Bu...,9.0,-7.0,7.999996,0.003289,101.416667,0.435855,40.741009,14.416667,265.0,405.0
1,Netclan20241018,https://insights.blackcoffer.com/enhancing-fro...,Analyzing the Impact of Positive Emotions and ...,11.0,-3.0,1.75,0.014787,142.115385,0.526802,57.056875,16.769231,285.0,339.0
2,Netclan20241019,https://insights.blackcoffer.com/roas-dashboar...,Enhancing Front-End Features and Functionality...,12.0,-7.0,3.799999,0.00495,125.340909,0.350495,50.276562,19.318182,354.0,496.0
3,Netclan20241020,https://insights.blackcoffer.com/efficient-pro...,Google Local Service Ads Missed Calls and Mess...,0.0,0.0,0.0,0.0,90.0,0.538462,36.215385,12.0,7.0,11.0
4,Netclan20241021,https://insights.blackcoffer.com/development-o...,Splitting of Songs into its Vocals and Instrum...,0.0,0.0,0.0,0.0,75.0,0.545455,30.218182,10.0,6.0,6.0


In [57]:
df = pd.DataFrame(columns = ['POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 
                             'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 
                             'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX', 
                             ' AVG NUMBER OF WORDS PER SENTENCE',
                              'COMPLEX WORD COUNT', 'WORD COUNT',
                              'SYLLABLE PER WORD', 'PERSONAL PRONOUNS',
                              'AVG WORD LENGTH'])