In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Load the input workbook; the previews below show it carries URL_ID and URL columns.
data = pd.read_excel('Input.xlsx')

In [3]:
# Keep only the first 100 rows (positional slice).
data = data.iloc[:100]

In [4]:
# Spot-check one identifier: row label 1 of the URL_ID column.
data.loc[1, "URL_ID"]

'blackassign0002'

In [5]:
# Preview the last rows to confirm where the 100-row slice ends.
data.tail()

Unnamed: 0,URL_ID,URL
95,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...
96,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...
97,blackassign0098,https://insights.blackcoffer.com/contribution-...
98,blackassign0099,https://insights.blackcoffer.com/how-covid-19-...
99,blackassign0100,https://insights.blackcoffer.com/how-will-covi...


In [6]:
# Number of non-missing URLs in the slice.
data['URL'].notna().sum()

100

In [14]:
# Plain Python list of the URLs to scrape, with missing values removed.
urls = list(data['URL'].dropna())

In [15]:
# Show the first URL as a sanity check on the list contents.
urls[0]

'https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/'

In [16]:
from newspaper import Article

In [17]:
import csv

In [18]:
def scrape_article(url):
    """Download and parse one article, returning ``(title, text)``.

    Any exception raised while building, downloading or parsing the article is
    reported on stdout and converted to ``(None, None)``, so a single bad URL
    does not abort the whole scraping run.
    """
    result = (None, None)
    try:
        page = Article(url)
        page.download()
        page.parse()
        result = (page.title, page.text)
    except Exception as e:
        print(f"Error scraping {url}: {e}")
    return result

In [19]:
# Collect one record per URL that yielded both a title and a body;
# failed or empty scrapes are left out of the result.
scraped_data = []

for url in urls:
    title, text = scrape_article(url)
    if not (title and text):
        continue
    record = {'url': url, 'title': title, 'text': text}
    scraped_data.append(record)

Error scraping https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/: Article `download()` failed with 404 Client Error: Not Found for url: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/ on URL https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
Error scraping https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/: Article `download()` failed with 404 Client Error: Not Found for url: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/ on URL https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/


In [22]:
# Persist the scraped records so the analysis cells can start from the CSV.
fieldnames = ['url', 'title', 'text']
with open('scraped_articles.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    # Write every record in one call. A `for data in scraped_data` loop here
    # would rebind the name `data`, replacing the input DataFrame loaded at
    # the top of the notebook with the last scraped dict.
    writer.writerows(scraped_data)

print("Scraping completed and data saved to scraped_articles.csv")

Scraping completed and data saved to scraped_articles.csv


In [23]:
# Reload the scraped articles from disk (written with url, title and text columns).
df = pd.read_csv("scraped_articles.csv")

In [25]:
# Re-read the input workbook as the source of URL_ID values for the scraped rows.
x = pd.read_excel('Input.xlsx')

In [26]:
# Attach each article's URL_ID by matching on the URL itself.
# `df` only contains URLs that scraped successfully, so it has fewer rows than
# `x`; assigning x["URL_ID"] by row position would give every article after a
# failed scrape the ID of a different URL.
url_to_id = x.drop_duplicates(subset="URL").set_index("URL")["URL_ID"]
df["URL_ID"] = df["url"].map(url_to_id)

In [27]:
# Preview the scraped articles together with their URL_ID column.
df.head()

Unnamed: 0,url,title,text,URL_ID
0,https://insights.blackcoffer.com/rising-it-cit...,Rising IT cities and its impact on the economy...,We have seen a huge development and dependence...,blackassign0001
1,https://insights.blackcoffer.com/rising-it-cit...,Rising IT Cities and Their Impact on the Econo...,"Throughout history, from the industrial revolu...",blackassign0002
2,https://insights.blackcoffer.com/internet-dema...,"Internet Demand’s Evolution, Communication Imp...",Introduction\n\nIn the span of just a few deca...,blackassign0003
3,https://insights.blackcoffer.com/rise-of-cyber...,Rise of Cybercrime and its Effect in upcoming ...,"The way we live, work, and communicate has unq...",blackassign0004
4,https://insights.blackcoffer.com/ott-platform-...,OTT platform and its impact on the entertainme...,The year 2040 is poised to witness a continued...,blackassign0005


In [None]:
# Inspect the full text of the first scraped article.
df.loc[0, 'text']

In [28]:
import pandas as pd
import re
from textblob import TextBlob
import textstat
import nltk
from collections import Counter
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [30]:
stopwords_files = [
    'StopWords/StopWords_Auditor.txt',
    'StopWords/StopWords_Currencies.txt',
    'StopWords/StopWords_DatesandNumbers.txt',
    'StopWords/StopWords_Generic.txt',
    'StopWords/StopWords_GenericLong.txt',
    'StopWords/StopWords_Geographic.txt',
    'StopWords/StopWords_Names.txt',
]

# Merge every stop-word list into one lower-cased set.
custom_stopwords = set()
for stopword_path in stopwords_files:
    # NOTE(review): files are read with the platform default encoding, as
    # before; confirm the lists' actual encoding before pinning one.
    with open(stopword_path, 'r') as f:
        for line in f:
            # NOTE(review): some lists appear to annotate entries as
            # "WORD | description" - confirm against the files. Such a line,
            # stored whole, can never equal a single token, so keep only the
            # part before the pipe. Lines without a pipe are unaffected.
            entry = line.split('|', 1)[0].strip().lower()
            if entry:  # skip blank lines
                custom_stopwords.add(entry)
            


In [31]:
# Load the sentiment dictionaries as sets of whitespace-separated words.
# Context managers make sure both file handles are closed after reading.
with open('MasterDictionary/positive-words.txt') as positive_file:
    positive_words = set(positive_file.read().split())
with open('MasterDictionary/negative-words.txt') as negative_file:
    negative_words = set(negative_file.read().split())

In [32]:
# Template workbook; its columns define the column order of the final output.
output_structure = pd.read_excel("Output Data Structure.xlsx")

In [33]:
def clean_text(text):
    """Return ``text`` lower-cased with every newline replaced by a space."""
    lowered = text.lower()
    return lowered.replace('\n', ' ')

In [34]:
def analyze_text(row):
    """Compute sentiment and readability metrics for one scraped article.

    Parameters
    ----------
    row : mapping (for example a DataFrame row)
        Must provide the keys 'text', 'url' and 'URL_ID'.

    Returns
    -------
    dict
        One entry per output column (URL_ID, URL, the sentiment scores and
        the readability statistics).
    """
    raw_text = row['text']
    cleaned_text = clean_text(raw_text)
    blob = TextBlob(cleaned_text)

    sentences = nltk.sent_tokenize(raw_text)
    sentence_count = len(sentences)

    # word_tokenize returns punctuation marks as separate tokens. Keep only
    # tokens containing at least one letter or digit, so that commas, full
    # stops, quotes etc. are not counted as words in any metric below.
    tokens = [
        token for token in nltk.word_tokenize(raw_text)
        if any(ch.isalnum() for ch in token)
    ]
    filtered_words = [word for word in tokens if word.lower() not in custom_stopwords]
    word_count = len(filtered_words)

    # The stop-word filter is case-insensitive; make the dictionary lookup
    # case-insensitive too, so capitalised words (e.g. at the start of a
    # sentence) are scored. The exact-case check is kept as well, so nothing
    # that matched before is lost.
    def _in_dictionary(word, dictionary):
        return word in dictionary or word.lower() in dictionary

    positive_score = sum(1 for word in filtered_words if _in_dictionary(word, positive_words))
    negative_score = sum(1 for word in filtered_words if _in_dictionary(word, negative_words))
    polarity_score = blob.sentiment.polarity
    subjectivity_score = blob.sentiment.subjectivity

    # Average sentence length and words-per-sentence use the same formula.
    avg_sentence_length = word_count / sentence_count if sentence_count else 0
    avg_words_per_sentence = avg_sentence_length

    # A word is treated as complex when it has more than two syllables.
    syllables_per_word = [count_syllables(word) for word in filtered_words]
    complex_word_count = sum(1 for syllables in syllables_per_word if syllables > 2)
    complex_word_percentage = (complex_word_count / word_count) * 100 if word_count else 0

    fog_index = 0.4 * (avg_sentence_length + complex_word_percentage) if avg_sentence_length else 0

    syllable_count = sum(syllables_per_word)
    personal_pronoun_count = count_personal_pronouns(cleaned_text)
    avg_word_length = sum(len(word) for word in filtered_words) / word_count if word_count else 0

    # Return a dictionary matching the output structure
    return {
        'URL_ID' : row['URL_ID'],
        'URL': row['url'],
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': complex_word_percentage,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_words_per_sentence,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllable_count / word_count if word_count else 0,
        'PERSONAL PRONOUNS': personal_pronoun_count,
        'AVG WORD LENGTH': avg_word_length
    }


In [35]:
def count_syllables(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Each run of consecutive vowels (a, e, i, o, u, y) counts as one syllable,
    a trailing 'e' removes one, and any non-empty word has at least one.

    Parameters
    ----------
    word : str
        The token to measure; case is ignored.

    Returns
    -------
    int
        The estimated syllable count, or 0 for an empty string (indexing the
        first character of an empty string would otherwise raise IndexError).
    """
    vowels = 'aeiouy'
    word = word.lower()
    if not word:
        return 0

    syllables = 0
    previous_was_vowel = False
    for ch in word:
        is_vowel = ch in vowels
        # Count only the first vowel of each vowel group.
        if is_vowel and not previous_was_vowel:
            syllables += 1
        previous_was_vowel = is_vowel

    if word.endswith('e'):
        syllables -= 1
    # Every non-empty word counts as at least one syllable.
    return max(syllables, 1)

In [36]:
def count_personal_pronouns(text):
    """Count occurrences of the personal pronouns I, we, my, ours and us.

    The text is split on whitespace and surrounding punctuation is stripped
    from each token, so "us." or "(we" are counted. Matching ignores case,
    except that the all-caps token "US" is skipped because it denotes the
    country rather than the pronoun (this only helps when the caller passes
    text whose original casing is intact).

    Parameters
    ----------
    text : str
        The text to scan.

    Returns
    -------
    int
        Number of pronoun tokens found.
    """
    import string  # local import: `string` is not imported at notebook level

    pronouns = {'i', 'we', 'my', 'ours', 'us'}
    strip_chars = string.punctuation + '“”‘’'

    count = 0
    for token in text.split():
        core = token.strip(strip_chars)
        if core == 'US':
            continue
        if core.lower() in pronouns:
            count += 1
    return count

In [37]:
# Analyse every article; each row produces one dict of metrics.
output_data = df.apply(analyze_text, axis=1)

# Expand the per-row dicts into a DataFrame.
output_df = pd.DataFrame(list(output_data))

# Select the columns in exactly the order given by the output template.
output_df = output_df.loc[:, output_structure.columns]

# Write the result to a new Excel workbook, without the index column.
output_df.to_excel('analyzed_output.xlsx', index=False)

print("Textual analysis completed and data saved to analyzed_output.xlsx")

Textual analysis completed and data saved to analyzed_output.xlsx
