##  Objective
The objective of this assignment is to extract textual data from the given URLs, perform sentiment and readability analysis, and produce the required output as per the given structure.

In [1]:
import os
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk import sent_tokenize

# Download NLTK punkt tokenizer (needed for sentence splitting)
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# os.chdir("BLACKCOFFER")   # REMOVE or COMMENT OUT
print("Current working directory:", os.getcwd())
print("Files available:", os.listdir())


Current working directory: C:\Users\Acer\BLACKCOFFER
Files available: ['.ipynb_checkpoints', 'Input.xlsx', 'negative-words.txt', 'Objective.docx', 'Output Data Structure.xlsx', 'positive-words.txt', 'StopWords_Auditor.txt', 'StopWords_Currencies.txt', 'StopWords_DatesandNumbers.txt', 'StopWords_Generic.txt', 'StopWords_GenericLong.txt', 'StopWords_Geographic.txt', 'StopWords_Names.txt', 'Text Analysis.docx', 'Untitled.ipynb']


In [5]:
stopwords = set()

for file in os.listdir():
    if file.startswith("StopWords") and file.endswith(".txt"):
        with open(file, "r", encoding="latin-1") as f:
            for word in f:
                stopwords.add(word.strip().lower())

print("✅ Total StopWords Loaded:", len(stopwords))



✅ Total StopWords Loaded: 12768


In [6]:
with open("positive-words.txt", "r", encoding="latin-1") as f:
    positive_words = [line.strip().lower() for line in f if line.strip()]

with open("negative-words.txt", "r", encoding="latin-1") as f:
    negative_words = [line.strip().lower() for line in f if line.strip()]

print("✅ Positive Words:", len(positive_words))
print("✅ Negative Words:", len(negative_words))


✅ Positive Words: 2006
✅ Negative Words: 4783


In [7]:
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', str(text).lower())
    words = [w for w in text.split() if w not in stopwords]
    return words

def syllable_count(word):
    vowels = "aeiou"
    word = word.lower()
    count = 0
    if word and word[0] in vowels:
        count += 1
    for i in range(1, len(word)):
        if word[i] in vowels and word[i-1] not in vowels:
            count += 1
    if word.endswith(("es","ed")):
        count -= 1
    return max(1, count)

def extract_article(url):
    try:
        page = requests.get(url, timeout=10)
        soup = BeautifulSoup(page.content, "html.parser")
        paragraphs = [p.get_text() for p in soup.find_all("p")]
        return " ".join(paragraphs)
    except:
        return ""

def analyze_text(text):
    sentences = sent_tokenize(text)
    words = clean_text(text)

    pos_score = sum(1 for w in words if w in positive_words)
    neg_score = sum(1 for w in words if w in negative_words)

    polarity = (pos_score - neg_score) / ((pos_score + neg_score) + 1e-6)
    subjectivity = (pos_score + neg_score) / (len(words) + 1e-6)

    word_count = len(words)
    sentence_count = len(sentences) if sentences else 1
    avg_sentence_length = word_count / sentence_count

    complex_words = [w for w in words if syllable_count(w) > 2]
    complex_count = len(complex_words)
    pct_complex = complex_count / (word_count + 1e-6)
    fog_index = 0.4 * (avg_sentence_length + pct_complex)

    syllables_per_word = sum(syllable_count(w) for w in words) / (word_count + 1e-6)
    avg_word_len = sum(len(w) for w in words) / (word_count + 1e-6)
    pronouns = len(re.findall(r"\b(I|we|my|ours|us)\b", text, re.I))

    return {
        "POSITIVE SCORE": pos_score,
        "NEGATIVE SCORE": neg_score,
        "POLARITY SCORE": polarity,
        "SUBJECTIVITY SCORE": subjectivity,
        "AVG SENTENCE LENGTH": avg_sentence_length,
        "PERCENTAGE OF COMPLEX WORDS": pct_complex,
        "FOG INDEX": fog_index,
        "AVG NUMBER OF WORDS PER SENTENCE": avg_sentence_length,
        "COMPLEX WORD COUNT": complex_count,
        "WORD COUNT": word_count,
        "SYLLABLE PER WORD": syllables_per_word,
        "PERSONAL PRONOUNS": pronouns,
        "AVG WORD LENGTH": avg_word_len
    }


In [8]:
df = pd.read_excel("Input.xlsx")
print("✅ Input File Loaded:", df.shape[0], "rows")
df.head()


✅ Input File Loaded: 147 rows


Unnamed: 0,URL_ID,URL
0,Netclan20241017,https://insights.blackcoffer.com/ai-and-ml-bas...
1,Netclan20241018,https://insights.blackcoffer.com/enhancing-fro...
2,Netclan20241019,https://insights.blackcoffer.com/roas-dashboar...
3,Netclan20241020,https://insights.blackcoffer.com/efficient-pro...
4,Netclan20241021,https://insights.blackcoffer.com/development-o...


In [10]:
import nltk
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [11]:
results = []

for idx, row in df.iterrows():
    print(f"Processing {idx+1}/{len(df)}: {row['URL']}")
    text = extract_article(row["URL"])
    metrics = analyze_text(text)
    metrics["URL_ID"] = row["URL_ID"]
    metrics["URL"] = row["URL"]
    results.append(metrics)

results_df = pd.DataFrame(results)
results_df.head()


Processing 1/147: https://insights.blackcoffer.com/ai-and-ml-based-youtube-analytics-and-content-creation-tool-for-optimizing-subscriber-engagement-and-content-strategy/
Processing 2/147: https://insights.blackcoffer.com/enhancing-front-end-features-and-functionality-for-improved-user-experience-and-dashboard-accuracy-in-partner-hospital-application/
Processing 3/147: https://insights.blackcoffer.com/roas-dashboard-for-campaign-wise-google-ads-budget-tracking-using-google-ads-ap/
Processing 4/147: https://insights.blackcoffer.com/efficient-processing-and-analysis-of-financial-data-from-pdf-files-addressing-formatting-inconsistencies-and-ensuring-data-integrity-for-a-toyota-dealership-management-firm/
Processing 5/147: https://insights.blackcoffer.com/development-of-ea-robot-for-automated-trading/
Processing 6/147: https://insights.blackcoffer.com/ai-and-ml-based-youtube-analytics-and-content-creation-tool-for-optimizing-subscriber-engagement-and-content-strategy/
Processing 7/147: http

Unnamed: 0,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH,URL_ID,URL
0,9,0,1.0,0.039823,25.111111,0.517699,10.251524,25.111111,117,226,2.685841,3,8.230088,Netclan20241017,https://insights.blackcoffer.com/ai-and-ml-bas...
1,12,7,0.263158,0.039256,12.410256,0.402893,5.12526,12.410256,195,484,2.429752,9,7.483471,Netclan20241018,https://insights.blackcoffer.com/enhancing-fro...
2,14,2,0.75,0.060837,23.909091,0.452471,9.744625,23.909091,119,263,2.604563,3,7.912547,Netclan20241019,https://insights.blackcoffer.com/roas-dashboar...
3,26,10,0.444444,0.075789,15.833333,0.549474,6.553123,15.833333,261,475,2.692632,6,8.16,Netclan20241020,https://insights.blackcoffer.com/efficient-pro...
4,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,Netclan20241021,https://insights.blackcoffer.com/development-o...


In [12]:
output_template = pd.read_excel("Output Data Structure.xlsx")

for col in results_df.columns:
    if col in output_template.columns:
        output_template[col] = results_df[col]

output_template.to_excel("Final_Output.xlsx", index=False)
print("✅ Final Output Saved: Final_Output.xlsx")


✅ Final Output Saved: Final_Output.xlsx


In [13]:
# Create text files for each article using the same extract_article function

for idx, row in df.iterrows():
    url_id = row["URL_ID"]
    url = row["URL"]
    print(f"Saving article for URL_ID {url_id}")
    
    text = extract_article(url)
    
    # Save the article into a text file named <URL_ID>.txt
    filename = f"{url_id}.txt"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(text if text else "NA")

print("✅ All articles saved as <URL_ID>.txt files")


Saving article for URL_ID Netclan20241017
Saving article for URL_ID Netclan20241018
Saving article for URL_ID Netclan20241019
Saving article for URL_ID Netclan20241020
Saving article for URL_ID Netclan20241021
Saving article for URL_ID Netclan20241022
Saving article for URL_ID Netclan20241023
Saving article for URL_ID Netclan20241024
Saving article for URL_ID Netclan20241025
Saving article for URL_ID Netclan20241026
Saving article for URL_ID Netclan20241027
Saving article for URL_ID Netclan20241028
Saving article for URL_ID Netclan20241029
Saving article for URL_ID Netclan20241030
Saving article for URL_ID Netclan20241031
Saving article for URL_ID Netclan20241032
Saving article for URL_ID Netclan20241033
Saving article for URL_ID Netclan20241034
Saving article for URL_ID Netclan20241035
Saving article for URL_ID Netclan20241036
Saving article for URL_ID Netclan20241037
Saving article for URL_ID Netclan20241038
Saving article for URL_ID Netclan20241039
Saving article for URL_ID Netclan2

In [14]:
import zipfile

# Name of the zip file
zip_filename = "Articles.zip"

with zipfile.ZipFile(zip_filename, "w") as zipf:
    for idx, row in df.iterrows():
        file_name = f"{row['URL_ID']}.txt"
        if os.path.exists(file_name):  # only add if the file exists
            zipf.write(file_name)

print(f"✅ All article text files zipped into {zip_filename}")


✅ All article text files zipped into Articles.zip
