<a href="https://www.kaggle.com/code/anandratna/blackoffer-assignment?scriptVersionId=229793811" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import re
import requests
import pandas as pd
import nltk
import textstat
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from google.colab import files

# Download NLTK resources
nltk.download('punkt')
# Recent NLTK releases load the tokenizer data from 'punkt_tab' instead of 'punkt'
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load stopwords
stop_words = set(stopwords.words('english'))


def _load_word_set(path):
    """Return the whitespace-separated tokens of the file at *path* as a set."""
    # The context manager closes the file handle as soon as it has been read.
    with open(path, encoding="ISO-8859-1") as handle:
        return set(handle.read().split())


# Load positive and negative word lists
pos_words = _load_word_set('/kaggle/input/pwnw-word/positive-words.txt')
neg_words = _load_word_set('/kaggle/input/pwnw-word/negative-words.txt')

# File paths
input_file = "/kaggle/input/ipop-file/Input.xlsx"
# DataFrame.to_excel picks its writer from the file extension, so this must be
# a file name ending in .xlsx rather than the bare working directory.
output_file = "/kaggle/working/Output.xlsx"

def extract_text(url):
    """Download the page at *url* and return its title and paragraph text.

    The result is the text of the ``<title>`` element (empty if the page has
    none), a newline, and the text of every ``<p>`` element joined by single
    spaces.

    This is a best-effort helper: any failure (network error, timeout, HTTP
    error status, parsing problem) is reported with ``print`` and an empty
    string is returned instead of raising.
    """
    try:
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx replies as failures so that an error page is never
        # analysed as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        title = soup.title.text if soup.title else ""
        article_text = ' '.join(p.text for p in soup.find_all('p'))
        return title + "\n" + article_text
    except Exception as e:
        print(f"Error extracting {url}: {e}")
        return ""

# Personal pronouns counted by compute_text_metrics (matched case-insensitively).
_PERSONAL_PRONOUN_RE = re.compile(r"\b(I|we|my|ours|us)\b", re.I)


def compute_text_metrics(text):
    """Compute sentiment and readability metrics for *text*.

    Returns a list of 13 values in this order: positive score, negative
    score, polarity score, subjectivity score, average sentence length,
    fraction of complex words, fog index, average words per sentence,
    complex word count, cleaned word count, syllables per word, personal
    pronoun count, average word length.

    "Words" are the lower-cased tokens that contain at least one alphanumeric
    character. "Cleaned words" are the purely alphanumeric words that are not
    stopwords; the sentiment scores and the word count use those. A word is
    "complex" when it has more than two syllables.
    """
    tokens = word_tokenize(text.lower())
    sentences = sent_tokenize(text)

    # word_tokenize emits punctuation marks as separate tokens; they must not
    # be counted as words in the length and readability figures.
    words = [token for token in tokens if any(ch.isalnum() for ch in token)]
    words_cleaned = [word for word in words if word.isalnum() and word not in stop_words]

    # Guard the denominators so empty input yields zeros instead of raising.
    total_words = max(len(words), 1)
    total_sentences = max(len(sentences), 1)

    pos_score = sum(1 for word in words_cleaned if word in pos_words)
    neg_score = sum(1 for word in words_cleaned if word in neg_words)
    # The small constant keeps the ratios defined when a denominator is zero.
    polarity_score = (pos_score - neg_score) / ((pos_score + neg_score) + 0.000001)
    subjectivity_score = (pos_score + neg_score) / (len(words_cleaned) + 0.000001)

    # Count syllables once per word and reuse the result below.
    syllable_counts = [textstat.syllable_count(word) for word in words]
    complex_word_count = sum(1 for count in syllable_counts if count > 2)

    avg_sentence_length = len(words) / total_sentences
    percentage_complex_words = complex_word_count / total_words
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_words_per_sentence = len(words) / total_sentences
    word_count = len(words_cleaned)
    syllables_per_word = sum(syllable_counts) / total_words

    # "US" in all capitals is read as the country abbreviation, not the pronoun.
    personal_pronouns = sum(
        1 for match in _PERSONAL_PRONOUN_RE.finditer(text) if match.group() != "US"
    )
    avg_word_length = sum(len(word) for word in words) / total_words

    return [
        pos_score, neg_score, polarity_score, subjectivity_score, avg_sentence_length,
        percentage_complex_words, fog_index, avg_words_per_sentence, complex_word_count,
        word_count, syllables_per_word, personal_pronouns, avg_word_length
    ]

# Load the spreadsheet listing the articles to analyse
df = pd.read_excel(input_file)

# Fetch and score every article, collecting one output row per input row
data_output = []
for _, record in df.iterrows():
    url_id = record['URL_ID']
    url = record['URL']
    print(f"Processing {url_id}: {url}")

    text = extract_text(url)
    if text:
        metrics = compute_text_metrics(text)
    else:
        # Extraction failed: fill all 13 metric columns with zeros
        metrics = [0] * 13
    data_output.append([url_id, url, *metrics])

# Assemble the result table and write it to the output workbook.
# Column order mirrors the row layout: identifiers first, then the 13 metrics.
id_columns = ["URL_ID", "URL"]
metric_columns = [
    "POSITIVE SCORE",
    "NEGATIVE SCORE",
    "POLARITY SCORE",
    "SUBJECTIVITY SCORE",
    "AVG SENTENCE LENGTH",
    "PERCENTAGE OF COMPLEX WORDS",
    "FOG INDEX",
    "AVG NUMBER OF WORDS PER SENTENCE",
    "COMPLEX WORD COUNT",
    "WORD COUNT",
    "SYLLABLE PER WORD",
    "PERSONAL PRONOUNS",
    "AVG WORD LENGTH",
]
columns = id_columns + metric_columns
out_df = pd.DataFrame(data_output, columns=columns)
out_df.to_excel(output_file, index=False)