In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

def fetch_article(url, timeout=30):
    """Download one article page and extract its title and body text.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the whole scraping loop.

    Returns:
        ``(title, article_text)`` on success, where ``title`` is the text of
        the first ``<h1>`` and ``article_text`` holds one line per ``<p>`` and
        per ``<li>`` found inside the ``td-post-content`` div.
        ``(None, None)`` if the page cannot be fetched or does not have the
        expected structure; the reason is printed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Fail with a descriptive message instead of an opaque
        # "'NoneType' object has no attribute ..." when the layout differs.
        heading = soup.find('h1')
        if heading is None:
            raise ValueError("no <h1> title element found")

        content_div = soup.find('div', class_=lambda x: x and 'td-post-content' in x.split())
        if content_div is None:
            raise ValueError("no 'td-post-content' div found")

        title = heading.get_text()

        # NOTE(review): find_all() searches recursively, so a <p> nested in
        # an <li>, or a list nested in another list, is emitted more than
        # once. Confirm whether the target pages contain such nesting.
        lines = []
        for node in content_div.find_all(['p', 'ol', 'ul']):
            if node.name == 'p':
                lines.append(node.get_text())
            elif node.name in ['ol', 'ul']:
                lines.extend(li.get_text() for li in node.find_all('li'))

        # Every extracted fragment is terminated by a newline.
        article_text = ''.join(line + '\n' for line in lines)

        return title, article_text

    # Best effort: report the failure and let the caller move on to the next URL.
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None, None

def main(input_file='/content/drive/MyDrive/Blackcoffer/Input.csv', output_dir='articles'):
    """Scrape every URL listed in the input CSV and save each article as text.

    Args:
        input_file: CSV file with at least the columns ``URL_ID`` and ``URL``.
        output_dir: Directory that receives one ``<URL_ID>.txt`` file per
            successfully fetched article. Created if it does not exist.

    Rows without a URL are skipped with a message. Rows whose article could
    not be fetched (``fetch_article`` returned an empty title or text)
    produce no file.
    """
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(output_dir, exist_ok=True)

    df = pd.read_csv(input_file)

    for index, row in df.iterrows():
        url_id = row['URL_ID']
        url = row['URL']

        if pd.isna(url):
            print(f"Skipping URL_ID {url_id} due to missing URL")
            continue

        title, article_text = fetch_article(url)
        if not (title and article_text):
            continue

        # First line is the title, followed by the body text.
        article_path = os.path.join(output_dir, f"{url_id}.txt")
        with open(article_path, 'w', encoding='utf-8') as file:
            file.write(title + '\n' + article_text)

# Run the scraper only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


Error fetching https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
Error fetching https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/


In [None]:
import os
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob import TextBlob

# Ensure nltk resources are downloaded
import nltk
# Fetch each NLTK data package this cell relies on (no re-download if current).
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

def load_stop_words(stopwords_dir):
    """Build the stop-word set used to filter tokens.

    Starts from NLTK's English stop words and adds one entry per line from
    every ``.txt`` file in ``stopwords_dir``.

    Entries are normalised because tokens are lower-cased before they are
    compared against this set: each entry is stripped of surrounding
    whitespace and lower-cased, and blank lines are ignored. Anything after
    a ``|`` on a line is dropped; such an entry could never equal a token
    anyway, since tokens contain no punctuation.
    (Presumably the lists use ``WORD | comment`` lines -- verify against the
    actual files.)

    Args:
        stopwords_dir: Directory containing the stop-word ``.txt`` files.

    Returns:
        A set of lower-case stop words.
    """
    stop_words = set(stopwords.words('english'))
    for filename in os.listdir(stopwords_dir):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(stopwords_dir, filename)
        # latin-1 maps every byte to a character, so decoding cannot fail.
        with open(filepath, 'r', encoding='latin-1') as file:
            for line in file.read().splitlines():
                word = line.split('|', 1)[0].strip().lower()
                if word:
                    stop_words.add(word)
    return stop_words

def _read_word_set(path):
    """Return the whitespace-separated words of the latin-1 text file at ``path`` as a set."""
    with open(path, 'r', encoding='latin-1') as handle:
        return set(handle.read().split())


# Stop words shared by the tokenizer and the sentiment dictionaries below.
stop_words = load_stop_words('/content/drive/MyDrive/Blackcoffer/StopWords')

# Sentiment dictionaries, with every word that is also a stop word removed.
positive_words = _read_word_set('/content/drive/MyDrive/Blackcoffer/MasterDictionary/positive-words.txt') - stop_words
negative_words = _read_word_set('/content/drive/MyDrive/Blackcoffer/MasterDictionary/negative-words.txt') - stop_words

def clean_tokenize_text(text):
    """Turn raw text into a list of lower-case tokens without stop words.

    Every character that is neither a word character nor whitespace is
    deleted first, then the lower-cased text is tokenized and tokens found
    in the module-level ``stop_words`` set are dropped.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    lowered_tokens = word_tokenize(without_punctuation.lower())
    return list(filter(lambda token: token not in stop_words, lowered_tokens))

def analyze_text(text):
    """Compute sentiment and readability metrics for one article.

    Tokens come from ``clean_tokenize_text`` (lower-cased, punctuation and
    stop words removed); sentences come from ``sent_tokenize`` on the raw
    text. The number of vowel letters (``aeiouy``) in a token is used as its
    syllable count, and a token with more than two is counted as complex.

    Text that yields no tokens or no sentences produces 0 for the affected
    averages and percentages instead of raising ``ZeroDivisionError``.

    Args:
        text: Raw article text.

    Returns:
        dict mapping each output column name (``"POSITIVE SCORE"``,
        ``"FOG INDEX"``, ...) to its value for this text.
    """
    tokens = clean_tokenize_text(text)
    sentences = sent_tokenize(text)
    word_count = len(tokens)

    def ratio(numerator, denominator):
        # An empty denominator means there is nothing to average over.
        return numerator / denominator if denominator else 0.0

    positive_score = sum(1 for word in tokens if word in positive_words)
    negative_score = sum(1 for word in tokens if word in negative_words)

    # The small constant keeps both scores defined when a count is zero.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    # Vowel count per token, computed once and reused below.
    vowel_counts = [len(re.findall(r'[aeiouy]', word)) for word in tokens]

    avg_sentence_length = ratio(word_count, len(sentences))
    complex_word_count = sum(1 for count in vowel_counts if count > 2)
    percentage_complex_words = ratio(complex_word_count, word_count) * 100
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Same quantity as the average sentence length; reported under both names.
    avg_words_per_sentence = avg_sentence_length
    syllable_per_word = ratio(sum(vowel_counts), word_count)

    # Case-insensitive match, but upper-case "US" is skipped: it is far more
    # likely the country abbreviation than the pronoun "us".
    pronoun_hits = re.findall(r'\b(?:I|we|my|ours|us)\b', text, re.I)
    personal_pronouns = sum(1 for hit in pronoun_hits if hit != 'US')

    avg_word_length = ratio(sum(len(word) for word in tokens), word_count)

    return {
        "POSITIVE SCORE": positive_score,
        "NEGATIVE SCORE": negative_score,
        "POLARITY SCORE": polarity_score,
        "SUBJECTIVITY SCORE": subjectivity_score,
        "AVG SENTENCE LENGTH": avg_sentence_length,
        "PERCENTAGE OF COMPLEX WORDS": percentage_complex_words,
        "FOG INDEX": fog_index,
        "AVG NUMBER OF WORDS PER SENTENCE": avg_words_per_sentence,
        "COMPLEX WORD COUNT": complex_word_count,
        "WORD COUNT": word_count,
        "SYLLABLE PER WORD": syllable_per_word,
        "PERSONAL PRONOUNS": personal_pronouns,
        "AVG WORD LENGTH": avg_word_length
    }
def main(input_file='/content/drive/MyDrive/Blackcoffer/Input.csv',
         output_file='/content/drive/MyDrive/Blackcoffer/Output Data Structure.csv',
         articles_dir='/content/articles'):
    """Analyze every saved article and write the metrics to a CSV file.

    Args:
        input_file: CSV file with at least the columns ``URL_ID`` and ``URL``.
        output_file: Destination CSV; one row per analyzed article.
        articles_dir: Directory holding the ``<URL_ID>.txt`` article files.

    Rows whose article file does not exist are reported and left out of the
    output.
    """
    input_df = pd.read_csv(input_file)

    output_columns = [
        'URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
        'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS',
        'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
        'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'
    ]
    results = []

    for index, row in input_df.iterrows():
        url_id = row['URL_ID']
        file_path = os.path.join(articles_dir, f"{url_id}.txt")

        if not os.path.exists(file_path):
            print(f"File {file_path} not found, skipping URL_ID {url_id}")
            continue

        # The article files are written as UTF-8; decoding them as latin-1
        # turns every non-ASCII character into garbage that ends up in the
        # tokens. errors='replace' keeps a stray invalid byte from aborting
        # the whole run.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            text = file.read()

        analysis_results = analyze_text(text)
        analysis_results['URL_ID'] = url_id
        analysis_results['URL'] = row['URL']
        results.append(analysis_results)

    # Build the frame once from the collected records, in the required column order.
    output_df = pd.DataFrame(results, columns=output_columns)
    output_df.to_csv(output_file, index=False)

# Run the analysis only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


File /content/articles/blackassign0036.txt not found, skipping URL_ID blackassign0036
File /content/articles/blackassign0049.txt not found, skipping URL_ID blackassign0049
