In [None]:
%pip install fake-useragent  # Install fake-useragent library

from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize


# Sentence-tokenizer data. Older NLTK releases load 'punkt'; newer releases
# look up the 'punkt_tab' resource instead, so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')


url = 'https://www.nytimes.com/section/business'

# Create a UserAgent object
user_agent = UserAgent()

# Send a browser-like User-Agent header generated by fake-useragent
headers = {'User-Agent': user_agent.chrome}

# Define the result up front so later cells always find `sentences`,
# even when the page could not be retrieved.
sentences = []

# Send an HTTP GET request; the timeout stops the cell from hanging
# indefinitely on a stalled connection (it raises instead).
response = requests.get(url, headers=headers, timeout=30)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract headings (h1-h6) and paragraphs (p)
    headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    paragraphs = soup.find_all('p')

    # Combine headings and paragraphs into one string, each item followed by a
    # blank line; join avoids repeated string re-allocation in a loop.
    extracted_text = "".join(item.get_text() + "\n\n" for item in headings + paragraphs)

    # Tokenize into sentences, keep only those with at least 6 words, and
    # replace embedded newlines with spaces.
    sentences = [
        sentence.replace('\n', ' ')
        for sentence in sent_tokenize(extracted_text)
        if len(sentence.split()) > 5
    ]

    # Print the list of sentences
    print(sentences)
else:
    print('Failed to retrieve the webpage')


In [None]:
import re

# Define a function to remove extra white spaces from a list of strings
def remove_extra_spaces_from_list(sentences):
    """Collapse every run of whitespace in each string to a single space.

    Args:
        sentences: iterable of strings to clean.

    Returns:
        A new list with one cleaned string per input string, in the same
        order. Leading/trailing whitespace runs become a single space; they
        are not stripped.
    """
    whitespace_run = re.compile(r'\s+')
    return [whitespace_run.sub(' ', entry) for entry in sentences]

# Illustrative sample of the kind of input the helper cleans.
# NOTE: this list is not passed to the helper below; it is defined but unused.
list_with_extra_spaces = ["This    is     a    sentence   with  extra     spaces.", "Another    example   with    extra    spaces."]
# Clean the scraped `sentences` (produced by the scraping cell) and keep the
# result under a separate name so the raw list stays available.
modifiedSentences = remove_extra_spaces_from_list(sentences)
# Show the cleaned sentences
print(modifiedSentences)


In [None]:
!pip install transformer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

model = BertForSequenceClassification.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis",num_labels=3)
tokenizer = BertTokenizer.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis")

nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# sentences = ["there is a shortage of capital, and we need extra financing",
#              "growth is strong and we have plenty of liquidity",
#              "there are doubts about our finances",
#              "profits are flat"]
results = nlp(sentences)
print(modifiedSentences)
print(results)
print(type(results))




In [None]:
# Preview the first ten (sentence, prediction) pairs
preview_pairs = list(zip(modifiedSentences, results))[:10]
for text, prediction in preview_pairs:
    sentiment = prediction['label']
    confidence = prediction['score']
    print("Sentence:", text)
    print("Sentiment:", sentiment)
    print("Score:", confidence)
    print()

In [None]:
import matplotlib.pyplot as plt

# Output data: per-label occurrence count and summed confidence score
labels = ['positive', 'neutral', 'negative']
label_counts = {label: 0 for label in labels}
label_scores = {label: 0 for label in labels}

# Count the occurrences of each label and accumulate its total score
for result in results:
    label = result['label']
    label_counts[label] += 1
    label_scores[label] += result['score']

# Mean score per label (0 when the label never occurred, avoiding division by zero)
mean_scores = [label_scores[label] / label_counts[label] if label_counts[label] > 0 else 0 for label in labels]

# Pie chart inputs, in the same order as `labels`
sizes = [label_counts[label] for label in labels]
colors = ['lightgreen', 'lightblue', 'lightcoral']
explode = (0.1, 0, 0)  # explode 1st slice

if sum(sizes) == 0:
    # With no classified sentences every slice is zero, so there are no
    # fractions to draw; report that instead of plotting.
    print('No sentiment results to plot')
else:
    # Slice captions carry the label plus its mean score, e.g. "neutral (0.98)"
    annotated_labels = [f"{label} ({mean_score:.2f})" for label, mean_score in zip(labels, mean_scores)]

    plt.figure(figsize=(8, 6))
    patches, texts, autotexts = plt.pie(sizes, explode=explode, labels=annotated_labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)

    plt.title('Sentiment Analysis Results')
    plt.axis('equal')  # keep the pie circular
    plt.show()


In [None]:
import matplotlib.pyplot as plt

from collections import Counter

# Score window shown in the plot; the title below is derived from these bounds
# so the description cannot drift from the filter actually applied.
SCORE_MIN, SCORE_MAX = 0.97, 1.0

# Extract scores
scores = [result['score'] for result in results]

# Count how often each exact score value occurs
score_counts = Counter(scores)

# Distinct scores in ascending order, with their frequencies
sorted_scores = sorted(score_counts)
frequencies = [score_counts[score] for score in sorted_scores]

# Keep only scores inside [SCORE_MIN, SCORE_MAX]
filtered_scores = [score for score in sorted_scores if SCORE_MIN <= score <= SCORE_MAX]
filtered_frequencies = [score_counts[score] for score in filtered_scores]

# Plotting
plt.figure(figsize=(10, 6))
plt.plot(filtered_scores, filtered_frequencies, marker='o', linestyle='-')
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.title(f'Detailed Representation of Score Values between {SCORE_MIN} and {SCORE_MAX}')
plt.grid(True)
plt.show()
