In [23]:
import requests
import pandas as pd
import re, os
import nltk
from bs4 import BeautifulSoup
from nltk.sentiment import SentimentIntensityAnalyzer
from collections import Counter
import hashlib
import difflib
from transformers import pipeline

In [2]:
# Ensure necessary NLTK components are available
nltk.download('vader_lexicon')

# Initialize Sentiment Analyzer
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [8]:


def clean_text(text):
   text = text.lower()  # Convert to lowercase
   text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags
   text = re.sub(r'[^\w\s.%]', '', text)  # Remove special characters except percentages
   text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
   return text


In [9]:
input_1 = "<p>Stock prices <b>surged</b> by 5% today!</p>"
clean_text(input_1)

'stock prices surged by 5% today'

In [10]:
input_2 = "Tesla's earnings report: **better-than-expected**!  🚀🚀"
clean_text(input_2)

'teslas earnings report betterthanexpected'

In [11]:
input_3 = "<div> The market   is volatile!!! Prices dropped 10%.  </div>"
clean_text(input_3)

'the market is volatile prices dropped 10%.'

In [None]:
def impute_missing_data(df):
  df['published_date'].fillna('1980-01-01', inplace=True)  # Default for missing dates
  df['source'].fillna('Unknown', inplace=True)
  df['summary'].fillna('No Summary Available', inplace=True)
  return df


The following test script provides the comparison of the input before imputation and after imputation. Though the code is standardised, dates imputed with the string value 'Unknown' should be validated with date type for downstream tasks

In [14]:

# Sample data:
# published_data have 2 missing dates
# source column have 2 missing values
# summary column has two missing summaries
data = {
    'published_date': ['2024-02-01', '2024-02-02', '2024-02-03', None, None, '2024-02-06'],
    'source': ['Reuters', 'Bloomberg', 'CNBC', None, 'Forbes', None],
    'summary': [
        "Stock prices surged after strong earnings report.",
        "Market remains stable despite geopolitical tensions.",
        "Investors optimistic about the tech sector’s growth.",
        None,  # Missing summary
        "Earnings decline could lead to a market downturn.",  # Negative sentiment
        None  # Missing summary
    ]
}

# create a pandas dataFrame
df = pd.DataFrame(data)

# Print raw data
print("\n=== Before Imputation ===")
print(df)

# Sentiment Analysis Before Imputation
df['sentiment_before'] = df['summary'].apply(lambda x: sia.polarity_scores(x)['compound'] if pd.notna(x) else None)

# Apply Imputation Function
df = impute_missing_data(df)

# Sentiment Analysis After Imputation
df['sentiment_after'] = df['summary'].apply(lambda x: sia.polarity_scores(x)['compound'] if pd.notna(x) else None)

# Print cleaned data
print("\n=== After Imputation ===")
print(df)

# Show impact on sentiment analysis
print("\n=== Sentiment Score Comparison ===")
for i in range(len(df)):
    print(f"Row {i+1}: Before = {df['sentiment_before'][i]}, After = {df['sentiment_after'][i]}")



=== Before Imputation ===
  published_date     source                                            summary
0     2024-02-01    Reuters  Stock prices surged after strong earnings report.
1     2024-02-02  Bloomberg  Market remains stable despite geopolitical ten...
2     2024-02-03       CNBC  Investors optimistic about the tech sector’s g...
3           None       None                                               None
4           None     Forbes  Earnings decline could lead to a market downturn.
5     2024-02-06       None                                               None

=== After Imputation ===
  published_date     source  \
0     2024-02-01    Reuters   
1     2024-02-02  Bloomberg   
2     2024-02-03       CNBC   
3        Unknown    Unknown   
4        Unknown     Forbes   
5     2024-02-06    Unknown   

                                             summary  sentiment_before  \
0  Stock prices surged after strong earnings report.            0.5106   
1  Market remains stable des

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['published_date'].fillna('Unknown', inplace=True)  # Default for missing dates
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['source'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate obje

Detect Bias

In [15]:
import pandas as pd

# Define a dictionary with credibility scores for different sources
#  following credibility scores are arbitrarily assigned for the purpose of example only

SOURCE_CREDIBILITY = {"Reuters": 5, "Bloomberg": 5, "CNBC": 4, "Financial   Times": 4, "Forbes": 3,"YahooFinance": 3, "Unknown": 0 }
def detect_bias(df):
    source_counts = Counter(df['source'])
    print("Source Distribution:", source_counts)
    return df

# Sample dataset with different sources
data = {
    'published_date': ['2024-02-01', '2024-02-02', '2024-02-03', '2024-02-04', '2024-02-05', '2024-02-06'],
    'source': ['Bloomberg', 'Reuters', 'CNBC', 'Forbes', 'Unknown', 'YahooFinance'],
    'summary': [
        "Stock prices surged after strong earnings report.",
        "Market remains stable despite geopolitical tensions.",
        "Investors optimistic about the tech sector’s growth.",
        "Rumors about Apple launching a new product soon.",
        "A random blog claims market crash incoming!",
        "Crypto community is discussing Bitcoin's price surge."
    ]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Apply credibility scoring function
df = detect_bias(df)




Source Distribution: Counter({'Bloomberg': 1, 'Reuters': 1, 'CNBC': 1, 'Forbes': 1, 'Unknown': 1, 'YahooFinance': 1})


Contextual Sentiment Analysis

In [20]:
from transformers import pipeline

# Initialize the contextual sentiment analysis model
contextual_analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")

def contextual_sentiment_analysis(text):
    if not text or not isinstance(text, str):  # Handle empty and non-string inputs
        return "Neutral", 0.0
    result = contextual_analyzer(text)
    sentiment_label = result[0]['label']
    confidence = result[0]['score']
    return sentiment_label, confidence

# Test cases
test_cases = [
    "Apple stock surges 10% after record-breaking earnings report.",  # Positive
    "The stock market remains stable despite recent fluctuations.",  # Neutral/Positive
    "Tesla shares plummet 15% due to declining revenue and supply chain issues.",  # Negative
    "Gold prices remain unchanged amid mixed economic data.",  # Neutral
    "Some analysts predict a possible stock market crash.",  # Neutral/Negative
    "Rumors suggest the Federal Reserve might cut interest rates soon.",  # Neutral
    "Despite poor earnings, the company expects strong growth next year.",  # Neutral
    "NASDAQ sees a bullish trend as S&P 500 reaches a new high.",  # Positive
    "I love hiking in the mountains during weekends.",  # Non-financial, should be Neutral
    "",  # Edge case: Empty input
    "Profits",  # Edge case: Single word
    "Stock price up 3.5%!!! 🔥🔥🔥",  # Handling special characters and numbers
    "Las acciones de Tesla suben un 5%."  # Non-English text
]

# Run tests
print("\n=== Sentiment Analysis Test Cases ===")
for i, text in enumerate(test_cases):
    sentiment, confidence = contextual_sentiment_analysis(text)
    print(f"Test {i+1}: {text}")
    print(f"   → Sentiment: {sentiment}, Confidence: {confidence:.4f}\n")


Device set to use cpu



=== Sentiment Analysis Test Cases ===
Test 1: Apple stock surges 10% after record-breaking earnings report.
   → Sentiment: positive, Confidence: 0.8603

Test 2: The stock market remains stable despite recent fluctuations.
   → Sentiment: positive, Confidence: 0.8533

Test 3: Tesla shares plummet 15% due to declining revenue and supply chain issues.
   → Sentiment: negative, Confidence: 0.9712

Test 4: Gold prices remain unchanged amid mixed economic data.
   → Sentiment: negative, Confidence: 0.6643

Test 5: Some analysts predict a possible stock market crash.
   → Sentiment: negative, Confidence: 0.9347

Test 6: Rumors suggest the Federal Reserve might cut interest rates soon.
   → Sentiment: negative, Confidence: 0.8753

Test 7: Despite poor earnings, the company expects strong growth next year.
   → Sentiment: positive, Confidence: 0.9344

Test 8: NASDAQ sees a bullish trend as S&P 500 reaches a new high.
   → Sentiment: positive, Confidence: 0.8942

Test 9: I love hiking in the m

Extracting numerical data


In [21]:
def extract_numerical_data(text):
    return re.findall(r'\d+\.\d+%?|\$\d+(?:,\d{3})*(?:\.\d{1,2})?', text)


In [22]:


# Test cases
test_cases = [
    "Inflation rose by 3.5% in Q1.",
    "Apple's revenue hit $2,500,000 last year.",
    "Stock increased by 4.2% after a $1.5M buyback.",
    "The stock fell 0.85% after poor earnings.",
    "No major price movements were observed.",
    "Tesla stock jumped 5.75% after reporting $25.3 billion in revenue.",
    "Amazon's profit reached $1,250,000.50 this quarter.",
    "GDP grew 2.9% last quarter.",
    "New contract worth $50k is expected.",
    "Market fell by -1.2% today."
]

# Running the test cases
print("\n=== Extracted Numerical Data ===")
for i, text in enumerate(test_cases):
    result = extract_numerical_data(text)
    print(f"Test {i+1}: {text}")
    print(f"   → Extracted Data: {result}\n")



=== Extracted Numerical Data ===
Test 1: Inflation rose by 3.5% in Q1.
   → Extracted Data: ['3.5%']

Test 2: Apple's revenue hit $2,500,000 last year.
   → Extracted Data: ['$2,500,000']

Test 3: Stock increased by 4.2% after a $1.5M buyback.
   → Extracted Data: ['4.2%', '$1.5']

Test 4: The stock fell 0.85% after poor earnings.
   → Extracted Data: ['0.85%']

Test 5: No major price movements were observed.
   → Extracted Data: []

Test 6: Tesla stock jumped 5.75% after reporting $25.3 billion in revenue.
   → Extracted Data: ['5.75%', '$25.3']

Test 7: Amazon's profit reached $1,250,000.50 this quarter.
   → Extracted Data: ['$1,250,000.50']

Test 8: GDP grew 2.9% last quarter.
   → Extracted Data: ['2.9%']

Test 9: New contract worth $50k is expected.
   → Extracted Data: ['$50']

Test 10: Market fell by -1.2% today.
   → Extracted Data: ['1.2%']



Fetch news from source

In [24]:
# Alpha Vantage API Configuration
API_KEY = os.environ.get('ALPHA_KEY')
BASE_URL = 'https://www.alphavantage.co/query'

In [26]:
# Step 12: Fetch Financial News from Alpha Vantage
def fetch_alpha_vantage_news():
    params = {
        'function': 'NEWS_SENTIMENT',
        'apikey': API_KEY,
        'topics': 'technology, finance, energy',
        'sort': 'LATEST',
        'limit': 50
    }
    response = requests.get(BASE_URL, params=params)
    if response.status_code == 200:
        return response.json()
    else:
        print("Error:", response.status_code, response.text)
        return None

# Step 13: Fetch Financial News from Yahoo Finance
def fetch_yahoo_finance_news():
    yahoo_news_url = 'https://query1.finance.yahoo.com/v7/finance/news'
    params = {'category': 'technology, finance, energy', 'count': 50}
    response = requests.get(yahoo_news_url, params=params)
    if response.status_code == 200:
        return response.json()
    else:
        print("Error:", response.status_code, response.text)
        return None

# Main Execution: Fetch Data from Both Alpha Vantage and Yahoo Finance
alpha_vantage_data = fetch_alpha_vantage_news()
yahoo_finance_data = fetch_yahoo_finance_news()


In [27]:
yahoo_finance_data = fetch_yahoo_finance_news()

Error: 429 Too Many Requests



Duplicate Content Detection

In [None]:

def remove_duplicates(df):
    df['summary_hash'] = df['summary'].apply(lambda x: hashlib.md5(x.encode()).hexdigest())
    df.drop_duplicates(subset=['summary_hash'], keep='first', inplace=True)
    return df

Identify Near-Duplicates

In [None]:
def identify_near_duplicates(df, threshold=0.9):
    unique_articles = []
    for idx, row in df.iterrows():
        is_duplicate = False
        for article in unique_articles:
            similarity = difflib.SequenceMatcher(None, row['summary'], article['summary']).ratio()
            if similarity > threshold:
                is_duplicate = True
                break
        if not is_duplicate:
            unique_articles.append(row)

    # return the dataframe of unique articles
    return pd.DataFrame(unique_articles)

Context Preservation During Cleaning

In [None]:
def preserve_context(text):
    context_terms = {
        'stock': 'equity',
        'bullish': 'positive market sentiment',
        'bearish': 'negative market sentiment'
    }
    for term, replacement in context_terms.items():
        text = text.replace(term, replacement)
    return text

Maintain Semantic Relationships

In [None]:
def maintain_semantics(text):
    semantic_terms = {
        'increase': 'rise',
        'growth': 'rise',
        'decrease': 'fall',
        'decline': 'fall'
    }
    for term, replacement in semantic_terms.items():
        text = text.replace(term, replacement)
    return text

Handling Ambiguous Content

In [None]:
def handle_ambiguity(text):
    ambiguous_terms = {
        'bullish': 'positive market sentiment',
        'bearish': 'negative market sentiment',
        'volatile': 'unstable market'
    }
    for term, replacement in ambiguous_terms.items():
        text = text.replace(term, replacement)
    return text

Preprocessing Pipeline

In [None]:
# Pipeline for Financial News Data
def preprocess_news_data(news_data):
    articles = news_data.get('feed', [])
    cleaned_data = []

    for article in articles:
        title = clean_text(article.get('title', ''))
        summary = clean_text(article.get('summary', ''))
        source = article.get('source', 'Unknown')
        sentiment_score = sia.polarity_scores(summary)['compound']

        # Use contextual sentiment analysis to determine sentiment
        contextual_sentiment_label, confidence = contextual_sentiment_analysis(summary)

        # Choose sentiment from context or default to VADER-based sentiment
        verified_sentiment = contextual_sentiment_label if confidence > 0.7 else "Neutral"

        # Extract numerical data (e.g., stock prices, percentages)
        numerical_data = extract_numerical_data(summary)

        # Calculate credibility score based on the source
        credibility_score = SOURCE_CREDIBILITY.get(source, 2)

        # Step 3: Context Preservation
        summary = preserve_context(summary)
        summary = maintain_semantics(summary)  # Step 4: Maintain Semantic Relationships
        summary = handle_ambiguity(summary)  # Step 5: Handle Ambiguity

        cleaned_data.append({
            'title': title,
            'summary': summary,
            'source': source,
            'credibility_score': credibility_score,
            'published_date': article.get('time_published', 'Unknown'),
            'topics': ", ".join([topic.get('name', '') for topic in article.get('topics', []) if isinstance(topic, dict)]),
            'alpha_vantage_sentiment': article.get('overall_sentiment_label', 'Neutral'),
            'verified_sentiment': verified_sentiment,
            'confidence': confidence,
            'extracted_numerical_data': numerical_data
        })

    # Convert to DataFrame for further analysis
    df = pd.DataFrame(cleaned_data)

    # Remove duplicates and near-duplicates
    df = remove_duplicates(df)
    df = identify_near_duplicates(df)

    # Handle missing data and mitigate bias
    df = impute_missing_data(df)
    df = detect_bias(df)

    return df

Combine data

In [None]:

# Combine both datasets
if alpha_vantage_data and yahoo_finance_data:
    combined_data = alpha_vantage_data['feed'] + yahoo_finance_data['items']
    cleaned_df = preprocess_news_data({'feed': combined_data})
    # Display the cleaned data
    print("Cleaned Data Sample:")
    print(cleaned_df.head())
    # Save the cleaned data to CSV
    cleaned_df.to_csv('cleaned_financial_news_combined.csv', index=False)
