In [None]:
!pip install textblob


In [None]:

import pandas as pd
from textblob import TextBlob
from scipy.stats import pearsonr
import os


In [None]:

# Step 1: Load Data
def load_stock_data(file_path):
    """Load a stock price CSV and compute day-over-day returns.

    Parameters
    ----------
    file_path : str or path-like
        CSV file that contains at least a ``Date`` and a ``Close`` column.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (``Daily_Return``) indexed by ``Date`` in
        ascending order. The first row is NaN because it has no predecessor.
    """
    prices = pd.read_csv(file_path)
    prices['Date'] = pd.to_datetime(prices['Date'], errors='coerce')

    # Rows whose date could not be parsed become NaT; they cannot be placed on
    # the timeline, so drop them before they distort the return calculation.
    # pct_change compares each row with the one before it, so the rows must be
    # in chronological order (stable sort keeps ties in file order).
    prices = (
        prices.dropna(subset=['Date'])
        .set_index('Date')
        .sort_index(kind='mergesort')
    )

    prices['Daily_Return'] = prices['Close'].pct_change()  # Calculate daily returns
    return prices[['Daily_Return']]


In [None]:

def load_news_data(file_path):
    """Read the news CSV and return its ``date`` and ``headline`` columns.

    The ``date`` column is parsed with ``pd.to_datetime``; values that cannot
    be parsed are coerced to ``NaT`` instead of raising.
    """
    raw_news = pd.read_csv(file_path)
    parsed_dates = pd.to_datetime(raw_news['date'], errors='coerce')
    return raw_news.assign(date=parsed_dates)[['date', 'headline']]


In [None]:

# Step 2: Perform Sentiment Analysis
def calculate_sentiment(news_df):
    """Assign a TextBlob polarity score to every news headline.

    Parameters
    ----------
    news_df : pandas.DataFrame
        Frame with a ``headline`` column. It is modified in place: a
        ``Sentiment`` column is added (or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same ``news_df`` object, now carrying the ``Sentiment`` column.
        Headlines that are missing or not strings get NaN.
    """
    def get_sentiment_score(text):
        # TextBlob only accepts strings; a missing headline (NaN after
        # read_csv) would raise TypeError, so score it as "unknown".
        if not isinstance(text, str):
            return float('nan')
        return TextBlob(text).sentiment.polarity

    # astype(float) keeps the column numeric even when the frame is empty.
    news_df['Sentiment'] = news_df['headline'].apply(get_sentiment_score).astype(float)
    return news_df


In [None]:

# Step 3: Aggregate Daily Sentiment Scores
def aggregate_sentiments(news_df):
    """Compute the average sentiment score for each calendar day.

    Parameters
    ----------
    news_df : pandas.DataFrame
        Frame with a ``date`` column (anything ``pd.to_datetime`` can parse)
        and a numeric ``Sentiment`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One ``Sentiment`` column holding the daily mean, indexed by a
        timezone-naive, midnight-normalized ``date`` index in ascending order.
        Rows whose date cannot be parsed are left out.
    """
    timestamps = pd.to_datetime(news_df['date'], errors='coerce')
    if not pd.api.types.is_datetime64_any_dtype(timestamps):
        # Mixed UTC offsets can leave an object column; unify them via UTC so
        # the datetime accessors below are available.
        timestamps = pd.to_datetime(news_df['date'], errors='coerce', utc=True)

    if timestamps.dt.tz is not None:
        # Drop the timezone but keep the wall-clock time, so the result can be
        # matched against a timezone-naive index.
        timestamps = timestamps.dt.tz_localize(None)

    # Group on the calendar day, not the full timestamp; otherwise headlines
    # published at different times of the same day are never averaged together.
    days = timestamps.dt.normalize()

    daily_mean = news_df['Sentiment'].groupby(days).mean()
    daily_mean.index.name = 'date'
    return daily_mean.to_frame('Sentiment')


In [None]:

# Step 4: Align Stock and News Data
def align_data(stock_df, sentiment_df):
    """Combine returns and sentiment, keeping only index values present in both.

    Both frames are matched on their index with an inner join, so any row
    without a counterpart in the other frame is dropped.
    """
    aligned = stock_df.join(other=sentiment_df, how='inner')
    return aligned


In [None]:

# Step 5: Calculate Correlation
def calculate_correlation(df):
    """Compute the Pearson correlation between stock returns and sentiment.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Daily_Return`` and ``Sentiment`` columns on a shared index.

    Returns
    -------
    tuple
        ``(correlation, p_value)`` as returned by ``scipy.stats.pearsonr``, or
        ``(nan, nan)`` when fewer than two complete observations are available.
    """
    # Drop rows pairwise: removing NaNs from each column separately would leave
    # arrays of different lengths (the first Daily_Return is always NaN) or
    # silently pair values that belong to different rows.
    paired = df[['Daily_Return', 'Sentiment']].dropna()

    # pearsonr needs at least two points; report "undefined" rather than raising.
    if len(paired) < 2:
        return float('nan'), float('nan')

    correlation, p_value = pearsonr(paired['Daily_Return'], paired['Sentiment'])
    return correlation, p_value


In [None]:

# Step 6: Process Data and Analysis
def process_correlation_analysis(stock_dir, news_file):
    """Run the full sentiment-vs-return analysis for every stock CSV in a directory.

    Parameters
    ----------
    stock_dir : str
        Directory scanned for files whose name ends in ``.csv``; each one is
        loaded as a stock price file.
    news_file : str
        Path to the news CSV, scored and aggregated once for all stocks.

    Returns
    -------
    pandas.DataFrame
        One row per stock file with the columns ``Stock``, ``Correlation`` and
        ``P-Value``, ordered by file name. The columns are present even when
        no stock file is found.
    """
    # The news side does not depend on the stock, so prepare it once up front.
    news_df = load_news_data(news_file)
    news_df = calculate_sentiment(news_df)
    aggregated_sentiments = aggregate_sentiments(news_df)

    results = []
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for stock_file in sorted(os.listdir(stock_dir)):
        if not stock_file.endswith('.csv'):
            continue

        stock_df = load_stock_data(os.path.join(stock_dir, stock_file))
        aligned_df = align_data(stock_df, aggregated_sentiments)
        correlation, p_value = calculate_correlation(aligned_df)

        # The stock label is the part of the file name before the first underscore.
        stock_name = stock_file.split('_')[0]
        results.append({'Stock': stock_name, 'Correlation': correlation, 'P-Value': p_value})

    return pd.DataFrame(results, columns=['Stock', 'Correlation', 'P-Value'])


In [None]:

# Step 7: Main Workflow
if __name__ == "__main__":
    # Input locations; relative paths are resolved against the current working directory.
    stock_directory = '../Data/yfinance_data/' # Directory scanned for stock price CSV files
    news_file_path = '../Data/raw_analyst_ratings.csv'  # Single CSV file holding the news headlines

    # Run the whole pipeline and print the per-stock correlation table.
    result_df = process_correlation_analysis(stock_directory, news_file_path)
    print(result_df)
    # Optional: uncomment to persist the table to disk.
    # result_df.to_csv('/mnt/data/correlation_results.csv', index=False)
