### This workflow includes more advanced steps like calculating weighted sentiment and analyzing rolling correlations, which are suitable for a technical report.

In [None]:
# --- Imports ---
import yfinance as yf
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import numpy as np

# --- Download NLTK Data (if needed) ---
nltk.download('vader_lexicon')

# --- Configuration ---
COMPANY_TICKER = "YYGH"
COMPANY_NAME = "YY Group Holding Ltd"
INPUT_FILE = "posts.csv"  # Switched back to CSV

# --- Initialize Tools ---
sid = SentimentIntensityAnalyzer()

print("Configuration and setup complete.")

Configuration and setup complete.


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Yangu\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Data Ingestion from Plain Text File

In [23]:
# --- Load Posts from the plain text file ---
posts_data = []
try:
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue # Skip empty lines

            parts = line.split('|')
            try:
                if len(parts) == 3: # Format: YYYY-MM-DD | score | title
                    date_str, score_str, title = parts
                    score = int(score_str.strip())
                elif len(parts) == 2: # Format: YYYY-MM-DD | title
                    date_str, title = parts
                    score = 1 # Assign a default score of 1
                else:
                    print(f"SKIPPING MALFORMED LINE: Line does not have 2 or 3 parts -> {line}")
                    continue

                posts_data.append({
                    'created_date': pd.to_datetime(date_str.strip()).date(),
                    'title': title.strip(),
                    'score': score
                })
            except (ValueError, IndexError) as e:
                print(f"COULD NOT PARSE LINE: {line}. Error: {e}")

    df_posts = pd.DataFrame(posts_data)
    
    # --- 💡 NEW: VALIDATION STEP ---
    if df_posts.empty or 'title' not in df_posts.columns:
        print("\n-------------------------------------------------------------")
        print("ERROR: Failed to create DataFrame. It is either empty or missing the 'title' column.")
        print("Please check your posts.txt file for formatting errors and review any 'SKIPPING' messages above.")
        print("-------------------------------------------------------------")
    else:
        print(f"Successfully loaded and parsed {len(df_posts)} posts from {INPUT_FILE}.")
        print("\nDataFrame Columns:", df_posts.columns.tolist())
        print("DataFrame Head:")
        print(df_posts.head())

except FileNotFoundError:
    print(f"ERROR: The file '{INPUT_FILE}' was not found. Please make sure it's in the same directory as your notebook.")


-------------------------------------------------------------
ERROR: Failed to create DataFrame. It is either empty or missing the 'title' column.
Please check your posts.txt file for formatting errors and review any 'SKIPPING' messages above.
-------------------------------------------------------------


### 2.0 Methodology

#### 2.1 Data Sources
-   **Social Media Data:** Posts were analyzed from a pre-compiled dataset contained in a plain text file (`posts.txt`). The dataset consists of post titles, their creation dates, and upvote scores, formatted with a pipe `|` delimiter.
-   **Financial Data:** Historical daily stock data...

In [24]:
# --- Calculate VADER Sentiment ---
# This applies the sentiment analyzer to each title in the 'title' column.
df_posts['sentiment_score'] = df_posts['title'].apply(lambda title: sid.polarity_scores(title)['compound'])

# --- Feature Engineering: Weighted Sentiment ---
# We multiply the sentiment by the post's score to give it more weight.
# We add 1 to the score to prevent multiplying by zero for posts with a score of 0.
df_posts['sentiment_weighted'] = df_posts['sentiment_score'] * (df_posts['score'] + 1)

print("Sentiment analysis and feature engineering complete.")
# Display the results to verify
df_posts[['title', 'score', 'sentiment_score', 'sentiment_weighted']].head()

KeyError: 'title'