In [1]:
# %% [markdown]
# # NVIDIA Stock & News Analysis
# 
# ## Complete Analytical Workflow

# %% [code]
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# %% [code]
# Load Data
# Read the news/price CSV (path is relative to the working directory) and
# ask pandas to parse the `date` column as datetimes.
csv_path = '../data/nvidia_news_analysis.csv'
date_columns = ['date']
df = pd.read_csv(csv_path, parse_dates=date_columns)
print("Data Shape:", df.shape)
df.head()

# %% [markdown]
# ## Exploratory Data Analysis

# %% [code]
# Basic Statistics
# Summary table for the two sentiment scores and the price change.
stat_columns = ['sentiment_textblob', 'sentiment_finbert', 'price_change']
print("Descriptive Stats:")
display(df[stat_columns].describe())

# Date Range
# Earliest and latest timestamps in the `date` column, shown as calendar dates.
first_day = df['date'].min().date()
last_day = df['date'].max().date()
print(f"\nDate Range: {first_day} to {last_day}")

# %% [code]
# Correlation Analysis
# Pairwise correlations between the two sentiment scores and the price change,
# drawn as an annotated heatmap on a fixed [-1, 1] colour scale.
corr_columns = ['sentiment_textblob', 'sentiment_finbert', 'price_change']
corr_matrix = df[corr_columns].corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
plt.show()

# %% [markdown]
# ## Time Series Visualization

# %% [code]
# Configure plot
# Overlay FinBERT sentiment (bars, left axis) and price change (line, right
# axis) on one shared datetime x-axis.
#
# Both layers are drawn with matplotlib against real datetime values. A seaborn
# barplot would place the bars on a categorical axis (positions 0..n-1) while a
# line on the twinned axis uses actual dates, so the two layers would not line
# up on the shared x-axis.
#
# Average to one value per date so repeated dates yield a single bar/point
# (the same mean that seaborn's default estimator would draw).
daily = (
    df.groupby('date')[['sentiment_finbert', 'price_change']]
    .mean()
    .sort_index()
)

fig, ax1 = plt.subplots(figsize=(14, 8))
ax2 = ax1.twinx()

# Sentiment Bars
ax1.bar(daily.index, daily['sentiment_finbert'], alpha=0.3, color='blue')
ax1.set_xlabel('date')
ax1.set_ylabel('FinBERT Sentiment', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')
# Rotate the automatically generated date labels instead of overwriting them
# with one label per raw row, which mislabels ticks when dates repeat.
ax1.tick_params(axis='x', labelrotation=45)

# Price Change Line
ax2.plot(daily.index, daily['price_change'], color='red', marker='o')
ax2.set_ylabel('Price Change (%)', color='red')
ax2.tick_params(axis='y', labelcolor='red')

ax1.set_title('NVIDIA Stock Analysis: Sentiment vs Price Movement')
fig.tight_layout()
plt.show()

# %% [markdown]
# ## Machine Learning Prediction

# %% [code]
# Prepare data
# Goal: predict whether the *next* row's price change is positive, using the
# current row's sentiment scores and price change as features.
# NOTE(review): shift(-1) treats the following row as "the next period", so
# this assumes df is in chronological order with one row per period — confirm.
feature_cols = ['sentiment_finbert', 'sentiment_textblob', 'price_change']
next_change = df['price_change'].shift(-1)

# Drop rows with a missing feature or an unknown next change BEFORE labelling.
# `NaN > 0` evaluates to False, so labelling first would silently tag the final
# row (and any row followed by a gap) as a "down" move instead of excluding it.
# The modelling frame is derived from df rather than mutating it in place, so
# df keeps all of its rows and re-running this cell gives the same result.
model_df = (
    df.assign(next_price_change=next_change)
    .dropna(subset=feature_cols + ['next_price_change'])
)
model_df = model_df.assign(
    target=(model_df['next_price_change'] > 0).astype(int)
)

X = model_df[feature_cols]
y = model_df['target']

# Train/Test Split
# Chronological hold-out: with shuffle=False the last 20% of rows form the
# test set. No random_state is passed because nothing is shuffled.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Model Training
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluation
predictions = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, predictions):.2f}")
print("\nClassification Report:")
print(classification_report(y_test, predictions))

# Feature Importance
# Raw logistic-regression coefficients, largest first.
# NOTE(review): the features are not standardised here, so coefficient
# magnitudes are only comparable if the columns share a similar scale.
importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_[0]
}).sort_values('Coefficient', ascending=False)
display(importance)

# %% [markdown]
# ## Advanced Visualization

# %% [code]
# Sentiment Distribution
# Side-by-side histograms (20 bins, KDE overlay) of the two sentiment scores.
panels = [
    ('sentiment_textblob', 'TextBlob Sentiment Distribution'),
    ('sentiment_finbert', 'FinBERT Sentiment Distribution'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for panel_ax, (column, panel_title) in zip(axes, panels):
    sns.histplot(df[column], bins=20, kde=True, ax=panel_ax)
    panel_ax.set_title(panel_title)
fig.tight_layout()
plt.show()

# %% [code]
# Lag Analysis
# Correlate each row's FinBERT sentiment with the price change one and two
# rows later (shift(-k) pulls the value from k rows ahead onto the current row).
# NOTE(review): "rows later" only means "periods later" if df is in
# chronological order with one row per period — confirm.
lags = pd.DataFrame({
    'Sentiment': df['sentiment_finbert'],
    'Price Change (+1)': df['price_change'].shift(-1),
    'Price Change (+2)': df['price_change'].shift(-2)
}).corr()

plt.figure(figsize=(8, 6))
# Pin the diverging colour map to [-1, 1] so the neutral colour sits at zero
# correlation, matching the feature-correlation heatmap. Without limits the
# scale is fitted to the data and weak positive values can render as "cold".
sns.heatmap(lags, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Lag Correlation Analysis')
plt.tight_layout()
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: '../data/nvidia_full_dataset.csv'