In [None]:
import requests
import pandas as pd
import os
from datetime import datetime

# Alpha Vantage API key, read from the ALPHAVANTAGE_API_KEY environment variable
# so the secret is not stored in (or shared with) the notebook itself.
# NOTE(review): a key was previously committed in this cell; it should be treated
# as exposed and rotated.
API_KEY = os.environ.get('ALPHAVANTAGE_API_KEY', '')
if not API_KEY:
    # Warn instead of raising so the rest of the notebook can still be run offline.
    print("Warning: ALPHAVANTAGE_API_KEY is not set; sentiment requests will be sent without an API key.")

# Function to get sentiment data for a specific stock
def get_sentiment_data(symbol, timeout=30):
    """Fetch the Alpha Vantage NEWS_SENTIMENT feed for one ticker.

    Args:
        symbol: Ticker symbol sent as the ``tickers`` query parameter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The value stored under the ``feed`` key of the JSON response, or an
        empty list when the request fails, the status code is not 200, the
        body is not valid JSON, or the response has no ``feed`` key.
    """
    try:
        # Let requests build and URL-encode the query string; a timeout keeps a
        # stalled connection from hanging the notebook indefinitely.
        response = requests.get(
            'https://www.alphavantage.co/query',
            params={'function': 'NEWS_SENTIMENT', 'tickers': symbol, 'apikey': API_KEY},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f"Failed to fetch data for {symbol}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch data for {symbol}. HTTP Status code: {response.status_code}")
        return []

    try:
        data = response.json()
    except ValueError:
        print(f"Failed to fetch data for {symbol}: response body was not valid JSON.")
        return []

    if 'feed' in data:
        return data['feed']

    # NOTE(review): Alpha Vantage appears to report problems such as rate limits
    # or invalid keys inside a 200 response under these keys - confirm against
    # the API docs. Surfacing the message avoids mistaking an API error for
    # "no news for this ticker".
    detail = ''
    if isinstance(data, dict):
        detail = data.get('Information') or data.get('Note') or data.get('Error Message') or ''
    message = f"No sentiment data available for {symbol}."
    if detail:
        message += f" API response: {detail}"
    print(message)
    return []

# Function to process sentiment data and save to CSV
def save_sentiment_to_csv(symbol, sentiment_data, directory='./sentiment_data/'):
    """Flatten a sentiment feed into a table and write it to a dated CSV file.

    Args:
        symbol: Ticker symbol; stored in the ``symbol`` column and used as the
            filename prefix.
        sentiment_data: Iterable of dict-like feed items (anything supporting
            ``.get``). When empty or falsy, nothing is written.
        directory: Output directory, with or without a trailing separator.
            It is created if it does not exist.

    Returns:
        None. The file ``<directory>/<symbol>_sentiment_<YYYY-MM-DD>.csv`` is
        written (today's date) and its path is printed.
    """
    if not sentiment_data:
        return

    # exist_ok avoids the check-then-create race; skip when directory is ''
    # (meaning "current working directory").
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Keep only the fields of interest; missing keys become empty strings
    processed_data = [
        {
            'symbol': symbol,
            'title': item.get('title', ''),
            'summary': item.get('summary', ''),
            'url': item.get('url', ''),
            'time_published': item.get('time_published', ''),
            'sentiment_score': item.get('overall_sentiment_score', ''),
            'source': item.get('source', ''),
        }
        for item in sentiment_data
    ]

    # Convert to DataFrame
    df = pd.DataFrame(processed_data)

    # os.path.join keeps the file inside `directory` even when the caller
    # passes a path without a trailing slash.
    filename = os.path.join(directory, f"{symbol}_sentiment_{datetime.now().strftime('%Y-%m-%d')}.csv")

    # Save DataFrame to CSV
    df.to_csv(filename, index=False)
    print(f"Saved sentiment data to {filename}")

# List of stock symbols to fetch sentiment data for
symbols = ['VFIAX', 'VSMPX', 'AGTHX', 'OXY', 'XOM', 'VRTX', 'NVDA', 'GOOG', 'AMZN']

# Fetch and save sentiment data for each symbol
# One HTTP request is issued per symbol, back to back. get_sentiment_data returns []
# when nothing usable came back, in which case save_sentiment_to_csv writes no file.
# NOTE(review): `symbol` and `sentiment_data` remain defined as kernel globals after
# this loop; `sentiment_data` is rebound to a dict of DataFrames in a later cell.
for symbol in symbols:
    sentiment_data = get_sentiment_data(symbol)
    save_sentiment_to_csv(symbol, sentiment_data)

In [1]:
import yfinance as yf
import pandas as pd
from datetime import datetime
import os

def download_stock_data(tickers, start_date='2010-01-01'):
    """Download price history for each ticker with yfinance.

    Args:
        tickers: Iterable of ticker symbols, fetched one at a time.
        start_date: Start date passed to ``yf.download`` as ``start``.

    Returns:
        Dict mapping each ticker to whatever ``yf.download`` returned for it.
    """
    frames_by_ticker = {}
    for name in tickers:
        frames_by_ticker[name] = yf.download(name, start=start_date)
        # Progress message, printed once the download call has returned
        print(f"Downloaded data for {name}")
    return frames_by_ticker

def save_data_to_csv(data, directory='./stock_data/'):
    """Write each ticker's DataFrame to a dated CSV file.

    Args:
        data: Mapping of ticker symbol to a DataFrame (anything with ``to_csv``).
        directory: Output directory, with or without a trailing separator.
            It is created if it does not exist.

    Returns:
        None. One file ``<directory>/<ticker>_<YYYY-MM-DD>.csv`` (today's date)
        is written per ticker and its path is printed.
    """
    # exist_ok avoids the check-then-create race; skip when directory is ''
    # (meaning "current working directory").
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Compute the date stamp once so every file from this call shares it
    date_stamp = datetime.now().strftime('%Y-%m-%d')
    for ticker, df in data.items():
        # os.path.join keeps the file inside `directory` even when the caller
        # passes a path without a trailing slash.
        filename = os.path.join(directory, f"{ticker}_{date_stamp}.csv")
        df.to_csv(filename)
        print(f"Saved data to {filename}")

tickers = ['VFIAX', 'VSMPX', 'AGTHX', 'OXY', 'XOM', 'VRTX', 'NVDA', 'GOOG', 'AMZN']  # Example tickers
# Download history for every ticker using the default start date ('2010-01-01');
# `data` maps ticker -> downloaded frame.
data = download_stock_data(tickers)
# Write one CSV per ticker into the default './stock_data/' directory, stamped with today's date.
save_data_to_csv(data)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


Downloaded data for VFIAX
Downloaded data for VSMPX


[*********************100%%**********************]  1 of 1 completed


Downloaded data for AGTHX


[*********************100%%**********************]  1 of 1 completed


Downloaded data for OXY


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


Downloaded data for XOM
Downloaded data for VRTX


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


Downloaded data for NVDA
Downloaded data for GOOG


[*********************100%%**********************]  1 of 1 completed


Downloaded data for AMZN
Saved data to ./stock_data/VFIAX_2024-08-25.csv
Saved data to ./stock_data/VSMPX_2024-08-25.csv
Saved data to ./stock_data/AGTHX_2024-08-25.csv
Saved data to ./stock_data/OXY_2024-08-25.csv
Saved data to ./stock_data/XOM_2024-08-25.csv
Saved data to ./stock_data/VRTX_2024-08-25.csv
Saved data to ./stock_data/NVDA_2024-08-25.csv
Saved data to ./stock_data/GOOG_2024-08-25.csv
Saved data to ./stock_data/AMZN_2024-08-25.csv


In [6]:
import pandas as pd

# Define the directory where your files are located.
# A path relative to the notebook's working directory matches the default output
# directory of save_data_to_csv and keeps the notebook runnable on other machines.
directory = './stock_data/'

# List of mutual fund files (named <ticker>_<download date>.csv)
mutual_fund_files = [
    'AGTHX_2024-08-25.csv',
    'VFIAX_2024-08-25.csv',
    'VSMPX_2024-08-25.csv'
]

# Read the CSV files into DataFrames keyed by the ticker prefix of the filename.
# 'Date' is parsed to datetimes because later cells call .weekday() on it and
# add time offsets to it.
mutual_funds_data = {
    file.split('_')[0]: pd.read_csv(f'{directory}{file}', parse_dates=['Date'])
    for file in mutual_fund_files
}

# Display the first few rows of each mutual fund's data
for fund, df in mutual_funds_data.items():
    print(f"\nData for {fund}:")
    print(df.head())



Data for AGTHX:
         Date       Open       High        Low      Close  Adj Close  Volume
0  2010-01-04  27.790001  27.790001  27.790001  27.790001  12.156005       0
1  2010-01-05  27.860001  27.860001  27.860001  27.860001  12.186625       0
2  2010-01-06  27.910000  27.910000  27.910000  27.910000  12.208495       0
3  2010-01-07  27.920000  27.920000  27.920000  27.920000  12.212871       0
4  2010-01-08  28.090000  28.090000  28.090000  28.090000  12.287233       0

Data for VFIAX:
         Date        Open        High         Low       Close  Adj Close  \
0  2010-01-04  104.320000  104.320000  104.320000  104.320000  79.453156   
1  2010-01-05  104.650002  104.650002  104.650002  104.650002  79.704498   
2  2010-01-06  104.750000  104.750000  104.750000  104.750000  79.780685   
3  2010-01-07  105.180000  105.180000  105.180000  105.180000  80.108162   
4  2010-01-08  105.480003  105.480003  105.480003  105.480003  80.336678   

   Volume  
0       0  
1       0  
2       0  

In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import plotly.graph_objs as go

# Function to determine if a given day is a trading day (Monday to Friday)
def is_trading_day(date):
    """Return True when ``date`` falls on a weekday (Monday to Friday).

    Args:
        date: A ``datetime``, ``pandas.Timestamp`` or a date string that
            ``pandas.Timestamp`` can parse (e.g. ``'2024-08-23'``). Strings are
            accepted because ``read_csv`` yields date columns as text unless
            they are explicitly parsed.

    Returns:
        bool. Only the day of the week is checked; market holidays are not
        taken into account.
    """
    return pd.Timestamp(date).weekday() < 5  # Monday=0, Sunday=6

# Train a linear regression model for each mutual fund
models = {}

# Seeded generator so the random perturbation applied to the next-day features
# (and therefore the printed forecast) is reproducible across runs.
rng = np.random.default_rng(42)

for fund, df in mutual_funds_data.items():
    # read_csv leaves 'Date' as strings unless it is parsed; datetimes are needed
    # for the weekday filter and the date arithmetic below. No-op if already parsed.
    df['Date'] = pd.to_datetime(df['Date'])

    # Calculate daily returns as a feature (first row has no previous close -> 0)
    df['Daily Return'] = df['Close'].pct_change().fillna(0)

    # Select features and target variable
    # NOTE(review): the features are the same day's Open/High/Low, and the printed
    # sample for these funds shows Open == High == Low == Close, so the fit is close
    # to an identity mapping (recorded MSE is around 1e-29). Treat the error metrics
    # with caution.
    X = df[['Open', 'High', 'Low', 'Volume', 'Daily Return']]
    y = df['Close']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Store the trained model
    models[fund] = model

    # Evaluate the model using Mean Squared Error (MSE)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)

    print(f"Training Mean Squared Error for {fund}: {train_mse}")
    print(f"Testing Mean Squared Error for {fund}: {test_mse}")

    # Store predictions back into the DataFrame for plotting
    df['Predicted Close'] = model.predict(X)

    # Get the latest data point that falls on a weekday
    latest_data = df[df['Date'].apply(is_trading_day)].iloc[-1]

    # Build the feature row for the next trading day from the latest observation
    tom_features = {
        'Open': latest_data['Close'] * (1 + rng.normal(0, 0.01)),  # Slight random variation
        'High': latest_data['High'] * (1 + rng.normal(0, 0.01)),
        'Low': latest_data['Low'] * (1 + rng.normal(0, 0.01)),
        'Volume': latest_data['Volume'],  # Assume same volume
        'Daily Return': latest_data['Daily Return']  # Use the latest return as an estimate
    }

    # Convert to DataFrame
    tom_df = pd.DataFrame([tom_features])

    # Predict the next trading day's price using the trained model
    tom_price = models[fund].predict(tom_df)[0]

    # Next business day after the latest observation (Friday -> Monday), rather
    # than a fixed +3 days that is only right when the last row is a Friday.
    next_trading_date = latest_data['Date'] + pd.offsets.BDay(1)

    # Create a new row for the prediction
    tom_row = pd.DataFrame({
        'Date': [next_trading_date],
        'Close': [np.nan],
        'Predicted Close': [tom_price]
    })

    # Concatenate the new row to the original DataFrame
    df = pd.concat([df, tom_row], ignore_index=True)

    print(f"\nPredicted Close Price for {fund} (next trading day): {tom_price}")

    # Keep the last 10 observed days plus the forecast row
    df_last_ten_days = df.tail(11)

    # Create the plot using Plotly
    fig = go.Figure()

    # Actual Close Prices
    fig.add_trace(go.Scatter(x=df_last_ten_days['Date'], y=df_last_ten_days['Close'], mode='lines', name='Actual Close', line=dict(color='blue')))

    # Predicted Close Prices
    fig.add_trace(go.Scatter(x=df_last_ten_days['Date'], y=df_last_ten_days['Predicted Close'], mode='lines', name='Predicted Close (Train)', line=dict(color='green', dash='dash')))

    # Predicted Close Price for the next trading day
    fig.add_trace(go.Scatter(x=[df_last_ten_days.iloc[-1]['Date']], y=[df_last_ten_days.iloc[-1]['Predicted Close']], mode='markers', name='Predicted Close (Next Trading Day)', marker=dict(color='red', size=10)))

    # Update layout
    fig.update_layout(
        title=f'{fund} Actual vs Predicted Close Prices (Last 10 Days)',
        xaxis_title='Date',
        yaxis_title='Close Price',
        hovermode='x unified'
    )

    # Show interactive plot
    fig.show()


Training Mean Squared Error for AGTHX: 3.37337045982977e-29
Testing Mean Squared Error for AGTHX: 3.3481097849814447e-29

Predicted Close Price for AGTHX Tomorrow): 75.33790963622661


Training Mean Squared Error for VFIAX: 2.4404380785670378e-27
Testing Mean Squared Error for VFIAX: 2.4157083060375587e-27

Predicted Close Price for VFIAX Tomorrow): 521.0035510425308


Training Mean Squared Error for VSMPX: 3.3625805162954347e-28
Testing Mean Squared Error for VSMPX: 3.3128129793383497e-28

Predicted Close Price for VSMPX Tomorrow): 253.36472382634923


## PREDICTING TOMORROW'S STOCK PRICE WITH SENTIMENT SCORE AS AN ADDITIONAL FEATURE

In [24]:
import pandas as pd

# Define paths for the directories.
# Paths relative to the notebook's working directory match the default output
# directories of save_data_to_csv and save_sentiment_to_csv, and keep the
# notebook runnable on other machines.
historical_data_directory = './stock_data/'
sentiment_data_directory = './sentiment_data/'

# Date stamp embedded in the filenames written by the download cells
snapshot_date = '2024-08-25'

# Stocks modelled with the sentiment feature
stock_symbols = ['OXY', 'XOM', 'VRTX', 'NVDA', 'GOOG', 'AMZN']

# Load historical price data: symbol -> '<symbol>_<date>.csv'
historical_data_files = {symbol: f'{symbol}_{snapshot_date}.csv' for symbol in stock_symbols}

historical_data = {symbol: pd.read_csv(f'{historical_data_directory}{file}') for symbol, file in historical_data_files.items()}

# Load sentiment data: symbol -> '<symbol>_sentiment_<date>.csv'
sentiment_data_files = {symbol: f'{symbol}_sentiment_{snapshot_date}.csv' for symbol in stock_symbols}

sentiment_data = {symbol: pd.read_csv(f'{sentiment_data_directory}{file}') for symbol, file in sentiment_data_files.items()}


In [25]:
# Preprocess and merge data
for symbol in historical_data.keys():
    prices = historical_data[symbol]
    news = sentiment_data[symbol]

    # Convert date columns to datetime in historical data
    prices['Date'] = pd.to_datetime(prices['Date'])

    # Convert time_published (e.g. '20240823T153000') to datetime and truncate it
    # to midnight so it lines up with the daily price rows
    news['time_published'] = pd.to_datetime(news['time_published'], format='%Y%m%dT%H%M%S')
    news['Date'] = news['time_published'].dt.normalize()

    # Aggregate sentiment scores by date (mean over all articles published that day)
    daily_sentiment = news.groupby('Date')['sentiment_score'].mean().reset_index()

    # Merge historical data with sentiment data.
    # Dropping any 'sentiment_score' column left by a previous run keeps the cell
    # re-runnable; otherwise the merge would produce sentiment_score_x/_y columns
    # and the lookup below would raise a KeyError. The left join keeps only dates
    # that have a price row.
    merged = pd.merge(
        prices.drop(columns=['sentiment_score'], errors='ignore'),
        daily_sentiment,
        on='Date',
        how='left',
    )

    # Fill missing sentiment scores with 0 (neutral sentiment).
    # Assign the result back instead of using inplace=True on a selected column,
    # which is not guaranteed to update the parent frame.
    merged['sentiment_score'] = merged['sentiment_score'].fillna(0)

    # Calculate daily return as a feature (first row has no previous close -> 0)
    merged['Daily Return'] = merged['Close'].pct_change().fillna(0)

    historical_data[symbol] = merged

    # Display the first few rows to check the merged data
    print(f"Data for {symbol}:")
    print(historical_data[symbol].head())

Data for OXY:
        Date       Open       High        Low      Close  Adj Close   Volume  \
0 2010-01-04  79.193359  80.007706  78.752647  79.614906  52.235336  3630352   
1 2010-01-05  79.691551  79.739449  78.589775  79.040070  51.858200  3515432   
2 2010-01-06  78.944260  80.352608  78.398163  79.988548  52.480473  5578661   
3 2010-01-07  79.471191  80.122681  78.915520  79.394547  52.090763  4300559   
4 2010-01-08  79.059227  80.266388  78.110748  80.208900  52.625050  4336465   

   sentiment_score  Daily Return  
0              0.0      0.000000  
1              0.0     -0.007220  
2              0.0      0.012000  
3              0.0     -0.007426  
4              0.0      0.010257  
Data for XOM:
        Date       Open       High        Low      Close  Adj Close    Volume  \
0 2010-01-04  68.720001  69.260002  68.190002  69.150002  39.594810  27809100   
1 2010-01-05  69.190002  69.449997  68.800003  69.419998  39.749409  30174700   
2 2010-01-06  69.449997  70.599998  69

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Train a linear regression model for each stock
# `models` is rebound here, replacing the dict of mutual-fund models built in the
# earlier training cell; after this cell it maps stock symbol -> fitted model.
models = {}
for symbol, df in historical_data.items():
    # Select features and target variable
    # NOTE(review): the features include the same day's Open/High/Low, which bracket
    # the same day's Close - confirm this is the intended setup for a next-day forecast.
    X = df[['Open', 'High', 'Low', 'Volume', 'Daily Return', 'sentiment_score']]
    y = df['Close']
    
    # Split the data into training and testing sets
    # NOTE(review): train_test_split shuffles rows by default, so test rows are
    # presumably interleaved in time with training rows - confirm this is intended
    # for time-ordered data.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize and train the linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Store the trained model
    # (looked up by symbol in the prediction cell that follows)
    models[symbol] = model
    
    # Evaluate the model using Mean Squared Error (MSE)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    
    print(f"Training Mean Squared Error for {symbol}: {train_mse}")
    print(f"Testing Mean Squared Error for {symbol}: {test_mse}")


Training Mean Squared Error for OXY: 0.18628596414727697
Testing Mean Squared Error for OXY: 0.19804587444390256
Training Mean Squared Error for XOM: 0.14294784455959306
Testing Mean Squared Error for XOM: 0.1523775447467386
Training Mean Squared Error for VRTX: 1.7922628957951112
Testing Mean Squared Error for VRTX: 1.702316959851453
Training Mean Squared Error for NVDA: 0.0618193243777418
Testing Mean Squared Error for NVDA: 0.09687544844006499
Training Mean Squared Error for GOOG: 0.1756208264260129
Testing Mean Squared Error for GOOG: 0.16645953269055977
Training Mean Squared Error for AMZN: 0.3574798920448992
Testing Mean Squared Error for AMZN: 0.3277699841170581


In [27]:
import matplotlib.pyplot as plt
import numpy as np

# Predict tomorrow's price for each stock
# Column order must match the order the models were fitted with.
feature_columns = ['Open', 'High', 'Low', 'Volume', 'Daily Return', 'sentiment_score']

for symbol, df in historical_data.items():
    # Get the latest data point
    latest_data = df.iloc[-1]

    # Prepare features for tomorrow's prediction
    tomorrow_features = {
        'Open': latest_data['Close'],  # Assume Open is same as today's Close
        'High': latest_data['Close'],  # Assume High is same as today's Close
        'Low': latest_data['Close'],   # Assume Low is same as today's Close
        'Volume': latest_data['Volume'],  # Same volume
        'Daily Return': 0,  # Assume no change in price
        'sentiment_score': latest_data['sentiment_score']  # Use latest sentiment score
    }

    tomorrow_df = pd.DataFrame([tomorrow_features], columns=feature_columns)

    # Predict tomorrow's price
    tomorrow_price = models[symbol].predict(tomorrow_df)[0]

    print(f"\nPredicted Close Price for {symbol} tomorrow: {tomorrow_price}")

    # Model predictions for every historical row. Without this the
    # 'Predicted Close' column only holds the single forecast value and the
    # "Predicted Close (Train)" trace below has nothing to draw.
    # .assign returns a new frame, so historical_data[symbol] is left untouched.
    df = df.assign(**{'Predicted Close': models[symbol].predict(df[feature_columns])})

    # Append the prediction for tomorrow, dated on the next business day so a
    # Friday observation is followed by Monday rather than Saturday.
    tomorrow_row = pd.DataFrame({
        'Date': [latest_data['Date'] + pd.offsets.BDay(1)],
        'Close': [np.nan],
        'Predicted Close': [tomorrow_price]
    })

    df = pd.concat([df, tomorrow_row], ignore_index=True)

    # Create interactive plot using Plotly
    # (`go` is plotly.graph_objs, imported in the mutual-fund modelling cell)
    fig = go.Figure()

    # Actual Close Prices
    fig.add_trace(go.Scatter(x=df['Date'], y=df['Close'], mode='lines', name='Actual Close', line=dict(color='blue')))

    # Predicted Close Prices
    fig.add_trace(go.Scatter(x=df['Date'], y=df['Predicted Close'], mode='lines', name='Predicted Close (Train)', line=dict(color='green', dash='dash')))

    # Predicted Close Price for Tomorrow
    fig.add_trace(go.Scatter(x=[df.iloc[-1]['Date']], y=[df.iloc[-1]['Predicted Close']], mode='markers', name='Predicted Close (Tomorrow)', marker=dict(color='red', size=10)))

    # Update layout
    fig.update_layout(
        title=f'{symbol} Actual vs Predicted Close Prices',
        xaxis_title='Date',
        yaxis_title='Close Price',
        hovermode='x unified'
    )

    # Show interactive plot
    fig.show()



Predicted Close Price for OXY tomorrow: 57.26963492310479



Predicted Close Price for XOM tomorrow: 115.93917783961743



Predicted Close Price for VRTX tomorrow: 480.8513278670829



Predicted Close Price for NVDA tomorrow: 130.6124596768802



Predicted Close Price for GOOG tomorrow: 167.5137920418947



Predicted Close Price for AMZN tomorrow: 176.50272820344475


In [30]:
# Inspect the most recent merged rows for one stock.
# NOTE(review): `symbol` is not set in this cell; it is whatever value the last
# executed loop left behind (the recorded output shows AMZN), so the result depends
# on which cell ran before this one.
print(f"Data for {symbol}:")
historical_data[symbol].tail()

Data for AMZN:


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,sentiment_score,Daily Return
3680,2024-08-19,177.639999,178.300003,176.160004,178.220001,178.220001,31129800,0.0,0.006551
3681,2024-08-20,177.919998,179.009995,177.429993,178.880005,178.880005,26255200,0.0,0.003703
3682,2024-08-21,179.919998,182.389999,178.889999,180.110001,180.110001,35599100,0.221132,0.006876
3683,2024-08-22,181.380005,181.470001,175.679993,176.130005,176.130005,32047500,0.181826,-0.022098
3684,2024-08-23,177.339996,178.970001,175.240005,177.039993,177.039993,29095000,0.205642,0.005167
