In [5]:

import requests
import csv

# Your Alpha Vantage API key
api_key = '5SRJ25KFE2K282K0'

# Function to fetch data for a given stock symbol
def fetch_stock_data(symbol):
    url = 'https://www.alphavantage.co/query'
    params = {
        'function': 'TIME_SERIES_DAILY',
        'symbol': symbol,
        'apikey': api_key
    }
    
    response = requests.get(url, params=params)
    data = response.json()
    
    if "Time Series (Daily)" in data:
        return data['Time Series (Daily)']
    else:
        print(f"Error fetching data for {symbol}: {data.get('Note', 'No data found')}")
        return None

# List of stock symbols
stock_symbols = ['AAPL', 'TSLA', 'JPM', 'PFE', 'XOM']

# Open a new CSV file for writing
with open('stock_data.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    
    # Write the header
    writer.writerow(['Symbol', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume'])
    
    # Fetch data for each symbol and write to CSV
    for symbol in stock_symbols:
        stock_data = fetch_stock_data(symbol)
        if stock_data:
            for date, daily_data in stock_data.items():
                writer.writerow([
                    symbol,
                    date,
                    daily_data['1. open'],
                    daily_data['2. high'],
                    daily_data['3. low'],
                    daily_data['4. close'],
                    daily_data['5. volume']
                ])

print("Stock data written to CSV file successfully.")


Stock data written to CSV file successfully.


In [6]:
import requests
import csv

# Your News API key
api_key = 'a9dd58fced9045d18b1516c1fb48b6cc'

# The endpoint for everything (general search across news sources)
url = 'https://newsapi.org/v2/everything'

# Parameters for the request
params = {
    'q': 'AAPL OR TSLA OR JPM OR PFE OR XOM',  # Keywords or phrases to search for
    'sortBy': 'relevancy',  # Sort results by relevancy, popularity, or publishedAt
    'language': 'en',  # Language of the articles
    'apiKey': api_key
}

# Make the request
response = requests.get(url, params=params)
data = response.json()

# Check for successful response
if response.status_code == 200:
    # Extract articles
    articles = data['articles']
    
    # Define the CSV file headers
    csv_headers = ['Title', 'Source', 'URL', 'Published At', 'Description']
    
    # Open a new CSV file for writing
    with open('news_articles.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        
        # Write the header
        writer.writerow(csv_headers)
        
        # Write the article data
        for article in articles:
            writer.writerow([
                article['title'],
                article['source']['name'],
                article['url'],
                article['publishedAt'],
                article['description']
            ])
    
    print("New data written to CSV file successfully.")
else:
    print(f"Error: {data['message']}")


New data written to CSV file successfully.


In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the CSV file into a DataFrame
df_stock = pd.read_csv('stock_data.csv')

# Display the first few rows to ensure data is loaded correctly
print(df_stock.head())

# Fill missing values with the forward fill method
df_stock.ffill(inplace=True)

# Convert 'Date' to datetime and set it as index
df_stock['Date'] = pd.to_datetime(df_stock['Date'])
df_stock.set_index('Date', inplace=True)

# Define the window size for the moving average (20 days)
window_size = 20

# Adding previous days' prices as features
df_stock['Close_Lag1'] = df_stock['Close'].shift(1)
df_stock['Close_Lag2'] = df_stock['Close'].shift(2)

# Calculate daily return
df_stock['Daily_Return'] = df_stock['Close'].pct_change() * 100  # Multiply by 100 for percentage

# Adding lagged returns as features
df_stock['Return_Lag1'] = df_stock['Daily_Return'].shift(1)
df_stock['Return_Lag2'] = df_stock['Daily_Return'].shift(2)

# Calculate the moving average
df_stock['Moving_Avg'] = df_stock['Close'].rolling(window=window_size).mean()

# Calculate Rate of Change
df_stock['ROC'] = df_stock['Close'].pct_change(periods=12) * 100

# Extracting meaningful features for modeling stock prices
df_stock['SMA_20'] = df_stock['Close'].rolling(window=window_size).mean()
df_stock['EMA_20'] = df_stock['Close'].ewm(span=20, adjust=False).mean()

def calculate_rsi(df, period=14):
    delta = df['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

# Measure Relative Strength Index (RSI)
df_stock['RSI'] = calculate_rsi(df_stock)

# Calculate 12-day and 26-day EMA
df_stock['EMA_12'] = df_stock['Close'].ewm(span=12, adjust=False).mean()
df_stock['EMA_26'] = df_stock['Close'].ewm(span=26, adjust=False).mean()

# Measure Moving Average Convergence Divergence (MACD)
df_stock['MACD'] = df_stock['EMA_12'] - df_stock['EMA_26']
df_stock['MACD_Signal'] = df_stock['MACD'].ewm(span=9, adjust=False).mean()

# Measure Volatility and Volume_Mean
df_stock['Volatility'] = df_stock['Close'].rolling(window=window_size).std()
df_stock['Volume_Mean'] = df_stock['Volume'].rolling(window=window_size).mean()

# Drop rows with NaN values introduced by shifting
df_stock.dropna(inplace=True)

# Reset index to make 'Date' a column again before normalization
df_stock.reset_index(inplace=True)

# Columns to normalize
columns_to_normalize = ['Open', 'High', 'Low', 'Close', 'Volume']

# Ensure the columns to normalize exist
columns_to_normalize = [col for col in columns_to_normalize if col in df_stock.columns]

if not columns_to_normalize:
    print("No columns to normalize.")
else:
    # Initialize the StandardScaler
    scaler = StandardScaler()

    # Fit and transform the data
    df_stock[columns_to_normalize] = scaler.fit_transform(df_stock[columns_to_normalize])

    # Print the normalized DataFrame
    print(df_stock.head())

# Save the cleaned data to a new CSV file
df_stock.to_csv('cleaned_stock_article.csv', index=False)


  Symbol        Date    Open      High     Low   Close    Volume
0   AAPL  2024-08-01  224.30  224.4505  217.02  218.36  55541885
1   AAPL  2024-07-31  221.44  223.8200  220.63  222.08  50036262
2   AAPL  2024-07-30  219.19  220.3250  216.12  218.80  41643840
3   AAPL  2024-07-29  216.96  219.3000  215.75  218.24  36311778
4   AAPL  2024-07-26  218.70  219.4900  216.01  217.96  41601345
        Date Symbol      Open      High       Low     Close    Volume  \
0 2024-07-05   AAPL  1.187415  1.209815  1.228027  1.253106  0.377330   
1 2024-07-03   AAPL  1.162919  1.138307  1.188594  1.182101 -0.190375   
2 2024-07-02   AAPL  1.105761  1.121233  1.129445  1.163127  0.319033   
3 2024-07-01   AAPL  1.045485  1.079349  1.081584  1.110947  0.377097   
4 2024-06-28   AAPL  1.100119  1.058335  1.057202  1.020078  0.922559   

   Close_Lag1  Close_Lag2  Daily_Return  ...       ROC    SMA_20      EMA_20  \
0      227.82      228.68     -0.649636  ...  0.591085  225.0480  225.968841   
1      226.

In [8]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
# Download stopwords and punkt if not already available
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Load the CSV file into a DataFrame (replace 'news_article.csv' with your actual file)
df_text = pd.read_csv('news_articles.csv')
# Assuming the columns with text data are named 'Title' and 'Description', replace with actual column names
text_columns = ['Title', 'Description']

# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    # Check if the text is a string (to avoid errors with NaN or other types)
    if isinstance(text, str):
        # Convert to lowercase
        text = text.lower()
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Tokenize the text
        words = word_tokenize(text)
        # Remove stopwords and lemmatize the words
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        # Join the words back into a single string
        return ' '.join(words)
    else:
        return ''

# Apply the preprocessing function to each specified text column
for column in text_columns:
    df_text[f'Cleaned_{column}'] = df_text[column].apply(preprocess_text)
df_text.drop(columns=["URL", "Source"], inplace=True)
# Display the cleaned text columns
print(df_text[[f'Cleaned_{col}' for col in text_columns]].head())

# Save the cleaned data to a new CSV file
df_text.to_csv('cleaned_news_article.csv', index=False)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akinahomwabella/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akinahomwabella/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/akinahomwabella/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                       Cleaned_Title  \
0  exxonmobil xom reveals hammerhead project deve...   
1       tesla robotaxi wont ready scale 2030 analyst   
2          tesla building robotaxi there hidden cost   
3  cathie wood say wouldnt sold nvidia stake know...   
4       td cowen raise aapl target 250 ai china sale   

                                 Cleaned_Description  
0  exxonmobils xom development plan hammerhead in...  
1  tesla tsla share trading higher friday despite...  
2  tesla tsla report latest quarterly report tues...  
3  cathie wood investment fund ark invest sold le...  
4  investment analyst firm td cowen predicts appl...  


In [9]:


# Import necessary libraries
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import spacy

# Load the CSV files with cleaned data
df_cleaned_text = pd.read_csv('cleaned_news_article.csv')
df_cleaned_stock = pd.read_csv('cleaned_stock_article.csv')

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to calculate sentiment scores
def sentiment_analysis(text):
    if isinstance(text, str):
        return analyzer.polarity_scores(text)['compound']
    else:
        return None

# Apply sentiment analysis to text data
df_cleaned_text['Title_Sentiment_Score'] = df_cleaned_text['Cleaned_Title'].apply(sentiment_analysis)
df_cleaned_text['Description_Sentiment_Score'] = df_cleaned_text['Cleaned_Description'].apply(sentiment_analysis)

# Convert 'Published At' column to datetime format
df_cleaned_text['Date'] = pd.to_datetime(df_cleaned_text['Published At'], errors='coerce')


# Calculate daily average sentiment scores
daily_sentiment = df_cleaned_text.groupby(df_cleaned_text['Date'].dt.date)['Title_Sentiment_Score'].mean()
daily_sentiment_df = daily_sentiment.reset_index()
daily_sentiment_df.columns = ['Date', 'Daily_Sentiment_Score']

# Convert 'Date' in daily_sentiment_df to datetime format for merging
daily_sentiment_df['Date'] = pd.to_datetime(daily_sentiment_df['Date'])

# Merge daily sentiment scores with stock data
df_cleaned_stock['Date'] = pd.to_datetime(df_cleaned_stock['Date'], errors='coerce')
df_cleaned_stock = pd.merge(df_cleaned_stock, daily_sentiment_df, on='Date', how='left')

# Aggregate sentiment scores by week
df_cleaned_text['Week'] = df_cleaned_text['Date'].dt.to_period('W').apply(lambda r: r.start_time)
weekly_sentiment = df_cleaned_text.groupby('Week')['Title_Sentiment_Score'].mean()
weekly_sentiment_df = weekly_sentiment.reset_index()
weekly_sentiment_df.columns = ['Week', 'Weekly_Sentiment_Score']

# Convert 'Week' in weekly_sentiment_df to datetime format for merging
weekly_sentiment_df['Week'] = pd.to_datetime(weekly_sentiment_df['Week'])

# Merge weekly sentiment scores with stock data
df_cleaned_stock['Week'] = df_cleaned_stock['Date'].dt.to_period('W').apply(lambda r: r.start_time)
df_cleaned_stock = pd.merge(df_cleaned_stock, weekly_sentiment_df, on='Week', how='left')

# Load the Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define a function to get sentence embeddings using Sentence-BERT
def get_sentence_embedding(sentence):
    if isinstance(sentence, str):
        return model.encode(sentence)
    else:
        return np.zeros(model.get_sentence_embedding_dimension())


# Compute Word2Vec embeddings for each text
df_cleaned_text['Word2Vec_Embedding_Title'] = df_cleaned_text['Cleaned_Title'].apply(lambda x: get_sentence_embedding(x))
df_cleaned_text['Word2Vec_Embedding_Description'] = df_cleaned_text['Cleaned_Description'].apply(lambda x: get_sentence_embedding(x))

# Drop the original text columns
df_cleaned_text.drop(['Cleaned_Title', 'Cleaned_Description'], axis=1, inplace=True)
# Print the resulting DataFrame with sentiment scores and embeddings
print(df_cleaned_text[['Title_Sentiment_Score', 'Description_Sentiment_Score', 'Word2Vec_Embedding_Title','Word2Vec_Embedding_Description']].head())

# Save the DataFrame with sentiment scores and embeddings to a new CSV file
df_cleaned_text.to_csv('news_article_with_sentiment.csv', index=False)


  from tqdm.autonotebook import tqdm, trange
  df_cleaned_text['Week'] = df_cleaned_text['Date'].dt.to_period('W').apply(lambda r: r.start_time)


   Title_Sentiment_Score  Description_Sentiment_Score  \
0                 0.0000                      -0.3818   
1                -0.2755                      -0.0258   
2                 0.0000                       0.0000   
3                 0.5719                       0.2263   
4                 0.0000                       0.7650   

                            Word2Vec_Embedding_Title  \
0  [-0.023931492, -0.019036919, 0.07065436, -0.02...   
1  [-0.011779225, -0.031012023, -0.09955558, -0.0...   
2  [-0.062481813, 0.034244537, -0.031933334, 0.00...   
3  [0.026826339, -0.021405887, -0.026352998, -0.0...   
4  [-0.07889189, -0.034725867, 0.009962358, -0.06...   

                      Word2Vec_Embedding_Description  
0  [-0.018905027, 0.016774863, 0.024963025, -0.01...  
1  [-0.050417252, -0.05723135, -0.00022364523, 0....  
2  [-0.022078425, -0.018386573, -0.018221067, 0.0...  
3  [0.096326, -0.015384293, -0.072952285, 0.04721...  
4  [-0.044582628, -0.017881077, 0.0332282, -0

In [10]:
import pandas as pd

# Read the CSV files into DataFrames
df1 = pd.read_csv('cleaned_stock_article.csv')
df2 = pd.read_csv('news_article_with_sentiment.csv')

df2.drop(columns=["Title", "Description","Published At"], inplace=True)
# Convert 'Date' columns to datetime format for consistency
df1['Date'] = pd.to_datetime(df1['Date'], errors='coerce')
df2['Date'] = pd.to_datetime(df2['Date'], errors='coerce')

# Ensure both 'Date' columns are naive datetime (without timezone)
df2['Date'] = df2['Date'].dt.tz_localize(None)
df2['Year'] = df2['Date'].dt.year
df2['Month'] = df2['Date'].dt.month
df2['Day'] = df2['Date'].dt.day
concat_df = pd.concat([df1, df2], axis=1)

# Step 2: Verify the merge
print("Merged DataFrame head:")
print(concat_df.head())
print("Merged DataFrame info:")
print(concat_df.info())
# Save the merged DataFrame to a new CSV file
concat_df.to_csv('merged.csv', index=False)

Merged DataFrame head:
        Date Symbol      Open      High       Low     Close    Volume  \
0 2024-07-05   AAPL  1.187415  1.209815  1.228027  1.253106  0.377330   
1 2024-07-03   AAPL  1.162919  1.138307  1.188594  1.182101 -0.190375   
2 2024-07-02   AAPL  1.105761  1.121233  1.129445  1.163127  0.319033   
3 2024-07-01   AAPL  1.045485  1.079349  1.081584  1.110947  0.377097   
4 2024-06-28   AAPL  1.100119  1.058335  1.057202  1.020078  0.922559   

   Close_Lag1  Close_Lag2  Daily_Return  ...  Volume_Mean  \
0      227.82      228.68     -0.649636  ...  52645817.70   
1      226.34      227.82     -2.116285  ...  51737213.50   
2      221.55      226.34     -0.577748  ...  52137709.30   
3      220.27      221.55     -1.598039  ...  53075663.75   
4      216.75      220.27     -2.828143  ...  55387210.75   

   Title_Sentiment_Score  Description_Sentiment_Score                Date  \
0                 0.0000                      -0.3818 2024-07-16 13:11:00   
1                

In [11]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

# Load your dataset
df = pd.read_csv('merged.csv')

# Convert 'Date' to datetime and extract year, month, and day
df['Date'] = pd.to_datetime(df['Date'])  # Convert to datetime
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df.drop(columns=['Date'], inplace=True)  # Drop the original Date column
# One-hot encode the 'Symbol' column
df = pd.get_dummies(df, columns=['Symbol'])
# Shift the 'Close' prices to create 'Future_Close'
df['Future_Close'] = df['Close'].shift(-1)
# Drop the last row which will have NaN value in 'Future_Close'
df.dropna(inplace=True)

# Define target and features
y = df['Future_Close']
X = df.drop(['Future_Close'], axis=1)
# Ensure all feature columns are numeric
X = X.apply(pd.to_numeric, errors='coerce')
X.dropna(inplace=True, axis=1)  # Drop columns with NaN values after conversion

# Initialize the models
linear_model = LinearRegression()
decision_tree_model = DecisionTreeRegressor()

# Perform RFE with Linear Regression
rfe_linear = RFE(estimator=linear_model, n_features_to_select=10)
X_rfe_linear = rfe_linear.fit_transform(X, y)
ranking_linear = rfe_linear.ranking_

# Print feature rankings and selected features for Linear Regression
print("Feature Rankings (Linear Regression):", ranking_linear)
print("Selected Features (Linear Regression):", X.columns[rfe_linear.support_])

# Perform RFE with Decision Tree
rfe_tree = RFE(estimator=decision_tree_model, n_features_to_select=10)
X_rfe_tree = rfe_tree.fit_transform(X, y)
ranking_tree = rfe_tree.ranking_

# Print feature rankings and selected features for Decision Tree
print("Feature Rankings (Decision Tree):", ranking_tree)
print("Selected Features (Decision Tree):", X.columns[rfe_tree.support_])

# Split the data into training and test sets for both models
X_train_linear, X_test_linear, y_train_linear, y_test_linear = train_test_split(X_rfe_linear, y, test_size=0.2, random_state=42)
X_train_tree, X_test_tree, y_train_tree, y_test_tree = train_test_split(X_rfe_tree, y, test_size=0.2, random_state=42)

# Train the models on the selected features
linear_model.fit(X_train_linear, y_train_linear)
decision_tree_model.fit(X_train_tree, y_train_tree)

# Evaluate the models
linear_score = linear_model.score(X_test_linear, y_test_linear)
decision_tree_score = decision_tree_model.score(X_test_tree, y_test_tree)

print("Linear Regression Model Accuracy:", linear_score)
print("Decision Tree Model Accuracy:", decision_tree_score)


Feature Rankings (Linear Regression): [ 1  1  1  3  8 16 15  9 10 14  1 11  1  2 13  1  1  1  4 12 18  5  7 19
  6 17  1 21 20  1 22]
Selected Features (Linear Regression): Index(['Open', 'High', 'Low', 'Moving_Avg', 'SMA_20', 'EMA_12', 'EMA_26',
       'MACD', 'Symbol_AAPL', 'Symbol_TSLA'],
      dtype='object')
Feature Rankings (Decision Tree): [ 1  6  1  1  1  1 12  3  7  1  8 10  1  4  5  1 13  2  1  1 14 17  9 16
 15 11 18 19 20 21 22]
Selected Features (Decision Tree): Index(['Open', 'Low', 'Close', 'Volume', 'Close_Lag1', 'Return_Lag2', 'SMA_20',
       'EMA_12', 'MACD_Signal', 'Volatility'],
      dtype='object')
Linear Regression Model Accuracy: 0.9841223040294053
Decision Tree Model Accuracy: 0.5977344432569907


In [12]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import joblib

# Load your dataset
df = pd.read_csv('merged.csv')

# Preprocessing
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df.drop(columns=['Date'], inplace=True)
df = pd.get_dummies(df, columns=['Symbol'])
df['Future_Close'] = df['Close'].shift(-1)
df.dropna(inplace=True)

# Define target and features
y = df['Future_Close']
X = df.drop(['Future_Close'], axis=1)

# Label encoding for categorical columns
label_encoders = {}
for column in ['Date.1', 'Week']:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

# Define embedding dimension
embedding_dim = 32  

# Flatten embeddings
def flatten_embeddings(df, column_name, num_features):
    # Check the type of embeddings and convert accordingly
    def parse_embedding(embedding):
        if isinstance(embedding, str):
            try:
                return np.fromstring(embedding.strip('[]'), sep=' ')
            except:
                return np.array([])
        else:
            return np.array(embedding)
    
    embeddings = df[column_name].apply(parse_embedding)
    for i in range(num_features):
        df[f'{column_name}_feature_{i}'] = embeddings.apply(lambda x: x[i] if len(x) > i else np.nan)

# Apply the flatten_embeddings function
flatten_embeddings(df, 'Word2Vec_Embedding_Description', num_features=embedding_dim)

# Remove original embedding columns
df = df.drop(columns=['Word2Vec_Embedding_Title', 'Word2Vec_Embedding_Description'])

# Ensure all columns are numeric
X = df.drop(['Future_Close'], axis=1)
X = X.apply(pd.to_numeric, errors='coerce')

# Check data types
print(X.dtypes)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', eval_metric='rmse')

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Feature importance
importances = xgb_model.feature_importances_
feature_names = X.columns
importances_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
print(importances_df.sort_values(by='Importance', ascending=False))

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

grid_search = GridSearchCV(estimator=xgb.XGBRegressor(objective='reg:squarederror'),
                           param_grid=param_grid,
                           scoring='neg_mean_squared_error',
                           cv=3,
                           verbose=1)

grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {-grid_search.best_score_}")

# Save the model
joblib.dump(xgb_model, 'xgb_model.pkl')

# Load the model (example of how to load it later)
loaded_model = joblib.load('xgb_model.pkl')

# Make predictions with the loaded model
y_pred_loaded = loaded_model.predict(X_test)
mse_loaded = mean_squared_error(y_test, y_pred_loaded)
print(f"Mean Squared Error with loaded model: {mse_loaded}")


Open                                         float64
High                                         float64
Low                                          float64
Close                                        float64
Volume                                       float64
                                              ...   
Word2Vec_Embedding_Description_feature_27    float64
Word2Vec_Embedding_Description_feature_28    float64
Word2Vec_Embedding_Description_feature_29    float64
Word2Vec_Embedding_Description_feature_30    float64
Word2Vec_Embedding_Description_feature_31    float64
Length: 65, dtype: object
Mean Squared Error: 0.026234448537510606
                                     Feature  Importance
17                                      MACD    0.381805
2                                        Low    0.305199
0                                       Open    0.161827
36  Word2Vec_Embedding_Description_feature_3    0.084479
5                                 Close_Lag1    0.032280
..      

In [13]:
%pip install keras
%pip install tensorflow

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder  # Import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.metrics import mean_squared_error
import tensorflow as tf

# Load and preprocess your dataset
df = pd.read_csv('merged.csv')

# Convert Date to datetime and create new features
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df.drop(columns=['Date'], inplace=True)
df['Future_Close'] = df['Close'].shift(-1)
df = pd.get_dummies(df, columns=['Symbol'])
df.dropna(inplace=True)

# Label encoding for categorical columns
label_encoders = {}
for column in ['Date.1', 'Week']:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

# Define embedding dimension
embedding_dim = 32  

# Flatten embeddings
def flatten_embeddings(df, column_name, num_features):
    def parse_embedding(embedding):
        if isinstance(embedding, str):
            try:
                return np.fromstring(embedding.strip('[]'), sep=' ')
            except:
                return np.array([])
        else:
            return np.array(embedding)
    
    embeddings = df[column_name].apply(parse_embedding)
    for i in range(num_features):
        df[f'{column_name}_feature_{i}'] = embeddings.apply(lambda x: x[i] if len(x) > i else np.nan)

# Apply the flatten_embeddings function
flatten_embeddings(df, 'Word2Vec_Embedding_Description', num_features=embedding_dim)

# Remove original embedding columns
df = df.drop(columns=['Word2Vec_Embedding_Title', 'Word2Vec_Embedding_Description'])

# Ensure all columns are numeric
X = df.drop(['Future_Close'], axis=1)
X = X.apply(pd.to_numeric, errors='coerce')

# Create lag features for LSTM input
def create_lagged_features(df, lag=1):
    for i in range(1, lag+1):
        df[f'Close_Lag{i}'] = df['Close'].shift(i)
    return df

# Add lag features
df = create_lagged_features(df, lag=5)
df.dropna(inplace=True)

# Define target and features
y = df['Future_Close']
X = df.drop(['Future_Close'], axis=1)

# Convert to NumPy arrays and ensure consistent dtype
X = X.values.astype(np.float32)
y = y.values.astype(np.float32)

# Reshape data for LSTM
timesteps = 5  # Number of timesteps for LSTM
num_features = X.shape[1]

def create_lstm_dataset(X, y, timesteps):
    X_lstm, y_lstm = [], []
    for i in range(len(X) - timesteps):
        X_lstm.append(X[i:i+timesteps])
        y_lstm.append(y[i+timesteps])
    return np.array(X_lstm), np.array(y_lstm)

X_lstm, y_lstm = create_lstm_dataset(X, y, timesteps)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_lstm, y_lstm, test_size=0.2, random_state=42)

# Define LSTM model
def build_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=input_shape))
    model.add(LSTM(50))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Initialize and train the model
input_shape = (timesteps, num_features)
lstm_model = build_lstm_model(input_shape)
lstm_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Make predictions
y_pred = lstm_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"LSTM Mean Squared Error: {mse}")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Epoch 1/10


  super().__init__(**kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 188ms/step - loss: 0.6662 - val_loss: 0.4831
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 0.2857 - val_loss: 0.2247
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.1693 - val_loss: 0.1823
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.2270 - val_loss: 0.2042
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 0.2639 - val_loss: 0.1962
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.2457 - val_loss: 0.1814
Epoch 7/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.1916 - val_loss: 0.1855
Epoch 8/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.1714 - val_loss: 0.2127
Epoch 9/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1