# TRAIN MODEL 

In [1]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, BaggingRegressor
import pickle

In [None]:
# Open the MongoDB connection and take handles on the two source collections
# that the training loop reads from.
client = MongoClient('LINK')
db = client.get_database('intelliinvest')
fundamentals_collection = db.get_collection('STOCK_FUNDAMENTALS')
signals_collection = db.get_collection('STOCK_SIGNALS_COMPONENTS_10')


In [None]:
# Keep only the securityIds that are present in BOTH collections; a security
# missing from either side cannot be merged into a training frame.
fundamentals_security_ids = fundamentals_collection.distinct('securityId')
signals_security_ids = signals_collection.distinct('securityId')
all_security_ids = set(fundamentals_security_ids) & set(signals_security_ids)

In [None]:
# Train, per securityId, three bagged regressors (Ridge, decision tree,
# gradient boosting) with a small grid search each, and pickle the best
# estimator of every family under models/<prefix>_model_<securityId>.pkl.
import os

# Feature columns fed to the regressors (assumed numeric -- TODO confirm
# against the collection schema). 'closePrice' is the regression target and
# 'signalDate' is only used for ordering and for the hold-out split.
feature_columns = [
    'alMarketCap', 'stdDevOfReturn', 'TRn', 'ADXn', 'high10Day', 'low10Day',
    'stochastic10Day', 'range10Day', 'percentKFlow', 'percentDFlow',
    'upperBound', 'lowerBound', 'bandwidth', 'movingAverage_5',
    'movingAverage_10', 'movingAverage_15', 'movingAverage_25',
    'movingAverage_50',
]
selected_columns = ['signalDate', 'closePrice', 'securityId'] + feature_columns

# scikit-learn renamed BaggingRegressor's `base_estimator` argument to
# `estimator` (deprecated in 1.2, removed in 1.4). Use whichever name the
# installed version exposes so the constructor call and the nested
# "<name>__<param>" grid keys stay in agreement.
if 'estimator' in BaggingRegressor().get_params(deep=False):
    base_kw = 'estimator'
else:
    base_kw = 'base_estimator'

# Grid over the bagging wrapper itself; shared by all three model families.
bagging_grid = {
    'n_estimators': [10, 50],
    'max_samples': [0.8, 1.0],
}

# (file prefix, wrapped base model, grid over the base model's own params).
# GridSearchCV clones the estimator it is given, so these instances can be
# reused safely across securities.
model_specs = [
    ('lr', Ridge(), {}),
    ('dt', DecisionTreeRegressor(), {'max_depth': [10, 20]}),
    ('gb', GradientBoostingRegressor(), {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1],
    }),
]

# Make sure the output directory exists up front, so a missing folder does
# not surface as a FileNotFoundError only after the grid searches have run.
os.makedirs('models', exist_ok=True)

for security_id in all_security_ids:
    query = {"securityId": security_id}
    fundamentals_df = pd.DataFrame(list(fundamentals_collection.find(query)))
    signals_df = pd.DataFrame(list(signals_collection.find(query)))

    # Fundamentals store the date as 'todayDate'; align it with the signals
    # collection so both frames can be joined on (securityId, signalDate).
    fundamentals_df = fundamentals_df.rename(columns={'todayDate': 'signalDate'})
    fundamentals_df['signalDate'] = pd.to_datetime(fundamentals_df['signalDate'])
    signals_df['signalDate'] = pd.to_datetime(signals_df['signalDate'])

    merged_data = pd.merge(fundamentals_df, signals_df, on=['securityId', 'signalDate'], how='inner')

    # .copy() gives an independent frame, so the assignments below do not
    # write into a slice of merged_data (chained-assignment warnings).
    df = merged_data[selected_columns].copy()
    df = df.set_index('securityId')

    # Truncate timestamps to midnight so dates compare at day granularity.
    df['signalDate'] = df['signalDate'].dt.normalize()
    df = df.sort_values(by='signalDate')

    # Hold out the most recent date; everything strictly before it is used
    # for training.
    train_df = df[df['signalDate'] < df['signalDate'].max()]

    if len(train_df) < 3:
        print(f"Not enough data for securityId: {security_id}, skipping.")
        continue

    # Only the numeric feature columns go into the model. The string-valued
    # market-cap / volatility category labels are deliberately not built
    # here: scikit-learn regressors cannot fit on text columns.
    X_train = train_df[feature_columns]
    y_train = train_df['closePrice']

    for prefix, base_model, base_grid in model_specs:
        # Prefix the base model's params so GridSearchCV routes them through
        # the bagging wrapper to the wrapped estimator.
        param_grid = {f'{base_kw}__{name}': values for name, values in base_grid.items()}
        param_grid.update(bagging_grid)

        search = GridSearchCV(
            estimator=BaggingRegressor(**{base_kw: base_model}, random_state=42),
            param_grid=param_grid,
            cv=3,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
        )
        search.fit(X_train, y_train)

        # Each model is written as soon as its search finishes.
        with open(f'models/{prefix}_model_{security_id}.pkl', 'wb') as f:
            pickle.dump(search.best_estimator_, f)

    print(f"Models trained and saved for securityId: {security_id}")


# TEST MODEL

In [None]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
import pickle
from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error

In [None]:
# Open the MongoDB connection: two source collections to read from and one
# destination collection that receives the prediction documents.
client = MongoClient('LINK')
db = client.get_database('intelliinvest')
fundamentals_collection = db.get_collection('STOCK_FUNDAMENTALS')
signals_collection = db.get_collection('STOCK_SIGNALS_COMPONENTS_10')
predictions_collection = db.get_collection('STOCK_PREDICTIONS')

In [None]:
# Root-mean-squared-error helper
def calculate_rmse(true_values, predictions):
    """Return the square root of the mean squared error of the predictions.

    Both arguments are passed straight to ``mean_squared_error``.
    """
    mse = mean_squared_error(true_values, predictions)
    return np.sqrt(mse)

In [None]:
# Absolute percentage error helper
def calculate_percentage_error(true_values, predictions):
    """Return ``|true - predicted| / |true|`` expressed as a percentage.

    The arithmetic is applied as-is to the arguments, so scalars yield a
    scalar and array-like numeric objects yield an element-wise result.
    """
    relative_error = (true_values - predictions) / true_values
    return np.abs(relative_error) * 100

In [None]:
# Keep only the securityIds that are present in BOTH collections; a security
# missing from either side cannot be merged into a scoring frame.
fundamentals_security_ids = fundamentals_collection.distinct('securityId')
signals_security_ids = signals_collection.distinct('securityId')
all_security_ids = set(fundamentals_security_ids) & set(signals_security_ids)

In [None]:
# For every securityId: load its three pickled models, predict a close price
# from the most recent feature row, and store one prediction document per
# forecast horizon in the predictions collection.

# Feature columns passed to the models (assumed numeric -- TODO confirm
# against the collection schema). The pickled models are expected to have
# been fitted on exactly these columns, in this order.
feature_columns = [
    'alMarketCap', 'stdDevOfReturn', 'TRn', 'ADXn', 'high10Day', 'low10Day',
    'stochastic10Day', 'range10Day', 'percentKFlow', 'percentDFlow',
    'upperBound', 'lowerBound', 'bandwidth', 'movingAverage_5',
    'movingAverage_10', 'movingAverage_15', 'movingAverage_25',
    'movingAverage_50',
]
selected_columns = ['signalDate', 'closePrice', 'securityId'] + feature_columns

# Forecast horizons, in calendar days added to the latest signalDate.
periods = {
    'daily': 1,
    'weekly': 7,
    'monthly': 30,
    'quarterly': 90
}

for security_id in all_security_ids:
    query = {"securityId": security_id}
    fundamentals_df = pd.DataFrame(list(fundamentals_collection.find(query)))
    signals_df = pd.DataFrame(list(signals_collection.find(query)))

    # Fundamentals store the date as 'todayDate'; align it with the signals
    # collection so both frames can be joined on (securityId, signalDate).
    fundamentals_df = fundamentals_df.rename(columns={'todayDate': 'signalDate'})
    fundamentals_df['signalDate'] = pd.to_datetime(fundamentals_df['signalDate'])
    signals_df['signalDate'] = pd.to_datetime(signals_df['signalDate'])

    merged_data = pd.merge(fundamentals_df, signals_df, on=['securityId', 'signalDate'], how='inner')

    # .copy() gives an independent frame, so the assignments below do not
    # write into a slice of merged_data (chained-assignment warnings).
    df = merged_data[selected_columns].copy()
    df = df.set_index('securityId')

    # Truncate timestamps to midnight so dates compare at day granularity.
    df['signalDate'] = df['signalDate'].dt.normalize()
    df = df.sort_values(by='signalDate')

    # Guard before the median split: on an empty frame the median is NaN and
    # pd.cut would raise on the resulting bin edges.
    if len(df) < 3:
        print(f"Not enough data for securityId: {security_id}, skipping.")
        continue

    # Median split of market cap and return volatility within this security.
    # These labels are stored with the prediction for reporting only; they
    # are strings and therefore never passed to the models.
    df['cap_category'] = pd.cut(df['alMarketCap'], bins=[-np.inf, df['alMarketCap'].median(), np.inf], labels=['Low Midcap', 'High Midcap'])
    df['std_category'] = pd.cut(df['stdDevOfReturn'], bins=[-np.inf, df['stdDevOfReturn'].median(), np.inf], labels=['Low STD', 'High STD'])
    df['category'] = df['cap_category'].astype(str) + ', ' + df['std_category'].astype(str)

    # SECURITY NOTE: pickle.load executes arbitrary code embedded in the
    # file. Only load model files produced by a trusted training run.
    try:
        with open(f'models/lr_model_{security_id}.pkl', 'rb') as f:
            lr_model = pickle.load(f)
        with open(f'models/dt_model_{security_id}.pkl', 'rb') as f:
            dt_model = pickle.load(f)
        with open(f'models/gb_model_{security_id}.pkl', 'rb') as f:
            gb_model = pickle.load(f)
    except FileNotFoundError:
        print(f"Model files not found for securityId: {security_id}, skipping.")
        continue

    last_date = pd.to_datetime(df['signalDate'].max())

    # The model input is the latest feature row and does not include the
    # horizon, so the predictions are the same for every period; compute
    # them once instead of once per period.
    latest_features = df.iloc[-1:][feature_columns]
    lr_prediction = lr_model.predict(latest_features)[0]
    dt_prediction = dt_model.predict(latest_features)[0]
    gb_prediction = gb_model.predict(latest_features)[0]

    # NOTE(review): 'Actual_Price' is the most recent known close, not the
    # close on the forecast date, so the percentage errors measure the
    # deviation from the last observed price.
    last_close = df['closePrice'].iloc[-1]
    lr_error = calculate_percentage_error(last_close, lr_prediction)
    dt_error = calculate_percentage_error(last_close, dt_prediction)
    gb_error = calculate_percentage_error(last_close, gb_prediction)
    data_category = df['category'].iloc[-1]

    for period, days in periods.items():
        date = last_date + timedelta(days=days)

        # Convert numpy scalars / pandas Timestamp to plain Python types so
        # the document is encodable regardless of the column dtypes.
        prediction_doc = {
            'securityId': security_id,
            'signalDate': date.to_pydatetime(),
            'period': period,
            'Bagging_LR_Prediction': float(lr_prediction),
            'Bagging_DT_Prediction': float(dt_prediction),
            'Bagging_GB_Prediction': float(gb_prediction),
            'Actual_Price': float(last_close),
            'Bagging_LR_Percentage_Error': float(lr_error),
            'Bagging_DT_Percentage_Error': float(dt_error),
            'Bagging_GB_Percentage_Error': float(gb_error),
            'Data_Category': data_category
        }
        predictions_collection.insert_one(prediction_doc)
        print(f"Predictions stored for securityId: {security_id}, date: {date}, period: {period}")
