In [9]:
import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import glob

from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import make_scorer, mean_squared_error
from math import sqrt


import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

In [10]:
# Definir random state e métrica
seed = 1
def rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))

scorer = make_scorer(rmse, greater_is_better=False)
cv = TimeSeriesSplit(n_splits=5)


In [11]:
# Novo conjunto de algoritmos
algorithms = {
    'Decision Tree': GridSearchCV(
        Pipeline([('scaler', RobustScaler()), ('tree', DecisionTreeRegressor(random_state=seed))]),
        param_grid={
            'tree__max_depth': [10, 20, 30],
            'tree__criterion': ['squared_error', 'friedman_mse'],
        },
        scoring=scorer,
        cv=cv,
    ),
    'KNN': GridSearchCV(
        Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsRegressor())]),
        param_grid={
            'knn__n_neighbors': [3, 5, 7],
            'knn__weights': ['uniform', 'distance'],
        },
        scoring=scorer,
        cv=cv,
    ),    
    'MLP': GridSearchCV(
        Pipeline([('scaler', StandardScaler()), ('mlp', MLPRegressor(random_state=seed, max_iter=500))]),
        param_grid={
            'mlp__hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'mlp__activation': ['relu', 'tanh', 'logistic'],
        },
        scoring=scorer,
        cv=cv,
    ),
    'Random Forest': GridSearchCV(
        Pipeline([
            ('scaler', StandardScaler()),
            ('rf', RandomForestRegressor(random_state=seed))]),
        param_grid={
            'rf__n_estimators': [50, 100],
            'rf__max_depth': [10, 20],
        },
        scoring=scorer,
        cv=cv,
    ),
    'AdaBoost': GridSearchCV(
        Pipeline([('scaler', StandardScaler()), ('ada', AdaBoostRegressor(random_state=seed))]),
        param_grid={
            'ada__n_estimators': [50, 100, 200],
            'ada__learning_rate': [0.01, 0.1, 1],
        },
        scoring=scorer,
        cv=cv,
    ), 
    'Gradient Boosting': GridSearchCV(
        Pipeline([('scaler', StandardScaler()), ('gb', GradientBoostingRegressor(random_state=seed))]),
        param_grid={
            'gb__n_estimators': [50, 100, 200],
            'gb__learning_rate': [0.01, 0.1, 0.2],
            'gb__max_depth': [3, 5, 7],
        },
        scoring=scorer,
        cv=cv,
    )

}

In [12]:
# Use a wildcard pattern to match any CSV file in the yahoo_data subfolder
file_path_pattern = 'yahoo_data/yahoo_data*.csv'

# Find all files matching the pattern
file_list = glob.glob(file_path_pattern)

# Check if any files matched
if file_list:
    # Get the first matching file (if there are multiple)
    file_path = file_list[-1]
    
    # Read the CSV file
    data = pd.read_csv(file_path)
    
    # Display the first few rows of the data
    print(data.head(),'\n\n',data.tail())

else:
    print("No files found matching the pattern.")


   stock        date   Open   High    Low  Close    Volume   Adjusted
0  ABEV3  2022-01-03  15.42  15.54  15.15  15.33  23833600  13.839315
1  ABEV3  2022-01-04  15.32  15.40  15.04  15.31  16357700  13.821261
2  ABEV3  2022-01-05  15.25  15.29  14.96  15.01  24652000  13.550432
3  ABEV3  2022-01-06  15.04  15.06  14.66  14.77  20406300  13.333770
4  ABEV3  2022-01-07  14.80  14.80  14.45  14.53  19673600  13.117107 

        stock        date  Open  High   Low  Close  Volume  Adjusted
54435  WIZC3  2024-11-12  5.85  5.87  5.70   5.77  468800      5.77
54436  WIZC3  2024-11-13  5.80  5.80  5.66   5.77  525300      5.77
54437  WIZC3  2024-11-14  5.77  5.85  5.71   5.79  411300      5.79
54438  WIZC3  2024-11-18  5.80  5.89  5.75   5.89  563300      5.89
54439  WIZC3  2024-11-19  5.89  5.99  5.84   5.91  398600      5.91


In [13]:
days = 5

In [14]:
data = data.drop(columns=['Open', 'High', 'Low', 'Close'])
data['Future'] = data['Adjusted'].shift(-days)
data = data.dropna()

print(data.head(),'\n\n',data.tail())

tickers = data['stock'].unique()
print(tickers)


   stock        date    Volume   Adjusted     Future
0  ABEV3  2022-01-03  23833600  13.839315  13.225439
1  ABEV3  2022-01-04  16357700  13.821261  13.252522
2  ABEV3  2022-01-05  24652000  13.550432  13.433074
3  ABEV3  2022-01-06  20406300  13.333770  13.532376
4  ABEV3  2022-01-07  19673600  13.117107  13.631681 

        stock        date  Volume  Adjusted  Future
54430  WIZC3  2024-11-05  295400      5.85    5.77
54431  WIZC3  2024-11-06  331600      5.89    5.77
54432  WIZC3  2024-11-07  406800      5.85    5.79
54433  WIZC3  2024-11-08  947500      5.84    5.89
54434  WIZC3  2024-11-11  346600      5.85    5.91
['ABEV3' 'AGRO3' 'ALOS3' 'AMER3' 'ARML3' 'AURE3' 'B3SA3' 'BLAU3' 'CMIG3'
 'CMIG4' 'CPFE3' 'CPLE3' 'CPLE6' 'CSAN3' 'CSMG3' 'CURY3' 'CYRE3' 'DIRR3'
 'ELET3' 'ELET6' 'ELMD3' 'ENEV3' 'ENGI11' 'EVEN3' 'EZTC3' 'FESA4' 'FIQE3'
 'GFSA3' 'GRND3' 'HBRE3' 'HYPE3' 'IGTI11' 'INTB3' 'ITSA3' 'ITSA4' 'JHSF3'
 'KEPL3' 'LAVV3' 'LEVE3' 'LOGG3' 'MDNE3' 'MELK3' 'MILS3' 'MULT3' 'ODPV3'
 'ORVR

In [15]:
# Dictionary to store train and test sets for each ticker
ticker_train_test_data = {}

# Df to store predictions
predictions = pd.DataFrame(columns=['date', 'stock', 'Adjusted', 'Prediction', 'Model'])
predictions

Unnamed: 0,date,stock,Adjusted,Prediction,Model


In [16]:
# Define the test period (last 30 rows)
test_rows = 30
results = []
for ticker in data['stock'].unique():
    ticker_data = data[data['stock'] == ticker]
    print(ticker)

    # Check if there are enough rows for a 30-row test set
    if len(ticker_data) > test_rows:
        # Split into train and test
        train_data = ticker_data.iloc[:-test_rows]
        test_data = ticker_data.iloc[-test_rows:]
    else:
        # If not enough data, use all data for training and leave test empty
        train_data = ticker_data
        test_data = pd.DataFrame()  # Empty DataFrame for test set
        print("There is NO data enough for train and test")
    
    # Store train and test sets in the dictionary
    ticker_train_test_data[ticker] = {
        'train': train_data,
        'test': test_data
    }

    print(train_data.head())

    # Executar cross-validation e coletar as melhores métricas
    results_ticker = []

    for name, model in algorithms.items():
        model.fit(train_data[['Adjusted', 'Volume']], train_data['Future'])
        y_pred = model.predict(test_data[['Adjusted', 'Volume']])
        rmse_value = rmse(test_data['Future'], y_pred)
        results_ticker.append((name, rmse_value, ticker))
        
        #save predictions
        predictions = pd.concat([predictions, pd.DataFrame({'stock': ticker, 'date': test_data['date'], 'Adjusted': test_data['Adjusted'] , 'Prediction': y_pred, 'Model': name})])
        print(predictions.tail())
    
    #print(test_data.head())
    results_ticker = pd.DataFrame(results_ticker, columns=['Algorithm', 'RMSE', 'Ticker'])
    print(results_ticker)

    results.append(results_ticker)

ABEV3
   stock        date    Volume   Adjusted     Future
0  ABEV3  2022-01-03  23833600  13.839315  13.225439
1  ABEV3  2022-01-04  16357700  13.821261  13.252522
2  ABEV3  2022-01-05  24652000  13.550432  13.433074
3  ABEV3  2022-01-06  20406300  13.333770  13.532376
4  ABEV3  2022-01-07  19673600  13.117107  13.631681
           date  stock  Adjusted  Prediction          Model
718  2024-11-12  ABEV3     12.58   12.428966  Decision Tree
719  2024-11-13  ABEV3     12.73   12.842442  Decision Tree
720  2024-11-14  ABEV3     12.64   13.295069  Decision Tree
721  2024-11-18  ABEV3     12.42   12.416639  Decision Tree
722  2024-11-19  ABEV3     12.43   12.819196  Decision Tree


  predictions = pd.concat([predictions, pd.DataFrame({'stock': ticker, 'date': test_data['date'], 'Adjusted': test_data['Adjusted'] , 'Prediction': y_pred, 'Model': name})])


           date  stock  Adjusted  Prediction Model
718  2024-11-12  ABEV3     12.58   12.410465   KNN
719  2024-11-13  ABEV3     12.73   12.795246   KNN
720  2024-11-14  ABEV3     12.64   12.526181   KNN
721  2024-11-18  ABEV3     12.42   12.425765   KNN
722  2024-11-19  ABEV3     12.43   12.509098   KNN
           date  stock  Adjusted  Prediction Model
718  2024-11-12  ABEV3     12.58   12.630307   MLP
719  2024-11-13  ABEV3     12.73   12.759354   MLP
720  2024-11-14  ABEV3     12.64   12.664443   MLP
721  2024-11-18  ABEV3     12.42   12.467499   MLP
722  2024-11-19  ABEV3     12.43   12.492000   MLP
           date  stock  Adjusted  Prediction          Model
718  2024-11-12  ABEV3     12.58   12.451218  Random Forest
719  2024-11-13  ABEV3     12.73   12.806292  Random Forest
720  2024-11-14  ABEV3     12.64   12.903756  Random Forest
721  2024-11-18  ABEV3     12.42   12.555392  Random Forest
722  2024-11-19  ABEV3     12.43   12.631475  Random Forest
           date  stock  Adju

In [17]:

print(results)
print('\n\n',predictions)

[           Algorithm      RMSE Ticker
0      Decision Tree  2.923042  ABEV3
1                KNN  2.988892  ABEV3
2                MLP  2.951994  ABEV3
3      Random Forest  2.934138  ABEV3
4           AdaBoost  2.947377  ABEV3
5  Gradient Boosting  2.941046  ABEV3,            Algorithm      RMSE Ticker
0      Decision Tree  1.292476  AGRO3
1                KNN  1.223127  AGRO3
2                MLP  1.145003  AGRO3
3      Random Forest  1.232401  AGRO3
4           AdaBoost  1.207395  AGRO3
5  Gradient Boosting  1.199997  AGRO3,            Algorithm         RMSE Ticker
0      Decision Tree  1158.702838  ALOS3
1                KNN  1158.650746  ALOS3
2                MLP  1158.462376  ALOS3
3      Random Forest  1158.651560  ALOS3
4           AdaBoost  1158.643244  ALOS3
5  Gradient Boosting  1158.631585  ALOS3,            Algorithm       RMSE Ticker
0      Decision Tree  11.828714  AMER3
1                KNN  27.261078  AMER3
2                MLP  31.037031  AMER3
3      Random Forest 

In [None]:
# Concatenate all results into a single DataFrame
final_results = pd.concat(results, ignore_index=True)

# Save the results to a CSV file
final_results.to_csv(path_or_buf='models_results/model_results.csv', index=False)
predictions.to_csv(path_or_buf='models_results/predictions.csv', index=False)

print("Results saved to model_results.csv, and predictions.csv")


Results saved to model_results.csv, and predictions.csv


"\n# Treinar e prever o resultado para cada ticker\nfor ticker in ticker_train_test_data:\n    train_data = ticker_train_test_data[ticker]['train']\n    test_data = ticker_train_test_data[ticker]['test']\n    \n    if not test_data.empty:\n        # Treinar o melhor modelo\n        best_model = algorithms[final_results.loc[final_results['RMSE'].idxmin(), 'Algorithm']]\n        best_model.fit(train_data[['log_return']], train_data['result'])\n\n        # Prever o resultado\n        y_pred = best_model.predict(test_data[['log_return']])\n        test_data['predicted_result'] = y_pred\n\n        # Plotar o resultado\n        plt.figure(figsize=(10, 5))\n        plt.plot(test_data['Date'], test_data['result'], label='Real')\n        plt.plot(test_data['Date'], test_data['predicted_result'], label='Predicted')\n        plt.title(f'{ticker} - Result Prediction')\n        plt.legend()\n\n        plt.show()\n\n        # Salvar o modelo\n        model_name = f'{ticker}_model.pkl'\n        jobli