In [112]:
!pip install yfinance fastai scikit-learn ta
import yfinance as yf
import pandas as pd
import numpy as np
from fastai.tabular.all import *
from ta.momentum import RSIIndicator
from ta.trend import SMAIndicator
from sklearn.metrics import accuracy_score

[33mDEPRECATION: Loading egg at /home/ahsinali/anaconda3/lib/python3.12/site-packages/v20-3.0.25.0-py3.12.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m




In [113]:
def download_stock_data(ticker, start_date, end_date):
    df = yf.download(ticker, start=start_date, end=end_date)
    df.reset_index(inplace=True)
    return df

In [114]:
def add_technical_indicators(df):
    # RSI (14 period)
    rsi = RSIIndicator(close=df['Close'].squeeze(), window=14)
    df['RSI'] = rsi.rsi()
    
    # SMAs
    sma10 = SMAIndicator(close=df['Close'].squeeze(), window=10)
    df['SMA_10'] = sma10.sma_indicator()
    
    sma50 = SMAIndicator(close=df['Close'].squeeze(), window=50)
    df['SMA_50'] = sma50.sma_indicator()
    
    # Create target - next day's price direction (1 if up, 0 if down)
    df['Target'] = (df['Close'].squeeze().shift(-1) > df['Close'].squeeze()).astype(int)
    df['Target'] = df['Target'].astype(int).astype(str)
    # Drop rows with NaN values (from indicator calculations)
    df.dropna(inplace=True)
    
    return df

In [115]:
ticker = 'AAPL'
start_date = '2020-01-01'
end_date = '2023-12-31'

raw_data = download_stock_data(ticker, start_date, end_date)


  df = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed


In [116]:
# raw_data = raw_data.reset_index(drop=True)

raw_data.columns = raw_data.columns.get_level_values(0)

# Remove any multi-index on rows
raw_data = raw_data.reset_index(drop=True)
raw_data.head()

Price,Date,Close,High,Low,Open,Volume
0,2020-01-02,72.620819,72.681266,71.373196,71.627069,135480400
1,2020-01-03,71.91481,72.676439,71.68995,71.84711,146322800
2,2020-01-06,72.487869,72.526556,70.783271,71.034732,118387200
3,2020-01-07,72.146927,72.753808,71.9269,72.497514,108872000
4,2020-01-08,73.30751,73.609745,71.849533,71.849533,132079200


In [117]:
data = add_technical_indicators(raw_data)

In [118]:
data.head()

Price,Date,Close,High,Low,Open,Volume,RSI,SMA_10,SMA_50,Target
49,2020-03-13,67.369148,67.841756,61.30527,64.199067,370732000,45.241601,68.490311,74.025906,0
50,2020-03-16,58.702312,62.790941,58.166693,58.639296,322423600,36.84704,67.118546,73.747536,1
51,2020-03-17,61.283451,62.434663,57.778907,59.986817,324056000,40.394225,66.234896,73.534909,0
52,2020-03-18,59.783241,60.590305,57.468691,58.11095,300233600,39.022306,64.875977,73.280816,0
53,2020-03-19,59.32518,61.278611,58.799257,59.957743,271857200,38.591328,63.709251,73.024382,0


In [119]:
def prepare_tabular_data(df):
    # Define features and target
    cont_names = ['Open', 'High', 'Low', 'Close', 'Volume', 'RSI', 'SMA_10', 'SMA_50']
    dep_var = 'Target'
    
    # Split data into train and validation sets (time-based split)
    split_idx = int(0.8 * len(df))
    train_df = df.iloc[:split_idx].copy()
    valid_df = df.iloc[split_idx:].copy()
    
    # TabularPandas for preprocessing
    procs = [Categorify, FillMissing, Normalize]
    to = TabularPandas(train_df, procs=procs, cat_names=[], 
                       cont_names=cont_names, y_names=dep_var,y_block=CategoryBlock(),
                       splits=RandomSplitter(valid_pct=0.2)(range_of(train_df)))
    
    return to, cont_names, dep_var

In [120]:
to, cont_names, dep_var = prepare_tabular_data(data)

In [121]:
raw_data.iloc[:10].copy()

Price,Date,Close,High,Low,Open,Volume,RSI,SMA_10,SMA_50,Target
49,2020-03-13,67.369148,67.841756,61.30527,64.199067,370732000,45.241601,68.490311,74.025906,0
50,2020-03-16,58.702312,62.790941,58.166693,58.639296,322423600,36.84704,67.118546,73.747536,1
51,2020-03-17,61.283451,62.434663,57.778907,59.986817,324056000,40.394225,66.234896,73.534909,0
52,2020-03-18,59.783241,60.590305,57.468691,58.11095,300233600,39.022306,64.875977,73.280816,0
53,2020-03-19,59.32518,61.278611,58.799257,59.957743,271857200,38.591328,63.709251,73.024382,0
54,2020-03-20,55.558884,61.033822,55.258354,59.906841,401693200,35.153485,62.260175,72.669409,0
55,2020-03-23,54.378578,55.379531,51.528412,55.27774,336752800,34.127444,61.247102,72.259688,1
56,2020-03-24,59.834122,60.030434,56.785218,57.284481,287531200,42.483765,60.314979,71.955693,0
57,2020-03-25,59.504528,62.589785,59.208847,60.772076,303602000,42.135997,59.590076,71.613045,1
58,2020-03-26,62.635818,62.693983,59.708095,59.746874,252087200,46.607712,59.837526,71.35372,0


In [122]:
dls = to.dataloaders(bs=64)

In [123]:
def train_model(dls, metrics=[accuracy]):
    learn = tabular_learner(dls, layers=[200, 100], metrics=metrics)
    learn.fit_one_cycle(10, 1e-2)
    return learn

learn = train_model(dls)

epoch,train_loss,valid_loss,accuracy,time
0,0.729987,0.717202,0.48366,00:00
1,0.746577,0.780278,0.464052,00:00
2,0.724027,0.88518,0.444444,00:00
3,0.71111,0.948991,0.437909,00:00
4,0.706953,0.789762,0.470588,00:00
5,0.697552,0.892721,0.437909,00:00
6,0.698952,0.728584,0.555556,00:00
7,0.696968,0.761221,0.477124,00:00
8,0.692524,0.824721,0.45098,00:00
9,0.693946,0.816147,0.457516,00:00


In [124]:
def backtest(model, df, cont_names, initial_capital=10000, commission=0.001):
    """
    Backtest the model over the last 6 months of data
    Returns accuracy and portfolio value over time
    """
    # Get the last 6 months of data
    test_df = df.iloc[-126:]  # ~6 months of trading days
    
    # Create test dataloader
    test_dl = model.dls.test_dl(test_df[cont_names + ['Target']])

    # Get predictions
    preds , _  = model.get_preds(dl=test_dl)
 
    predicted_directions = preds.argmax(dim=1).numpy()
    actual_directions = test_df['Target'].values.astype(int)
    # Calculate accuracy
    accuracy = accuracy_score(actual_directions, predicted_directions)
    
    # Simulate trading
    capital = initial_capital
    position = 0
    portfolio_values = []
    
    for i in range(len(test_df)):
        current_price = test_df.iloc[i]['Close']
        prediction = predicted_directions[i]
        
        # Close existing position
        if position != 0:
            capital += position * current_price * (1 - commission)
            position = 0
        
        # Open new position based on prediction
        if prediction:  # Predicts up
            position = capital / current_price
            capital = 0
        
        # Record portfolio value
        portfolio_value = capital + position * current_price if position != 0 else capital
        portfolio_values.append(portfolio_value)
    
    # Calculate returns
    returns = (portfolio_values[-1] - initial_capital) / initial_capital * 100
    
    return accuracy, returns, portfolio_values

# Run backtest
accuracy, returns, portfolio_values = backtest(learn, data, cont_names)
print(f"Backtest Accuracy: {accuracy:.2%}")
print(f"Backtest Returns: {returns:.2f}%")

Backtest Accuracy: 45.24%
Backtest Returns: 0.00%


In [125]:
model_path = 'stock_direction_model'
learn.export(model_path + '.pkl')

# Load model
loaded_learn = load_learner(model_path + '.pkl')

# Test loaded model
test_row = data.iloc[-1:][cont_names]
pred, _, probs = loaded_learn.predict(test_row.iloc[0])
print(f"Prediction: {pred}, Probability: {probs[0].item():.2%}")

If you only need to load model weights and optimizer state, use the safe `Learner.load` instead.
  warn("load_learner` uses Python's insecure pickle module, which can execute malicious arbitrary code when loading. Only load files you trust.\nIf you only need to load model weights and optimizer state, use the safe `Learner.load` instead.")


Prediction:        Open      High       Low     Close   Volume       RSI    SMA_10  \
0  2.270297  2.200969  2.269889  2.209561 -1.27614 -0.249253  2.293045   

     SMA_50  Target  
0  2.012524     0.0  , Probability: 96.16%


In [127]:
probs

tensor([0.9616, 0.0384])