In [33]:
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import date


In [34]:
# Download daily NVIDIA (NVDA) price history from Yahoo Finance via yfinance.
# The range runs from START up to the date this cell is executed.
TICKER = 'NVDA'
START = "2015-01-01"
TODAY = date.today().strftime("%Y-%m-%d")

df = yf.download(TICKER, start=START, end=TODAY)

[*********************100%***********************]  1 of 1 completed


In [35]:
# Peek at the five most recent rows to confirm the download reached the latest session.
df.tail(5)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-08-28,128.119995,128.330002,122.639999,125.610001,125.610001,448101100
2024-08-29,121.360001,124.43,116.709999,117.589996,117.589996,453023300
2024-08-30,119.529999,121.75,117.220001,119.370003,119.370003,333751600
2024-09-03,116.010002,116.209999,107.290001,108.0,108.0,474040800
2024-09-04,105.410004,113.269997,104.120003,106.209999,106.209999,370960700


In [36]:
# DataFrame.dropna() returns a NEW frame; it does not modify `df` in place.
# Assign the result back, otherwise the cleaning step is silently a no-op.
df = df.dropna()

# Sanity check: every column should now report zero missing values.
print(df.isnull().sum())

Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


In [37]:
# Simple moving averages of the closing price.
# Each SMA<n> column is NaN until n closes are available (rolling default min_periods).
for sma_days in (200, 50):
    df[f'SMA{sma_days}'] = df['Close'].rolling(window=sma_days).mean()


In [38]:
# Bollinger Bands: 20-day rolling mean of the close, plus/minus two rolling
# standard deviations. The rolling window is built once and reused for both bands.
bb_rolling = df['Close'].rolling(window=20)
bb_middle = bb_rolling.mean()
bb_offset = 2 * bb_rolling.std()

df['BB_upper'] = bb_middle + bb_offset
df['BB_lower'] = bb_middle - bb_offset


In [39]:
# Relative Strength Index (RSI), using a simple rolling mean of gains and losses.

# Day-over-day change in the close; the first row has no predecessor, so it is NaN.
delta = df['Close'].diff()

# Separate gains and losses.
# clip() keeps NaN as NaN, whereas `delta.where(delta > 0, 0)` would turn the
# undefined first-row change into a fabricated 0 and let it count as a real
# observation in the first rolling window.
gain = delta.clip(lower=0)    # positive changes, 0 on down days
loss = -delta.clip(upper=0)   # magnitude of negative changes, 0 on up days

# Rolling average of gains and losses over `window` sessions.
# With the NaN preserved, the first RSI value appears only once `window`
# real price changes exist.
window = 14
avg_gain = gain.rolling(window=window).mean()
avg_loss = loss.rolling(window=window).mean()

# Relative Strength (RS). If avg_loss is 0 and avg_gain > 0, rs is inf and the
# RSI below evaluates to 100; if both are 0 the result is NaN.
rs = avg_gain / avg_loss

# RSI = 100 - 100 / (1 + RS)
df['RSI'] = 100 - (100 / (1 + rs))

In [40]:
def _ema(series, span):
    """Return the exponential moving average of `series` for the given span (adjust=False)."""
    return series.ewm(span=span, adjust=False).mean()


# Fast (12-period) and slow (26-period) EMAs of the closing price
df['EMA12'] = _ema(df['Close'], 12)
df['EMA26'] = _ema(df['Close'], 26)

# MACD line: fast EMA minus slow EMA
df['MACD'] = df['EMA12'] - df['EMA26']

# Signal line: 9-period EMA of the MACD line
df['Signal_Line'] = _ema(df['MACD'], 9)

In [41]:
# Simple daily return: (close_t - close_{t-1}) / close_{t-1}.
# The first row has no previous close, so its return is NaN.
prev_close = df['Close'].shift(1)
df['returns'] = (df['Close'] - prev_close) / prev_close

In [44]:
# DataFrame.drop() returns a new frame, so the result must be assigned back for
# the column to actually be removed from `df`.
# errors='ignore' keeps the cell safe to re-run and tolerant of downloads that
# do not contain an 'Adj Close' column.
df = df.drop(columns=['Adj Close'], errors='ignore')
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,SMA200,SMA50,BB_upper,BB_lower,RSI,EMA12,EMA26,MACD,Signal_Line,returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2015-01-02,0.503250,0.507000,0.495250,0.503250,113680000,,,,,,0.503250,0.503250,0.000000,0.000000,
2015-01-05,0.503250,0.504750,0.492500,0.494750,197952000,,,,,,0.501942,0.502620,-0.000678,-0.000136,-0.016890
2015-01-06,0.495500,0.496000,0.479250,0.479750,197764000,,,,,,0.498528,0.500926,-0.002398,-0.000588,-0.030318
2015-01-07,0.483250,0.487500,0.477000,0.478500,321808000,,,,,,0.495447,0.499265,-0.003818,-0.001234,-0.002606
2015-01-08,0.484000,0.499500,0.483750,0.496500,283780000,,,,,,0.495609,0.499060,-0.003451,-0.001678,0.037618
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-28,128.119995,128.330002,122.639999,125.610001,448101100,87.280655,120.6572,138.833812,95.138188,71.818180,123.787604,120.465381,3.322223,1.980273,-0.020967
2024-08-29,121.360001,124.430000,116.709999,117.589996,453023300,87.626930,120.2974,138.944144,95.865855,61.651538,122.834126,120.252390,2.581736,2.100565,-0.063848
2024-08-30,119.529999,121.750000,117.220001,119.370003,333751600,87.980680,120.0692,139.023843,96.996157,59.836536,122.301184,120.187028,2.114156,2.103284,0.015137
2024-09-03,116.010002,116.209999,107.290001,108.000000,474040800,88.272400,119.6978,138.316367,98.458633,42.842071,120.101002,119.284285,0.816717,1.845970,-0.095250


In [53]:
# Split target and features
features = ['Open', 'High', 'Low', 'Close', 'Volume', 'SMA200', 'SMA50', 'RSI', 'MACD', 'Signal_Line', 'BB_upper', 'BB_lower', 'returns']

# Target: the NEXT session's close. Using the same-day 'Close' as the target
# while 'Close' is also a feature leaks the answer into X, so any model would
# trivially reproduce its input.
# NOTE(review): the one-day-ahead horizon is an assumption about the intended
# prediction task - confirm.
next_close = df['Close'].shift(-1)

# Keep only rows where every feature and the target are defined: the indicator
# warm-up period (e.g. the first 199 rows for SMA200) and the final row (no
# next-day close yet) contain NaN.
complete_rows = df[features].notna().all(axis=1) & next_close.notna()

X = df.loc[complete_rows, features]
y = next_close[complete_rows]



In [54]:
from sklearn.model_selection import train_test_split

# Chronological split: the first 70% of rows train the model, the last 30% test it.
# Shuffling a time series puts future observations in the training set, and the
# rolling-window features of neighbouring days overlap, so a shuffled split
# gives an over-optimistic test score. With shuffle=False no random_state is
# needed; the split is deterministic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)