In [1]:
import pandas as pd
from pathlib import Path
import hvplot.pandas

# Load the data
path = stockdata_csv_path = Path("Resources/NFLX.csv")
data = pd.read_csv(path)

# Convert 'Date' to datetime
data['Date'] = pd.to_datetime(data['Date'])

# Handle missing values
data = data.dropna()

In [2]:
# Show DataFrame
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2002-05-23,1.156429,1.242857,1.145714,1.196429,1.196429,104790000
1,2002-05-24,1.214286,1.225,1.197143,1.21,1.21,11104800
2,2002-05-28,1.213571,1.232143,1.157143,1.157143,1.157143,6609400
3,2002-05-29,1.164286,1.164286,1.085714,1.103571,1.103571,6757800
4,2002-05-30,1.107857,1.107857,1.071429,1.071429,1.071429,10154200


In [3]:
# create plot to visualize data

netflix_plot = data.hvplot.scatter(
    x="Date",
    y="Close",
    title="Netflix Stock Over the Years"
)
netflix_plot

In [4]:
# Create lagged features
data['Lag1_Close'] = data['Close'].shift(1)
data['Lag2_Close'] = data['Close'].shift(2)

# Create additional features (e.g., volatility, daily return)
data['Volatility'] = (data['High'] - data['Low']) / data['Open']
data['Daily_Return'] = data['Close'].pct_change()

# Create target variables
data['Target_1D'] = data['Close'].shift(-1)
data['Target_1M'] = data['Close'].shift(-21)
data['Target_1Y'] = data['Close'].shift(-252)

# Drop rows with NaN values created by shifting
data = data.dropna()


In [5]:
# Show DataFrame with changes
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Lag1_Close,Lag2_Close,Volatility,Daily_Return,Target_1D,Target_1M,Target_1Y
2,2002-05-28,1.213571,1.232143,1.157143,1.157143,1.157143,6609400,1.21,1.196429,0.061801,-0.043684,1.103571,0.988571,1.614286
3,2002-05-29,1.164286,1.164286,1.085714,1.103571,1.103571,6757800,1.157143,1.21,0.067485,-0.046297,1.071429,0.943571,1.628571
4,2002-05-30,1.107857,1.107857,1.071429,1.071429,1.071429,10154200,1.103571,1.157143,0.032881,-0.029125,1.076429,0.999286,1.607143
5,2002-05-31,1.078571,1.078571,1.071429,1.076429,1.076429,8464400,1.071429,1.103571,0.006622,0.004667,1.128571,1.027857,1.664286
6,2002-06-03,1.08,1.149286,1.076429,1.128571,1.128571,3151400,1.076429,1.071429,0.06746,0.04844,1.117857,1.172857,1.704286


In [6]:
from sklearn.model_selection import train_test_split

# Define features and targets
features = data[['Open', 'High', 'Low', 'Close', 'Volume', 'Lag1_Close', 'Lag2_Close', 'Volatility', 'Daily_Return']]
target_1m = data['Target_1M']

# Split the data

X_train_1m, X_test_1m, y_train_1m, y_test_1m = train_test_split(features, target_1m, test_size=0.2, random_state=42)



In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the model
model_1m = LinearRegression()

# Train the model
model_1m.fit(X_train_1m, y_train_1m)

In [8]:
# Make predictions
pred_1m = model_1m.predict(X_test_1m)

# Evaluate the Model
mse_1m = mean_squared_error(y_test_1m, pred_1m)
r2_1m = r2_score(y_test_1m, pred_1m)

print(f'1-Month Prediction - MSE: {mse_1m}, R2: {r2_1m}')

1-Month Prediction - MSE: 634.3279946111724, R2: 0.978289013385382
