In [None]:
#Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yfinance as yf
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed

In [None]:
#Collect historical stock data for Tesla from 2018 to 2023
def get_stock_data(ticker, start_date, end_date):
    """Download historical price data through ``yf.download``.

    Parameters
    ----------
    ticker
        Passed to ``yf.download`` as its first positional argument.
    start_date, end_date
        Forwarded to ``yf.download`` as ``start`` and ``end``.

    Returns
    -------
    The frame produced by ``yf.download``. If its columns have more than one
    level and the last level holds a single distinct value (the layout recent
    yfinance releases use for a single ticker), that level is dropped so that
    ``frame['Close']`` yields a Series rather than a one-column DataFrame.
    Frames with flat columns, or with several distinct values in the last
    level, are returned unchanged.
    """
    stock_data = yf.download(ticker, start=start_date, end=end_date)
    columns = stock_data.columns
    # The rest of the notebook indexes columns by flat names such as 'Close',
    # so collapse a redundant single-valued trailing column level.
    if columns.nlevels > 1 and columns.get_level_values(-1).nunique() == 1:
        stock_data = stock_data.droplevel(-1, axis=1)
    return stock_data

# Fetch the TSLA price history used by every later cell.
data = get_stock_data(
    ticker='TSLA',
    start_date='2018-01-01',
    end_date='2023-01-01',
)

In [None]:
#Data Preprocessing
# Forward-fill gaps with the dedicated method: fillna(method='ffill') is
# deprecated in pandas >= 2.1.
data.ffill(inplace=True)

# NOTE(review): scaled_data is not consumed by any cell visible here, and the
# scaler is fitted on the full history (train and test rows alike). Refit it on
# the training split only before feeding scaled values to a model.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data[['Close']])

#Creating lagged features
# Previous one and two closes become predictors for the current close.
data['Close_Lag1'] = data['Close'].shift(1)
data['Close_Lag2'] = data['Close'].shift(2)
# The first two rows have no lag values; drop them.
data.dropna(inplace=True)

In [None]:
#Feature Engineering
# The target below is the same-day close, so rolling statistics must be built
# from closes up to the previous day only. Rolling directly over 'Close' would
# include the value being predicted in its own features (target leakage).
data['Moving_Avg_5'] = data['Close'].shift(1).rolling(window=5).mean()
data['Volatility'] = data['Close'].shift(1).rolling(window=5).std()

#Removing NA rows after feature engineering
# The shifted 5-row window leaves the leading rows without a value.
data.dropna(inplace=True)

#Model Selection and training Models
# Predictors are the two lagged closes plus the rolling mean and std; the
# target is the current close.
X = data[['Close_Lag1', 'Close_Lag2', 'Moving_Avg_5', 'Volatility']]
y = data['Close']

In [None]:
#Splitting the data into training and testing sets
# shuffle=False keeps chronological order: the last 20% of rows form the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

# Fix every source of randomness so Restart & Run All reproduces the metrics.
SEED = 42
set_random_seed(SEED)  # seeds Python's random, NumPy and TensorFlow

#Applying linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

#Using Random Forest
rf = RandomForestRegressor(random_state=SEED)
rf.fit(X_train, y_train)

#Using LSTM Model
# Each row is presented to the LSTM as a sequence of length n_features with one
# value per step.
# NOTE(review): the LSTM is trained on unscaled prices; scaling inputs and
# target (fitted on the training split) would need matching changes in the
# evaluation cell, so it is left as-is here.
lstm_model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    LSTM(50),
    Dense(1)
])
X_train_lstm = X_train.values.reshape(-1, X_train.shape[1], 1)
X_test_lstm = X_test.values.reshape(-1, X_test.shape[1], 1)
lstm_model.compile(optimizer='adam', loss='mean_squared_error')
lstm_model.fit(X_train_lstm, y_train, epochs=10, batch_size=32, verbose=1)

In [None]:
#Evaluation Metrics
def evaluate_model(model, X_test, y_test, is_lstm=False):
    """Score a fitted model on held-out data.

    Parameters
    ----------
    model
        Object exposing ``predict``.
    X_test
        Test features. With ``is_lstm=True`` this may be a DataFrame, a 2-D
        array or an already reshaped 3-D array; it is converted to an ndarray
        and reshaped to ``(-1, X_test.shape[1], 1)`` before prediction.
    y_test
        True target values.
    is_lstm : bool, default False
        Whether to apply the 3-D reshape before calling ``model.predict``.

    Returns
    -------
    tuple
        ``(rmse, mae)``: the square root of ``mean_squared_error`` and the
        ``mean_absolute_error`` between ``y_test`` and the predictions.
    """
    if is_lstm:
        # DataFrames have no .reshape; go through ndarray so both frames and
        # pre-shaped arrays are accepted.
        features = np.asarray(X_test)
        preds = model.predict(features.reshape(-1, features.shape[1], 1))
    else:
        preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    return rmse, mae

# Held-out RMSE/MAE per fitted model; the LSTM receives the 3-D test array.
rmse_lin, mae_lin = evaluate_model(model=lin_reg, X_test=X_test, y_test=y_test)
rmse_rf, mae_rf = evaluate_model(model=rf, X_test=X_test, y_test=y_test)
rmse_lstm, mae_lstm = evaluate_model(
    model=lstm_model, X_test=X_test_lstm, y_test=y_test, is_lstm=True
)

# One summary line per model, in the order they were trained.
for model_label, model_rmse, model_mae in (
    ("Linear Regression", rmse_lin, mae_lin),
    ("Random Forest", rmse_rf, mae_rf),
    ("LSTM", rmse_lstm, mae_lstm),
):
    print(model_label + " - RMSE:", model_rmse, "MAE:", model_mae)

In [None]:
#Validation using Cross-Validation
# Five expanding-window folds; scores come back as negated MSE.
tscv = TimeSeriesSplit(n_splits=5)
cv_scores_rf = cross_val_score(
    estimator=rf,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=tscv,
)
print("Random Forest Cross-Validation RMSE:", np.sqrt(-np.mean(cv_scores_rf)))

#Interpretation
#Feature importance for Random Forest
# Importances are read from the forest fitted on the training split and paired
# with the feature columns in order.
feature_importances = rf.feature_importances_
for feature, importance in zip(X.columns, feature_importances):
    print(f'Feature: {feature}, Importance: {importance}')

In [None]:
#Comparing the model predictions with actual market data
# The x-axis uses the trailing len(y_test) index entries of the feature frame.
test_dates = data.index[-len(y_test):]

fig, ax = plt.subplots()
ax.plot(test_dates, y_test, label='Actual Price')
ax.plot(test_dates, rf.predict(X_test), label='RF Predictions')
ax.set_xlabel("Date")
ax.set_ylabel("Stock Price")
ax.set_title("Tesla Stock Price Prediction vs Actual")
ax.legend()
plt.show()