In [None]:
from subprocess import run, PIPE
import sys

# List of modules to upgrade
modules = [
    'pandas_market_calendars',
    'plotly', 
    'numpy',
    'scikit-learn',
    'pandas',
    'matplotlib',
    'mplfinance'
]

# Construct the pip install command
command = f'pip install {" ".join(modules)} --upgrade --user --no-input'

# Run the command
try:
    proc = run(command, shell=True, text=True, stdout=PIPE, stderr=PIPE, timeout=120)

    # Check if the installation was successful
    if proc.returncode == 0:
        print("All modules upgraded successfully.")
    else:
        print("Error upgrading modules:")
        print(proc.stderr)
except Exception as e:
    print(f"An error occurred: {e}")
# Manual Step: Please restart the Kernel 

In [2]:
# Grade 1
import pandas as pd
import matplotlib as plt
import warnings
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
# Download and extract the CSV file from the ZIP
conversion_data_url = "https://people.arcada.fi/~parland/hjd5_8amp_Gt3/EURUSD1m.zip"
conversion_data_df = pd.read_csv(conversion_data_url, compression='zip')

# Coverting it to string
conversion_data_df['Date'] = conversion_data_df['Date'].astype(str)
conversion_data_df['Timestamp'] = conversion_data_df['Timestamp'].astype(str)

# combine them and convert them into a single DatetimeIndex.
conversion_data_df['Datetime'] = pd.to_datetime(conversion_data_df['Date'] + ' ' + conversion_data_df['Timestamp'])
# Drop the original 'Date' and 'Time' columns as not needed anymore
conversion_data_df.drop(columns=['Date', 'Timestamp'], inplace=True)

# Set the 'Datetime' column as the index
conversion_data_df.set_index('Datetime', inplace=True)

# adding th shift column by shifting the Close value one step forward
min_to_day_df = conversion_data_df.resample('D').agg({'Open': 'first','High' : 'max' , 'Low': 'min','Close': 'last', 'Volume': 'sum' })


# Apply forward fill to handle missing values for Saturdays
min_to_day_df['Open'] = min_to_day_df['Open'].ffill()
min_to_day_df['High'] = min_to_day_df['High'].ffill()
min_to_day_df['Low'] = min_to_day_df['Low'].ffill()
min_to_day_df['Close'] = min_to_day_df['Close'].ffill()
min_to_day_df['Volume'] = min_to_day_df['Volume'].ffill()

# copying the entire raw dataframe to a new dataframe for week ahead data calculation 
one_week_ahead_df = min_to_day_df.copy()

# Creating a 'Label' column for the forecast by shifting the Close value one step forward.
min_to_day_df['Label'] = min_to_day_df['Close'].shift(-1)
min_to_day_df.dropna(subset=['Label'], inplace=True)

In [3]:
# Defining a split function for splitting the data in training and test : 80 20
def split_data(data_df):
    total_df_length  = len(data_df)
    train_data_len = int(total_df_length*0.8)
    test_data_len = total_df_length - train_data_len
    train_data = data_df.iloc[: train_data_len]
    test_data = data_df.iloc[train_data_len :]
    print(f'{total_df_length} {train_data_len} {test_data_len}')
    return train_data, test_data

In [4]:
# Storing Actual Close and 1 day ahead Close data in separate dataframe for calculating Hit Ratio
HR_calc_df = min_to_day_df[['Close','Label']].copy()
# Drop rows where 'Label' is NaN (due to the shift)
HR_calc_df.dropna(subset=['Label'], inplace=True)
HR_calc_df.dropna(subset=['Close'], inplace=True)
one_day_train_df, one_day_test_df = split_data(HR_calc_df)

3651 2920 731


In [5]:
# Storing Actual Close and 7 days ahead Close data in separate dataframe for calculating Hit Ratio
HR_calc_week_df = one_week_ahead_df[['Close']].copy()
# Create a new column for Label, shifting the Close values 7 days ahead
HR_calc_week_df['Label'] = HR_calc_week_df['Close'].shift(-7)
HR_calc_week_df.dropna(subset=['Label'], inplace=True)
HR_calc_week_df.dropna(subset=['Close'], inplace=True)

In [6]:
# calculating Larry William’s %R
window_size = 14
min_to_day_df['highest_high'] = min_to_day_df['High'].rolling(window = window_size).max()
min_to_day_df['lowest_low'] = min_to_day_df['Low'].rolling(window = window_size).min()
min_to_day_df['%R'] = (min_to_day_df['highest_high']-min_to_day_df['Close'])/(min_to_day_df['highest_high']-min_to_day_df['lowest_low'])*-100
min_to_day_df.drop(columns=['highest_high', 'lowest_low'], inplace=True)
min_to_day_df = min_to_day_df.dropna(subset=['%R'])
min_to_day_df = min_to_day_df.dropna(subset=['Label'])

In [None]:
# Calling the Split function min_to_day_df (which has one day forecast label column) to split in training and test
train_data_df, test_data_df = split_data(min_to_day_df)

In [8]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Defining a function to standardize the splitted data : train and test & return the standardized train and test dataframe
# Note: this function will not standardize/transform the label column
def standardized_train_test_data(train_dataset, test_dataset):
    if '%R' in train_dataset.columns and '%R' in test_dataset.columns:
        # Separate features and labels for train
        train_feature_df = train_dataset[['Close', '%R']].copy()  
        test_feature_df = test_dataset[['Close', '%R']].copy()
       
    else:
        train_feature_df = train_dataset[['Close']].copy()
        test_feature_df = test_dataset[['Close']].copy()
    
    # Separate features and labels for test data
    test_label_df = test_dataset['Label']
    train_label_df = train_dataset['Label']

    # Instantiate the StandardScaler
    scaler = StandardScaler()

    # Fit the scaler on the training data and transform it
    train_scaled_array = scaler.fit_transform(train_feature_df)
    # Convert scaled numpy arrays back to DataFrames
    train_scaled_df = pd.DataFrame(train_scaled_array, columns=train_feature_df.columns, index=train_feature_df.index)
    
    # Transform the test data using the same scaler
    test_scaled_array = scaler.transform(test_feature_df)
    # Convert scaled numpy arrays back to DataFrames
    test_scaled_df = pd.DataFrame(test_scaled_array, columns=test_feature_df.columns, index=test_feature_df.index)
    
    # Reattach the labels to the scaled features as Label is the target column, so it's not standardized
    train_final_df = train_scaled_df.copy()
    train_final_df['Label'] = train_label_df

    test_final_df = test_scaled_df.copy()
    test_final_df['Label'] = test_label_df

    pd.set_option('display.float_format', '{:.6f}'.format)
    return train_final_df, test_final_df,scaler

# calling the function and saving the standardized train and test dataframes
train_final_df, test_final_df, scaling_param = standardized_train_test_data(train_data_df, test_data_df)


In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Defining a function to fit and run : LinearRegression Model
# This model will return the label, model predicted value, df: actual and predicted close value and R² for test and training
def train_and_evaluate_model(train_data, test_data):
    # Initialize the model based on the chosen type
    if '%R' in train_data.columns and '%R' in test_data.columns:
        X_train = train_data[['Close', '%R']]
        y_train = train_data['Label']
        X_test = test_data[['Close', '%R']]
        y_test = test_data['Label']
    else:
        X_train = train_data[['Close']]
        y_train = train_data['Label']
        X_test = test_data[['Close']]
        y_test = test_data['Label']
    model = LinearRegression()    
    # Fit the model on the training data
    model.fit(X_train, y_train)
    # Make predictions on the test data
    y_test_pred = model.predict(X_test)
    # Make predictions on the train data
    y_train_pred = model.predict(X_train)
    # Calculate R² score/ R2 Error on test
    r2_test = r2_score(y_test, y_test_pred)
    # Calculate R² score on train
    r2_train = r2_score(y_train,y_train_pred)

    results_df = pd.DataFrame({
        'Actual': y_test,
        'Predicted': y_test_pred
    })
    return r2_train,r2_test,results_df, y_test, y_test_pred

# Train and evaluate the Linear Regression model
r2_train , r2_test, comparison , y_test, y_test_pred = train_and_evaluate_model(train_final_df, test_final_df)
print(f'R2 score  on test data is: {r2_test:.4f}')
print(f'R2 score  on train data is: {r2_train:.4f}')

### Grade 1:
### 1.Compare the R² errors for test and train and explain the outcome.
### Explanation:
- **Training R² (0.9970)**: This means that the model can explain 99.70% of the variation in the training data. The model has learned the relationships between the inputs (Close and %R) and the target (Label) very well for the data it was trained on.
- **Test R² (0.9913)**: This means that when I tested the model on new, unseen data, it can still explain 99.13% of the variation. This means that the model can predict well on new data it hasn't seen before, which is the ultimate goal.
Key points:
1. It's normal for the training R² to be slightly higher than the test R² because:
    - The model has already seen the training data and learned from it, so it fits that data very well.
    - The test data is new to the model, and since it hasn't encountered this data before, it might not fit it perfectly.
2. However, in this case, the test R² is still very close to the training R², which is a positive sign!
3. The small difference between the two R² scores (0.9970 vs. 0.9913) indicates that the model isn’t overfitting and is making accurate predictions on both the training and test data. 

In [10]:
# Adding column for LinearRegression model predicted values
one_day_test_df['Predicted'] = comparison['Predicted'].copy()
one_day_test_df.dropna(subset=['Predicted'], inplace=True)

In [None]:
from sklearn.linear_model import LinearRegression

# Dropping the %R column from the standardized data for train and test (label not stanardized) to 
train_final_df = train_final_df.drop(columns=['%R'], errors='ignore')
test_final_df = test_final_df.drop(columns=['%R'], errors='ignore')

# Train and evaluate the Linear Regression model without %R
r2_train , r2_test, Actual_Predicted_df, y_test, y_test_pred = train_and_evaluate_model(train_final_df, test_final_df)
print(f'R2 score  on test data is: {r2_test:.4f}')
print(f'R2 score  on train data is: {r2_train:.4f}')

### Extra: Test your model (get R² errors for test and train without LW%R, just Close column). Comment and explain the result.
### Explanation
The R² scores remained unchanged after removing `%R` because the correlation between `Close` and `%R` is very weak, indicating that `%R` doesn’t add
significant value to the model. Since `%R` is not providing additional predictive power, the `Close` value alone captures most of the necessary 
information for accurate predictions. The model heavily relies on `Close`, which is already a strong predictor. Consequently, `%R` has minimal impact 
on the model’s performance, so its removal doesn’t affect the R² scores. This explains why both the training and test R² scores remained high and 
consistent.

In [None]:
import matplotlib.pyplot as plt

# Below graph shows the LinerarRegression model one day ahead predicted and actual close value
plt.figure(figsize=(14, 7))
plt.plot(y_test.index, y_test, label='Actual Values', color='blue')
plt.plot(y_test.index, y_test_pred, label='Predicted Values', color='yellow')
plt.xlabel('Date')
plt.ylabel('Close Value')
plt.title('Actual vs. Predicted Close Values')
plt.legend()
plt.show()

In [13]:
# Grade 2
# calculate stochastic %D
import pandas as pd

# Define the window sizes
window_size_k = 14  # Period for %K calculation
window_size_d = 3   # Period for smoothing %K to get Slow %K and Slow %D

# Calculate %K
min_to_day_df['highest_high'] = min_to_day_df['High'].rolling(window=window_size_k, min_periods=1).max()
min_to_day_df['lowest_low'] = min_to_day_df['Low'].rolling(window=window_size_k, min_periods=1).min()
min_to_day_df['%K'] = (min_to_day_df['Close'] - min_to_day_df['lowest_low']) / (min_to_day_df['highest_high'] - min_to_day_df['lowest_low']) * 100

# Drop temporary columns used for calculation
min_to_day_df.drop(columns=['highest_high', 'lowest_low'], inplace=True)

# Calculate Slow %K by applying a moving average to %K
min_to_day_df['Slow %K'] = min_to_day_df['%K'].rolling(window=window_size_d, min_periods=1).mean()

# Calculate Slow %D by applying a moving average to Slow %K
min_to_day_df['Slow %D'] = min_to_day_df['Slow %K'].rolling(window=window_size_d, min_periods=1).mean()

# Drop rows with NaN values if any (usually at the start of the series)
min_to_day_df.dropna(subset=['Slow %D'], inplace=True)

min_to_day_df.drop(columns=['Slow %K' , '%K'], inplace=True)
# Display the DataFrame with the new feature

In [None]:
# Calling split data function again for further calculation
train_data_df, test_data_df = split_data(min_to_day_df)

In [15]:
# Adding new columns in test dataframe for plotting purpose
test_data_df['Predicted'] = y_test_pred  # Add predicted values to the DataFrame  
test_data_df.rename(columns={'Label': 'Actual Data', 'Predicted': 'Forecast Data'}, inplace=True)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from mplfinance.original_flavor import candlestick_ohlc
import pandas as pd

# Creating few functions below for plotting: OHLC candle with Actual and Predicted close value, %R, Slow %D, Calculating and plotting RSI.

# Function to plot OHLC chart
def plot_ohlc(test_data_df):
    ohlc_data = []
    for i in range(len(test_data_df)):
        date = mdates.date2num(test_data_df.index[i].to_pydatetime())  # Convert date to matplotlib date format
        open_value = test_data_df['Open'].iloc[i]
        high_value = test_data_df['High'].iloc[i]
        low_value = test_data_df['Low'].iloc[i]
        close_value = test_data_df['Close'].iloc[i]
        ohlc_data.append((date, open_value, high_value, low_value, close_value))
    return ohlc_data

# This function will show the OHLC candles and Actual and predicted close value in the same figure
def plot_ohlc_actual_vs_predicted(test_data_df):
    # Use a limited view for clarity
    recent_data = test_data_df.iloc[1:150]  # Only the first 150 entries for clearer view
    fig, ax = plt.subplots(figsize=(16, 8))  # Increase figure size for better visibility

    # Plot OHLC
    ohlc_data = plot_ohlc(recent_data)
    ax2 = ax.twinx()  # Create a twin axis for the candlestick chart
    candlestick_ohlc(ax2, ohlc_data, width=0.2, colorup='g', colordown='r')  # Adjust candle width
    ax2.xaxis_date()
    ax2.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))

    # Adjust y-limits to provide more space above and below, to see clarity in the graph
    y_min = recent_data['Low'].min() * 1.00  # Lower limit with a margin
    y_max = recent_data['High'].max() * 1.05  # Upper limit with a margin
    ax2.set_ylim(y_min, y_max)

    ax2.set_title('OHLC Candlestick Chart and Actual vs. Predicted Close Values')
    ax2.set_ylabel('Value')
    ax2.grid()

    # Plot Actual vs. Predicted on the same axis
    ax.plot(recent_data.index, recent_data['Actual Data'], label='Actual Close', color='blue', linewidth=2)
    ax.plot(recent_data.index, recent_data['Forecast Data'], label='Predicted Close', color='orange', linestyle='--', linewidth=2)

    ax.xaxis_date()
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    ax.set_ylabel('Value')
    ax.grid()
    ax.legend()
    plt.tight_layout()
    plt.show()

# This function will show the %R and Slow %D 
def plot_r_and_d(test_data_df):
    fig, ax = plt.subplots(figsize=(14, 7))
    ax.plot(test_data_df.index, test_data_df['%R'], color='yellow', label='LW%R')
    ax.plot(test_data_df.index, test_data_df['Slow %D'], color='purple', label='Slow %D')
    ax.xaxis_date()
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    ax.set_title('LW%R and Slow %D')
    ax.set_ylabel('Indicator Values')
    ax.grid()
    ax.legend()
    plt.show()

# Function calling

# Create a figure based on OHLC candles covering the test period (the 20% of data)
# Add a line to the chart that illustrates the label (actual data) and the forecast (so candels and lines are in the same figure. 
plot_ohlc_actual_vs_predicted(test_data_df)

# Add subplot(s) with the LW%R and Stochastic slow %D features
plot_r_and_d(test_data_df)

### Grade 2:
### 1. What patterns can you observe from the line figure?
### Explanation
- **Two Lines: Actual(label: Actual one day ahead close value) and Predicted Close value:**
The Actual and Predicted lines are very close to each other means the model is doing a good job of predicting the close value. When the blue line 
(actual value) moves up or down, the orange line (predicted value) also follows in the same direction.
- There are some places where the orange dashed line does not match exactly with the blue line like around February 2018, there is a noticeable gap where the orange dashed line is higher than the blue line. The differences can be seen where the lines don't overlap exactly, but these are small and show the model was still close in predicting overall market trends.
- **Candlestick Chart:**
    - Below the two lines, there is a candlestick chart (green and red bars). Each candlestick represents the movement of value for a day.
    - Green candles mean the value went up during a day.
    - Red candles mean the value went down during that day.
    - The candlestick chart generally follows the same up-and-down movement as the lines above. This shows that your model's predictions reflect the real changes happening in the market.
- If we talk about the trend then both the actual and predicted lines show similar patterns. When the value is trending upwards (going up), the predicted line also goes up. When the value is trending downwards, the predicted line also follows the downtrend means model is good at predicting general trends
in the market.

In [None]:
#  Function to calculate RSI
def calculate_rsi(data):
    window_size = 14
    delta = data['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window_size).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window_size).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

# Calculate additional feature RSI (relative strength index)
test_data_df['RSI'] = calculate_rsi(test_data_df)

# Function to plot RSI
def plot_rsi(test_data_df, window=14):
    # Calculate RSI
    test_data_df['RSI'] = calculate_rsi(test_data_df)
    fig, ax = plt.subplots(figsize=(14, 7))
    ax.plot(test_data_df.index, test_data_df['RSI'], color='cyan', label='RSI')
    ax.set_title('Relative Strength Index (RSI)')
    ax.set_ylabel('RSI Value')
    ax.xaxis_date()
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))  # Format dates
    ax.grid()
    ax.legend(['RSI'])
    
    plt.tight_layout()
    plt.show()

# Add the feature as a subplot to the figure in the previos step
plot_rsi(test_data_df)


In [None]:
# Set up an ElasticNet (not an ElasticNetCV) model
from sklearn.linear_model import ElasticNet

# Defining model to train and run: ElasticNet and calculate R² Error on both the training data set and the test
def elasticNet_train_and_predict(train_data, test_data, alpha_param,l1_ratio_param):

    X_train = train_data[['Close']]
    y_train = train_data['Label']
    X_test = test_data[['Close']]
    y_test = test_data['Label']
    # Initialize the ElasticNet model with regularization
    elasticNet = ElasticNet(alpha=alpha_param, l1_ratio =l1_ratio_param)

    # training the data on elasticNet model
    elasticNet.fit(X_train, y_train)
    # Make predictions
    # y_pred is y_test_elastic_pred
    y_test_elastic_p = elasticNet.predict(X_test) 

# Calculate R² score/ R2 Error on test
    r_square_5 = r2_score(y_test, y_test_elastic_p)
    print(f'R² score on test data: {r_square_5:.4f}')
    y_train_elastic_pred = elasticNet.predict(X_train)

# Calculate R² score on train
    r_square_6 = r2_score(y_train,y_train_elastic_pred)
    print(f'R² score on train data: {r_square_6:.4f}')
    test_data_df['ElasticNet_Predicted'] = y_test_elastic_p
    elastic_results_df = pd.DataFrame({
        'Actual': y_test,
        'Predicted': y_test_elastic_p
    })
    return y_test, y_test_elastic_p, elastic_results_df


In [19]:
# Function to plot Actual and Predicted values
def plot_actual_vs_predicted(test_data_df):
    fig, ax = plt.subplots(figsize=(14, 7))
    # Plot Actual vs. Predicted on the same axis
    ax.plot(test_data_df.index, test_data_df['Actual Data'], label='Actual', color='blue')
    ax.plot(test_data_df.index, test_data_df['Forecast Data'], label='Predicted', color='orange')
    ax.plot(test_data_df.index, test_data_df['ElasticNet_Predicted'], label='ElasticNet Predicted', color='green')
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    ax.set_ylabel('Value')
    ax.grid()
    ax.legend()
    plt.show()

In [None]:
# The suitable values for alpha and l1_ratio which fits the model are alpha=0.01(relarization very low) and l1_ratio= 0.5
y_test, y_test_elastic_p, elastic_results_df =elasticNet_train_and_predict(train_final_df, test_final_df, 0.01,0.5)

#Plotting with alpha value too low (0.01)
plot_actual_vs_predicted(test_data_df)

### Grade 3:  
### 1. Compare the errors and explain the outcome.
### Explanation
**Linear Regression R² Error on test is = 0.9913:**
- which indicates a very strong fit to the training data. The model explains 99.13% of the variance in the closing value, which is excellent.
- The predicted values closely align with the actual closing values and hence the plot is accurately showing the trend.
  
**ElasticNet R² Error on test is = 0.9746:**
- In this case  (Alpha = 0.01, L1_ratio = 0.5): Alpha=0.01 means this is a small regularization strength. It allows the model to learn the data well without too much restriction, enabling it to fit both the training and test data effectively.

**Conclusion**

Linear Regression delivers high performance but risks overfitting due to its lack of regularization.
ElasticNet gives a balance between fitting the data well and preventing overfitting with small regularization, making it a reliable model, particularly when the dataset is complex or noisy.

Both models are performing excellently, but ElasticNet may provide a safer choice for unseen data where overfitting could be a concern.

In [21]:
# Grade 4
# Calculating the On-Balance Volume (OBV)
min_to_day_df['OBV'] = 0.0
for i in range(1, len(min_to_day_df)):
    if min_to_day_df['Close'].iloc[i] > min_to_day_df['Close'].iloc[i - 1]:
        min_to_day_df.loc[min_to_day_df.index[i], 'OBV'] = min_to_day_df.loc[min_to_day_df.index[i - 1], 'OBV'] + min_to_day_df.loc[min_to_day_df.index[i], 'Volume']
    elif min_to_day_df['Close'].iloc[i] < min_to_day_df['Close'].iloc[i - 1]:
        min_to_day_df.loc[min_to_day_df.index[i], 'OBV'] = min_to_day_df.loc[min_to_day_df.index[i - 1], 'OBV'] - min_to_day_df.loc[min_to_day_df.index[i], 'Volume']
    else:
        min_to_day_df.loc[min_to_day_df.index[i], 'OBV'] = min_to_day_df.loc[min_to_day_df.index[i - 1], 'OBV']

min_to_day_df['On_Balance_Vol'] = min_to_day_df['OBV']
min_to_day_df.drop(columns=['OBV'],inplace=True)

In [22]:
import numpy as np

# Defining a create slide window function
def create_sliding_windows(data, window_size):
    features = data[['Close']].values
    labels = data['Label'].values            # Extract labels (1D array)
    
    X, y = [], []
    
    for i in range(len(data) - window_size + 1):
        # Create windowed data
        window = features[i:i + window_size]
        
        # Reshape the window into a single vector
        window_vector = window.flatten()
        
        # Append the reshaped window and associated label (last label in window)
        X.append(window_vector)
        y.append(labels[i + window_size - 1])
    
    return np.array(X), np.array(y)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
# Loop through polynomial degrees and window sizes
results_summary = []

# Apply polynomial transformation on training data : No slide window case
poly = PolynomialFeatures(degree=2)
X_no_window_train = train_final_df[['Close']].values
X_poly_no_window_train = poly.fit_transform(X_no_window_train)
    
# Fit the linear regression model on training data
model = LinearRegression()
model.fit(X_poly_no_window_train, train_final_df['Label'])

# Apply polynomial transformation on test data
X_no_window_test = test_final_df[['Close']].values
X_poly_no_window_test = poly.transform(X_no_window_test)

# Make predictions on test data
y_no_window_test_pred = model.predict(X_poly_no_window_test)
r2_no_window_test = r2_score(test_final_df['Label'], y_no_window_test_pred)

# Append the results for both training and test set in the results summary
results_summary.append({
    'Model': f'Polynomial Reg Deg 2', 
    'Window Size': 'No Window', 
    'R²': r2_no_window_test
    })


# Loop through polynomial degrees and window sizes 
for window_size in [2, 5, 10]:
    # Create sliding windows for training data
    X_train, y_train = create_sliding_windows(train_final_df, window_size)

    # Fit the polynomial regression model
    poly = PolynomialFeatures(degree=2)
    X_poly_train = poly.fit_transform(X_train)

    # Fit the linear regression model
    model = LinearRegression()
    model.fit(X_poly_train, y_train)

     # Create sliding windows for testing data
    X_test, y_test = create_sliding_windows(test_final_df, window_size)

    # Transform test data
    X_poly_test = poly.transform(X_test)

    # Make predictions on test data
    y_test_pred = model.predict(X_poly_test)
    
    # Calculate R² score
    r2 = r2_score(y_test, y_test_pred)

    # Collect the results in a list
    results_summary.append({
    'Model': f'Polynomial Reg Deg 2', 
        'Window Size': window_size, 
        'R²': r2
        })

results_summary_df = pd.DataFrame(results_summary)
print(results_summary_df)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
# Loop through polynomial degrees and window sizes
results_summary = []

# Defining a function to show results
def show_slide_window_results(model_name, window_size, r2):
    results_summary.append({
        'Model': model_name, 
        'Window Size': 'No Window' if window_size is None else window_size, 
        'R²': r2
    })

# Defining all three models: LinearRegression, ElasticNet and Polynomial Regression with window size: 2,5,10
def fit_run_three_models(train_data_df, test_data_df):
    for window_size in [None, 2, 5, 10]:
        # If window size is None, use the entire dataset without sliding windows
        if window_size is None:
            X_train = train_data_df[['Close']].values
            y_train = train_data_df['Label'].values
            X_test = test_data_df[['Close']].values
            y_test = test_data_df['Label'].values
        else:
            # Create sliding windows for training data
            X_train, y_train = create_sliding_windows(train_data_df, window_size)

            # Create sliding windows for testing data
            X_test, y_test = create_sliding_windows(test_data_df, window_size)

        # Fit the polynomial regression model
        poly = PolynomialFeatures(degree=2)
        X_poly_train = poly.fit_transform(X_train)

        # Fit the linear regression model
        model = LinearRegression()
        model.fit(X_poly_train, y_train)

        # Transform test data
        X_poly_test = poly.transform(X_test)

        # Make predictions on test data
        y_test_pred = model.predict(X_poly_test)

        # Calculate R² score
        r2_poly_reg = r2_score(y_test, y_test_pred)

        show_slide_window_results('Polynomial Reg', window_size, r2_poly_reg)



        # --- Linear Regression ---
        # Fit the linear regression model directly on original features
        lin_model = LinearRegression()
        lin_model.fit(X_train, y_train)

        # Make predictions on the test data (linear regression)
        y_lin_test_pred = lin_model.predict(X_test)

        # Calculate the R² score for linear regression
        r2_linear_reg = r2_score(y_test, y_lin_test_pred)

        show_slide_window_results('Linear Reg', window_size, r2_linear_reg)


       # --- ElasticNet Regression ---
        # Fit the ElasticNet model
        elastic_net_model = ElasticNet(alpha=0.01, l1_ratio=0.5)  # You can adjust alpha and l1_ratio
        elastic_net_model.fit(X_train, y_train)

        # Make predictions on the test data (ElasticNet)
        y_en_test_pred = elastic_net_model.predict(X_test)

        # Calculate the R² score for ElasticNet
        r2_elasticNet = r2_score(y_test, y_en_test_pred)

        # Append ElasticNet results to summary
        show_slide_window_results('ElasticNet', window_size, r2_elasticNet)

# Calling the function for fit and run three models with window size and showing the results
fit_run_three_models(train_final_df, test_final_df)
results_summary_df = pd.DataFrame(results_summary)
print(results_summary_df)

### Grade 4:
### 1. Summarize and compare their R² error measures. Is anyone better than the LinearRegression model without window information attached?
### Explanation 

**Comparison Analysis:**
- **Polynomial Regression without Window (R² = 0.991263):** performs slightly worse than Linear Regression without window(R² = 0.991294).
Among the different window sizes for Linear and Polynomial Regression, **the Linear Regression with a window size of 2 (R² = 0.991309)** performs the best in that group.
ElasticNet consistently has lower R² values across all R² values compared to Linear Regression and Polynomial Regression.

**Conclusion:**

The Linear Regression (Window 2) model has the best performance in terms of R², outperforming all polynomial and ElasticNet models. Therefore, the Linear Regression model (Window 2) is the best option among those tested, followed closely by the **Linear Regression without window 
(0.991294)** and **Polynomial Regression without Window (R² = 0.991263)**

In [25]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# defining the function for window wise dataframes with close, label, predicted and difference column for calculating the Hit ratio for each window.
def polynomial_regression_with_sliding_windows(train_df, test_df, window_size, degree=2):
    # Create sliding windows for training data
    X_train, y_train = create_sliding_windows(train_final_df, window_size)

    # Fit the polynomial regression model
    poly = PolynomialFeatures(degree=degree)
    X_poly_train = poly.fit_transform(X_train)

    # Fit the linear regression model
    model = LinearRegression()
    model.fit(X_poly_train, y_train)

    # Create sliding windows for testing data
    X_test, y_test = create_sliding_windows(test_final_df, window_size)

    # Transform test data
    X_poly_test = poly.transform(X_test)

    # Make predictions on test data
    y_test_pred = model.predict(X_poly_test)

    # Ensure you have the right index for the predictions
    start_index = window_size  # Start index for the actual close values
    predicted_length = len(y_test_pred)  # Number of predictions made by the model

    # Extract actual close values based on the start index and predicted length
    actual_close_values = test_df['Close'].values[start_index:start_index + predicted_length]

    # Adjust lengths if they don't match
    if len(y_test_pred) != len(actual_close_values):
        min_length = min(len(y_test_pred), len(actual_close_values))

        # Truncate to the minimum length
        y_test_pred = y_test_pred[:min_length]
        actual_close_values = actual_close_values[:min_length]
        y_test = y_test[:min_length]  # Also adjust y_test for consistency

    # Create a DataFrame for test results
    results_df = pd.DataFrame({
        'Close': actual_close_values,
        'Label': y_test,
        'Predicted': y_test_pred,
    })

    # Calculate the difference directly in the DataFrame
    results_df['Difference'] = results_df['Predicted'] - results_df['Close']

    return results_df

# Initialize an empty dictionary to store DataFrames
dataframes = {}

# Loop through the desired window sizes and create DataFrames
for window_size in [2, 5, 10]:
    dataframes[window_size] = polynomial_regression_with_sliding_windows(train_data_df, test_data_df, window_size)

# Accessing DataFrames for each window size
df_window_2 = dataframes[2]
df_window_5 = dataframes[5]
df_window_10 = dataframes[10]


In [None]:
# Grade 5
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(one_day_test_df['Label'], one_day_test_df['Predicted'])
print("Mean Absolute Error:", mae)
one_day_test_df['Difference'] = one_day_test_df['Predicted'] - one_day_test_df['Close']

# Defining a function for decision making based on value difference
def make_decision(df):
    if 0.00001<= df['Difference'] <= 0.000015:
        return 'Hold'
    elif df['Difference'] <  0.00001 :
        return 'Sell'
    else:
        return 'Buy'

one_day_test_df['Decision'] = one_day_test_df.apply(make_decision, axis=1)

### Grade 5
### 1. Compare the regression forecast with the known Close price.
### Explanation 
model’s predictions are off by a very small amount—0.00257—compared to the actual value of next day close price(Label).
This indicates the model is predicting very accurately, as the error is quite minimal relative to the actual close price.


In [27]:
def calculate_show_hit_ratio(data_df,forecast_type):
    correct_prediction = calculating_correct_prediction(data_df)
    #  Get the correct predictions 'True' count for the Hit Ratio
    correct_prediction_count = correct_prediction['Correct Prediction'].value_counts().get('True', 0)
    total_predictions_count = len(correct_prediction['Decision'])
    print(f'{correct_prediction_count} {total_predictions_count}')
    print(f'hit ratio for {forecast_type}  is {correct_prediction_count/total_predictions_count: .2f}')

In [None]:
# Defining a function to calculate the correct prediction
def calculating_correct_prediction(data_df):
    correct_predictions = []
    
    for i in range(0, len(data_df)):
        decision = data_df.iloc[i]['Decision']
        actual_next_day_close = data_df.iloc[i]['Label']  # Actual closing value for the next day
        actual_current_close = data_df.iloc[i]['Close']   # Current day's closing value
        model_predicted_close = data_df.iloc[i]['Predicted']  # Predicted closing value

        # Evaluate predictions based on the decision made
        if decision == 'Buy':
            if actual_next_day_close > actual_current_close:  # Next day value is higher than current day value
                correct_predictions.append('True')
            else:
                correct_predictions.append('False')
        
        elif decision == 'Sell':
            if actual_next_day_close < actual_current_close:  # Next day close value is lower than current day value
                correct_predictions.append('True')
            else:
                correct_predictions.append('False')
        
        elif decision == 'Hold':
            # Check if the predicted value is close enough to the actual next day's value
            difference = abs(model_predicted_close - actual_next_day_close)
            if 0.00001 <= difference <= 0.000015:  # Threshold for Hold condition (adjust if needed)
                correct_predictions.append('True')
            else:
                correct_predictions.append('False')
    
    # Assign the list to the new column in the DataFrame after the loop
    data_df['Correct Prediction'] = correct_predictions

    # Return the DataFrame (optional)
    return data_df

# hit ratio for one day Forecast  is
calculate_show_hit_ratio(one_day_test_df,'One Day')

In [None]:
# Calculate the Hit Ratio (HR) of your investment decision for each of the windows. 
df_window_2['Decision'] = df_window_2.apply(make_decision, axis=1)
df_window_5['Decision'] = df_window_5.apply(make_decision, axis=1)
df_window_10['Decision'] = df_window_10.apply(make_decision, axis=1)
calculate_show_hit_ratio(df_window_2,'window 2')
calculate_show_hit_ratio(df_window_5,'window 5')
calculate_show_hit_ratio(df_window_10,'window 10')

In [None]:
# HR_calc_week_df contains the weekly close (label data) and close value for same day and below will split in train and test
weekly_train_data_df, weekly_test_data_df = split_data(HR_calc_week_df)

In [31]:
# Standardize the one week ahead train and test data except Label(target column)
weekly_train_final, weekly_test_final, scalar = standardized_train_test_data(weekly_train_data_df, weekly_test_data_df)

In [32]:
# Train and evaluate the Linear Regression model
r2_train , r2_test, week_ahead_actual_predicted , y_test, y_test_pred = train_and_evaluate_model(weekly_train_final, weekly_test_final)

In [33]:
# Reset the index of week_ahead_actual_predicted to align by position
week_ahead_actual_predicted.reset_index(drop=True, inplace=True)

In [34]:
# Assign the 'Predicted' values to weekly_test_data_df
weekly_test_data_df['Predicted'] = week_ahead_actual_predicted['Predicted'].values

In [35]:
weekly_test_data_df['Difference'] = weekly_test_data_df['Predicted'] - weekly_test_data_df['Close']
weekly_test_data_df['Decision'] = weekly_test_data_df.apply(make_decision, axis=1)

In [None]:
# Calling the hit ratio function on one week ahead forecast data
calculate_show_hit_ratio(weekly_test_data_df, 'weekly Forecast')

### 2. Which setup was the best, and why was that?
### Explanation
**Hit Ratio:**
 - **Weekly Forecast: 0.49**
    
 - **One Day Forecast: 0.45**

In this case, the weekly forecast has a hit ratio of 0.49, meaning that 49% of its predictions were correct.
The one-day forecast has a slightly lower hit ratio of 0.45, indicating that 45% of its predictions were correct.

**Conclusion**

Based on the provided hit ratios, the weekly forecast setup was the better model for predicting outcomes. It had a higher hit ratio compared to the one-day forecast, suggesting that it produced more accurate predictions over the longer time horizon.

However, the sliding window forecasts (window sizes of 2, 5, and 10) had a hit ratio of 0,so this approach failed to make accurate predictions. Thus, focusing on the weekly forecast may be the most reliable strategy based on this analysis.