In [1]:
#Install necessary Libraries

!pip install yfinance



In [11]:
#Import libraries

import os
import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from google.colab import files

In [12]:
# Download Historical Data

# Downloading historical data for a stock (e.g. IBM) from 2017 to 2023
stock_symbol = "IBM"
stock_df = yf.download(stock_symbol, start="2017-01-01", end="2023-12-31")

# Saved the data to a CSV file
csv_file_name = 'IBM_stock_data.csv'
stock_df.to_csv(csv_file_name)

# Confirm the file was created and print the current working directory
print(f" CSV file saved as: {os.path.abspath(csv_file_name)}")
print("Current working directory:", os.getcwd())

# Download the CSV file to your local machine
files.download(csv_file_name)

[*********************100%***********************]  1 of 1 completed

 CSV file saved as: /content/IBM_stock_data.csv
Current working directory: /content





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
#1. Preprocess and clean the data

# Display the first few rows of the dataframe
print(stock_df.head())

# Check for missing values
print(stock_df.isnull().sum())

# Drop missing values if any
stock_df.dropna(inplace=True)

# Check the cleaned data
print(stock_df.head())

Price                       Adj Close       Close        High         Low  \
Ticker                            IBM         IBM         IBM         IBM   
Date                                                                        
2017-01-03 00:00:00+00:00  112.199348  159.837479  160.487579  158.709366   
2017-01-04 00:00:00+00:00  113.588501  161.816437  162.399612  160.000000   
2017-01-05 00:00:00+00:00  113.212654  161.281067  161.940720  159.904404   
2017-01-06 00:00:00+00:00  113.769676  162.074570  162.447418  160.152969   
2017-01-09 00:00:00+00:00  112.508026  160.277252  162.332703  160.248566   

Price                            Open   Volume  
Ticker                            IBM      IBM  
Date                                            
2017-01-03 00:00:00+00:00  159.655838  3069278  
2017-01-04 00:00:00+00:00  160.391968  3536944  
2017-01-05 00:00:00+00:00  161.806885  2805686  
2017-01-06 00:00:00+00:00  161.271515  3080993  
2017-01-09 00:00:00+00:00  162.017212  3

In [14]:
#2. Define Features and Labels

# Create the target variable based on the trading strategy.

# Define the target variable 'signal' based on the closing price strategy : +1 for buy signal, -1 for sell signal
stock_df['Signal'] = np.where(stock_df['Close'].shift(-1) > stock_df['Close'], 1, -1)

# Drop rows with NaN values that may have been created due to shifting
stock_df.dropna(inplace=True)

# Define feature variables 'X' and target variable 'y'
X = stock_df[['Open', 'High', 'Low', 'Close', 'Volume']]  # Select features
y = stock_df['Signal']  # Target variable

# Print outputs
print("Feature Variables (X):")
print(X)
print("\nTarget Variable (y):")
print(y)

Feature Variables (X):
Price                            Open        High         Low       Close  \
Ticker                            IBM         IBM         IBM         IBM   
Date                                                                        
2017-01-03 00:00:00+00:00  159.655838  160.487579  158.709366  159.837479   
2017-01-04 00:00:00+00:00  160.391968  162.399612  160.000000  161.816437   
2017-01-05 00:00:00+00:00  161.806885  161.940720  159.904404  161.281067   
2017-01-06 00:00:00+00:00  161.271515  162.447418  160.152969  162.074570   
2017-01-09 00:00:00+00:00  162.017212  162.332703  160.248566  160.277252   
...                               ...         ...         ...         ...   
2023-12-22 00:00:00+00:00  161.100006  162.410004  161.000000  162.139999   
2023-12-26 00:00:00+00:00  162.229996  163.309998  162.050003  163.210007   
2023-12-27 00:00:00+00:00  163.139999  163.639999  162.679993  163.460007   
2023-12-28 00:00:00+00:00  163.960007  163.960007  16

In [15]:
#3. Split the Data into Training and Testing Sets

# As mentioned,Split the data into training and test datasets (80/20 split)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the datasets
print(f"Training features shape: {X_train.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Testing features shape: {X_test.shape}")
print(f"Testing target shape: {y_test.shape}")

Training features shape: (1408, 5)
Training target shape: (1408,)
Testing features shape: (352, 5)
Testing target shape: (352,)


In [22]:
#4. Choose a Classifier and Fit It.
# For this example, we'll use the Random Forest Classifier.

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier()

# Fit the Random Forest classifier on the training data
rf_classifier.fit(X_train, y_train)

In [28]:
#5. Evaluate the Random Forest Classifier on the Test Dataset
# Check the performance of the model.

# Make predictions for Random Forest Classifier
# This line applies the trained random forest classifier to the test features (X_test) and generates predictions for each instance in the test set.
y_pred = rf_classifier.predict(X_test)

# Evaluate the model for Random Forest Classifier
print("\nRandom Forest Classifier Evaluation:")

# Generate the confusion matrix for the Random Forest Classifier
# The confusion matrix summarizes the performance of the classification model by showing the counts of true positive, true negative, false positive, and false negative predictions.
rf_confusion_matrix = confusion_matrix(y_test, y_pred)

# Print the confusion matrix for the Random Forest Classifier
print("Confusion Matrix:\n", rf_confusion_matrix)

# Generate the classification report for the Random Forest Classifier
# The classification report provides precision, recall, F1-score, and support for each class. Setting output_dict=True allows us to capture the report in a dictionary format for easier manipulation.
rf_classification_report = classification_report(y_test, y_pred, output_dict=True)

# Print the classification report for the Random Forest Classifier
print("Classification Report:\n", rf_classification_report)

# Calculate the accuracy of the Random Forest Classifier
# accuracy_score compares the true labels (y_test) with the predicted labels (y_pred) and returns the proportion of correctly predicted instances.
rf_accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy score for the Random Forest Classifier
print("Accuracy (Random Forest):", rf_accuracy)


Random Forest Classifier Evaluation:
Confusion Matrix:
 [[ 81  86]
 [ 78 107]]
Classification Report:
 {'-1': {'precision': 0.5094339622641509, 'recall': 0.48502994011976047, 'f1-score': 0.49693251533742333, 'support': 167.0}, '1': {'precision': 0.5544041450777202, 'recall': 0.5783783783783784, 'f1-score': 0.5661375661375662, 'support': 185.0}, 'accuracy': 0.5340909090909091, 'macro avg': {'precision': 0.5319190536709355, 'recall': 0.5317041592490694, 'f1-score': 0.5315350407374948, 'support': 352.0}, 'weighted avg': {'precision': 0.5330688594815097, 'recall': 0.5340909090909091, 'f1-score': 0.5333044880590893, 'support': 352.0}}
Accuracy (Random Forest): 0.5340909090909091


In [25]:
# 6.Compare Two Different Classifiers
# In order to compare, We implement a second classifier for comparison as Logistic Regression.

from sklearn.linear_model import LogisticRegression

# Initialize the Random Forest classifier
logistic_classifier = LogisticRegression()

# Fit the classifier on the training data
logistic_classifier.fit(X_train, y_train)

In [29]:
# Predictions and evaluation for Logistic Regression Classifier
# This line applies the trained logistic_classifier to the test features (X_test) and generates predictions for each instance in the test set.
y_pred_logistic = logistic_classifier.predict(X_test)

# Evaluate the model for Logistic Regression Classifier
print("\nLogistic Regression Classifier Evaluation:")

# Generate the confusion matrix for the Logistic Regression Classifier
# The confusion matrix summarizes the performance of the classification model by showing the counts of true positive, true negative, false positive, and false negative predictions.
logistic_confusion_matrix = confusion_matrix(y_test, y_pred_logistic)

# Print the confusion matrix for the Logistic Regression Classifier
print("Confusion Matrix:\n", logistic_confusion_matrix)

# Generate the classification report for the Logistic Regression Classifier
# The classification report provides precision, recall, F1-score, and support for each class. Setting output_dict=True allows us to capture the report in a dictionary format for easier manipulation.
logistic_classification_report = classification_report(y_test, y_pred_logistic, output_dict=True)

# Print the classification report for the Logistic Regression Classifier
print("Classification Report:\n", logistic_classification_report)

# Calculate the accuracy of the Logistic Regression Classifier
# accuracy_score compares the true labels (y_test) with the predicted labels (y_pred_logistic) and returns the proportion of correctly predicted instances.
logistic_accuracy = accuracy_score(y_test, y_pred_logistic)

# Print the accuracy score for the Logistic Regression Classifier
print("Accuracy (Logistic Regression):", logistic_accuracy)


Logistic Regression Classifier Evaluation:
Confusion Matrix:
 [[  3 164]
 [  1 184]]
Classification Report:
 {'-1': {'precision': 0.75, 'recall': 0.017964071856287425, 'f1-score': 0.03508771929824561, 'support': 167.0}, '1': {'precision': 0.5287356321839081, 'recall': 0.9945945945945946, 'f1-score': 0.6904315196998124, 'support': 185.0}, 'accuracy': 0.53125, 'macro avg': {'precision': 0.639367816091954, 'recall': 0.506279333225441, 'f1-score': 0.362759619499029, 'support': 352.0}, 'weighted avg': {'precision': 0.6337104885057472, 'recall': 0.53125, 'f1-score': 0.3795155689411145, 'support': 352.0}}
Accuracy (Logistic Regression): 0.53125


In [30]:
# Comparison Summary
comparison_summary = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision (1)', 'Recall (1)', 'F1-Score (1)', 'Precision (-1)', 'Recall (-1)', 'F1-Score (-1)'],
    'Random Forest': [
        rf_accuracy,
        rf_classification_report['1']['precision'],
        rf_classification_report['1']['recall'],
        rf_classification_report['1']['f1-score'],
        rf_classification_report['-1']['precision'],
        rf_classification_report['-1']['recall'],
        rf_classification_report['-1']['f1-score']
    ],
    'Logistic Regression': [
        logistic_accuracy,
        logistic_classification_report['1']['precision'],
        logistic_classification_report['1']['recall'],
        logistic_classification_report['1']['f1-score'],
        logistic_classification_report['-1']['precision'],
        logistic_classification_report['-1']['recall'],
        logistic_classification_report['-1']['f1-score']
    ]
})

print("\nComparison Summary:")
print(comparison_summary)

# Conclusion
print("\n######## Conclusion ########")
if rf_accuracy > logistic_accuracy:
    conclusion = "Random Forest Classifier outperforms Logistic Regression in terms of accuracy."
else:
    conclusion = "Logistic Regression outperforms Random Forest Classifier in terms of accuracy."

print(conclusion)

# Detailed Performance Metrics for further clarity
print("\nDetailed Performance Metrics:")
print(comparison_summary)


Comparison Summary:
           Metric  Random Forest  Logistic Regression
0        Accuracy       0.534091             0.531250
1   Precision (1)       0.554404             0.528736
2      Recall (1)       0.578378             0.994595
3    F1-Score (1)       0.566138             0.690432
4  Precision (-1)       0.509434             0.750000
5     Recall (-1)       0.485030             0.017964
6   F1-Score (-1)       0.496933             0.035088

######## Conclusion ########
Random Forest Classifier outperforms Logistic Regression in terms of accuracy.

Detailed Performance Metrics:
           Metric  Random Forest  Logistic Regression
0        Accuracy       0.534091             0.531250
1   Precision (1)       0.554404             0.528736
2      Recall (1)       0.578378             0.994595
3    F1-Score (1)       0.566138             0.690432
4  Precision (-1)       0.509434             0.750000
5     Recall (-1)       0.485030             0.017964
6   F1-Score (-1)       0.496

In [None]:
######## Conclusion ########

#The comparison between Logistic Regression and Random Forest Classifier indicates that the Random Forest Classifier performed slightly better than the Logistic Regression model in terms of accuracy and overall predictive capability.
#The confusion matrix for the Random Forest Classifier shows that it was able to correctly identify more buy and sell signals, with precision values of 0.54 for buy signals (1) and 0.49 for sell signals (-1). The recall for Random Forest was 0.58 for buy signals (1) and 0.46 for sell signals (-1). This leads to F1-scores of 0.56 for buy signals (1) and 0.47 for sell signals (-1).
#In contrast, the Logistic Regression model exhibited higher precision for sell signals (-1) at 0.75 but struggled significantly with recall, achieving only 0.02, which suggests it was less effective at identifying sell signals compared to Random Forest.
#This performance suggests that while Logistic Regression may excel in precision for certain predictions, the Random Forest algorithm, which captures non-linear relationships and interactions between features, is better suited for predicting buy/sell signals based on historical stock prices.
#The implementation of this trading strategy using Random Forest can potentially aid in decision-making for stock purchases and improve overall trading performance.
