Bushra Hoteit

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import os



In [9]:
try:
  if not os.path.exists("cleaned_stock_data_new.csv"):
    raise FileNotFoundError("CSV file not found. Please ensure it is in your working directory.")
  else:
    df=pd.read_csv("cleaned_stock_data_new.csv", encoding='utf-8')
  if df.empty:
    raise ValueError("The file loaded is empty. Please check the data.")
  print("Successfully loaded dataset!")

except Exception as e:
  print(f"Error loading data: {str(e)}")
  raise


Successfully loaded dataset!


In [13]:
df.tail()

Unnamed: 0,ticker,open,close,adj_close,low,high,volume,exchange,name,sector,industry,decade,rolling_avg_close,volatility,sector_encoded,industry_encoded,exchange_encoded
20973884,INST,38.200001,39.099998,39.099998,38.200001,39.150002,240400,NYSE,"INSTRUCTURE, INC.",TECHNOLOGY,COMPUTER SOFTWARE: PREPACKAGED SOFTWARE,2010,34.3895,19.99155,10,30,1
20973885,SQQQ,12.12,11.89,11.89,11.87,12.12,13058000,NASDAQ,PROSHARES ULTRAPRO SHORT QQQ,Unknown,Unknown,2010,32.2495,20.020159,12,133,0
20973886,PBFX,21.799999,21.6,21.6,21.5,21.85,87000,NYSE,PBF LOGISTICS LP,ENERGY,NATURAL GAS DISTRIBUTION,2010,31.7345,19.908982,5,85,1
20973887,POPE,73.0,63.049998,73.0,73.0,73.0,1100,NASDAQ,POPE RESOURCES,CONSUMER NON-DURABLES,ENVIRONMENTAL SERVICES,2010,34.08,19.971487,3,48,0
20973888,NZF,14.6,14.69,14.69,14.59,14.69,180900,NYSE,NUVEEN MUNICIPAL CREDIT INCOME FUND,Unknown,Unknown,2010,31.662,20.024798,12,133,1


1. Feature Engineering with Technical Indicators

Moving Average Convergence Divergence (MACD)

In [15]:
# Function to calculate exponential moving average of 'close' column
def EMA(df, period =12, column='close'):
  return df[column].ewm(span = period, adjust = False).mean()

# Calculating the EMA's to get the MACD line & signal line
df['EMA12'] = EMA(df, period = 12)
df['EMA26'] = EMA(df, period = 26)
df['MACD'] = df['EMA12'] - df['EMA26']
df['Signal_Line'] = EMA(df, period = 9, column='MACD')

# Buy & Sell signals
df['Buy_Signal_MACD'] = df['MACD'] > df['Signal_Line']
df['Sell_Signal_MACD'] = df['MACD'] < df['Signal_Line']


In [17]:
# View of the data with the MACD signals

df[['ticker','close','EMA12', 'EMA26', 'MACD', 'Signal_Line', 'Buy_Signal_MACD', 'Sell_Signal_MACD']].head()


Unnamed: 0,ticker,close,EMA12,EMA26,MACD,Signal_Line,Buy_Signal_MACD,Sell_Signal_MACD
0,XOM,1.9375,1.9375,1.9375,0.0,0.0,False,False
1,AA,7.140915,2.738025,2.322938,0.415087,0.083017,True,False
2,BA,0.979424,2.467471,2.223419,0.244053,0.115225,True,False
3,DIS,0.683144,2.192959,2.109324,0.083635,0.108907,False,True
4,PG,1.71875,2.120004,2.080393,0.039611,0.095048,False,True


Relative Strength Index (RSI)

In [19]:
# Function to calculate RSI using the relative strength of price action

def RSI(df, period = 14):
  delta = df['close'].diff()
  gain = (delta.where(delta > 0, 0).ewm(span = period, adjust = False).mean())
  loss = (-delta.where(delta < 0, 0).ewm(span = period, adjust = False).mean())
  RS = gain / loss
  return (100 - (100 / (1 + RS)))

df['RSI'] = RSI(df)

# Buy & Sell signals
df['Buy_Signal_RSI'] = df['RSI'] < 30
df['Sell_Signal_RSI'] = df['RSI'] > 70


In [21]:
# View of the data with the RSI signals
df[['ticker','close','RSI', 'Buy_Signal_RSI', 'Sell_Signal_RSI']].tail()

Unnamed: 0,ticker,close,RSI,Buy_Signal_RSI,Sell_Signal_RSI
20973884,INST,39.099998,51.48937,False,False
20973885,SQQQ,11.89,44.357179,False,False
20973886,PBFX,21.6,47.359545,False,False
20973887,POPE,63.049998,58.412341,False,False
20973888,NZF,14.69,45.540054,False,False


In [23]:
# Signals based on both MACD & RSI
df['Signal'] = np.where(df['Buy_Signal_MACD'] & df['Buy_Signal_RSI'], 'Buy',
                          np.where(df['Sell_Signal_MACD'] & df['Sell_Signal_RSI'], 'Sell', 'Hold'))
df.tail()

Unnamed: 0,ticker,open,close,adj_close,low,high,volume,exchange,name,sector,...,EMA12,EMA26,MACD,Signal_Line,Buy_Signal_MACD,Sell_Signal_MACD,RSI,Buy_Signal_RSI,Sell_Signal_RSI,Signal
20973884,INST,38.200001,39.099998,39.099998,38.200001,39.150002,240400,NYSE,"INSTRUCTURE, INC.",TECHNOLOGY,...,34.428489,32.316856,2.111633,1.408355,True,False,51.48937,False,False,Hold
20973885,SQQQ,12.12,11.89,11.89,11.87,12.12,13058000,NASDAQ,PROSHARES ULTRAPRO SHORT QQQ,Unknown,...,30.961029,30.803756,0.157273,1.158139,False,True,44.357179,False,False,Hold
20973886,PBFX,21.799999,21.6,21.6,21.5,21.85,87000,NYSE,PBF LOGISTICS LP,ENERGY,...,29.520871,30.121996,-0.601125,0.806286,False,True,47.359545,False,False,Hold
20973887,POPE,73.0,63.049998,73.0,73.0,73.0,1100,NASDAQ,POPE RESOURCES,CONSUMER NON-DURABLES,...,34.679198,32.561107,2.118091,1.068647,True,False,58.412341,False,False,Hold
20973888,NZF,14.6,14.69,14.69,14.59,14.69,180900,NYSE,NUVEEN MUNICIPAL CREDIT INCOME FUND,Unknown,...,31.603937,31.237321,0.366615,0.928241,False,True,45.540054,False,False,Hold


2. Data Preparation and Splitting

Integrate the indicators and the signals into the main dataset. Use the signals computed above as ground-truth labels for the dataset. Split the data into training and testing sets.

In [25]:
df.fillna(0, inplace=True)

# Splitting the data into training & testing sets

X = df[['RSI', 'MACD', 'volume', 'rolling_avg_close', 'volatility']]
y = df['Signal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

3. Model Building and Validation

Implement 1-Logistic Regression, 2-Random Forests, 3-Support Vector Machines (SVM). Train the models on the training set and validate their performance. Training them can be time-consuming, depending on your computer's processing power.

In [57]:
# Logistic regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

In [59]:
# Random Forests
rf_model = RandomForestClassifier(max_depth=10, n_estimators=50, n_jobs=-1, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

In [61]:
# SVM
svm_model = SVC(kernel='rbf', gamma='scale', random_state=42)
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

4. Model Evaluation and Optimization

Evaluate the models on the test set. Optimize the models based on evaluation metrics and adjust hyperparameters as needed to improve performance. Use cross-validation where applicable to ensure the robustness of the evaluation

In [63]:
# Function for evaluation metrics

def evaluate_model(predictions, actual):
    accuracy = accuracy_score(actual, predictions)
    cm = confusion_matrix(actual, predictions)
    cr = classification_report(actual, predictions)
    print("Accuracy:", accuracy)
    print("Confusion Matrix:\n", cm)
    print("Classification Report:\n", cr)
    return accuracy

print("Logistic Regression Evaluation:")
lr_metrics = evaluate_model(lr_pred, y_test)

print("\nRandom Forest Evaluation:")
rf_metrics = evaluate_model(rf_pred, y_test)

print("\nSVM Evaluation:")
svm_metrics = evaluate_model(svm_pred, y_test)


Logistic Regression Evaluation:




Accuracy: 1.0
Confusion Matrix:
 [[4194778]]
Classification Report:
               precision    recall  f1-score   support

        Hold       1.00      1.00      1.00   4194778

    accuracy                           1.00   4194778
   macro avg       1.00      1.00      1.00   4194778
weighted avg       1.00      1.00      1.00   4194778


Random Forest Evaluation:




Accuracy: 1.0
Confusion Matrix:
 [[4194778]]
Classification Report:
               precision    recall  f1-score   support

        Hold       1.00      1.00      1.00   4194778

    accuracy                           1.00   4194778
   macro avg       1.00      1.00      1.00   4194778
weighted avg       1.00      1.00      1.00   4194778


SVM Evaluation:




Accuracy: 1.0
Confusion Matrix:
 [[4194778]]
Classification Report:
               precision    recall  f1-score   support

        Hold       1.00      1.00      1.00   4194778

    accuracy                           1.00   4194778
   macro avg       1.00      1.00      1.00   4194778
weighted avg       1.00      1.00      1.00   4194778



The Signal column is only giving 'Hold' results due to RSI & MACD signals not have the same condition at the same time. 
This is creating errors in the models & giving us an accuracy of 1.
