**Bushra Hoteit**

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [None]:
# TODO: Use Project 2 and save its output as the cleaned data.

In [None]:
# Load the stock metadata and the daily price history from the working directory.
# Any failure is reported with a short message and then re-raised, so execution
# stops in this cell instead of continuing with undefined frames.
try:
  required_files = ("historical_stocks.csv", "historical_stock_prices.csv")
  # Name the files that are actually absent so the reader knows what to fetch.
  missing_files = [path for path in required_files if not os.path.exists(path)]
  if missing_files:
    raise FileNotFoundError(
      f"CSV files not found: {', '.join(missing_files)}. "
      "Please ensure they are in your working directory."
    )

  stocks = pd.read_csv("historical_stocks.csv", encoding='utf-8')
  stock_prices = pd.read_csv("historical_stock_prices.csv", encoding='utf-8')

  # A file with a header but no rows loads fine yet is useless downstream.
  if stocks.empty or stock_prices.empty:
    raise ValueError("One or both of the files loaded are empty. Please check the data.")
  print("Successfully loaded both datasets!")

except Exception as e:
  print(f"Error loading data: {str(e)}")
  raise


In [None]:
# Parse the price dates, attach the per-ticker metadata to every price row,
# then index the combined frame by date in ascending order.
stock_prices['date'] = pd.to_datetime(stock_prices['date'])
data = (
  pd.merge(stocks, stock_prices, on='ticker')
    .set_index('date')
    .sort_index()
)
data.head()

**1. Feature Engineering with Technical Indicators**

***Moving Average Convergence Divergence (MACD)***

In [None]:
def EMA(data, period=12, column='close', group_col=None):
  """Exponential moving average of ``data[column]``.

  Parameters
  ----------
  data : pandas.DataFrame
    Frame containing ``column`` (and ``group_col`` when it is given).
  period : int, default 12
    Span passed to ``Series.ewm`` (``adjust=False``).
  column : str, default 'close'
    Name of the column to smooth.
  group_col : str or None, default None
    When given, the average is computed independently inside each group
    of ``data[group_col]`` so values from different groups never mix.
    ``None`` smooths the column as one continuous series.

  Returns
  -------
  pandas.Series
    Smoothed values, one per row of ``data``, in the same row order.
  """
  def smooth(series):
    return series.ewm(span=period, adjust=False).mean()

  if group_col is None:
    return smooth(data[column])
  return data.groupby(group_col, sort=False)[column].transform(smooth)


# `data` is the ticker-keyed merge sorted by date, so rows of different tickers
# are presumably interleaved; every indicator is therefore computed per ticker.
data['EMA12'] = EMA(data, period=12, group_col='ticker')
data['EMA26'] = EMA(data, period=26, group_col='ticker')
data['MACD'] = data['EMA12'] - data['EMA26']
# The keyword is `column` (singular); `columns=` raised a TypeError here.
data['Signal_Line'] = EMA(data, period=9, column='MACD', group_col='ticker')


# MACD above its signal line flags a buy, below it flags a sell.
data['Buy_Signal_MACD'] = data['MACD'] > data['Signal_Line']
data['Sell_Signal_MACD'] = data['MACD'] < data['Signal_Line']


***Relative Strength Index (RSI)***

In [None]:
def RSI(data, period=14, group_col=None):
  """Relative Strength Index of ``data['close']``.

  Gains and losses are the positive and negative parts of the row-to-row
  change in ``close``; each is smoothed with ``ewm(span=period, adjust=False)``
  and combined as ``100 - 100 / (1 + gain / loss)``.

  Parameters
  ----------
  data : pandas.DataFrame
    Frame containing a ``close`` column (and ``group_col`` when given).
  period : int, default 14
    Span of the exponential smoothing.
  group_col : str or None, default None
    When given, the index is computed independently inside each group of
    ``data[group_col]`` so the price change is never taken between rows of
    different groups. ``None`` treats the column as one continuous series.

  Returns
  -------
  pandas.Series
    RSI values, one per row of ``data``, in the same row order.
  """
  def rsi_of(close):
    delta = close.diff()
    gain = delta.where(delta > 0, 0).ewm(span=period, adjust=False).mean()
    loss = -delta.where(delta < 0, 0).ewm(span=period, adjust=False).mean()
    RS = gain / loss
    return 100 - (100 / (1 + RS))

  if group_col is None:
    return rsi_of(data['close'])
  return data.groupby(group_col, sort=False)['close'].transform(rsi_of)

# `data` is the ticker-keyed merge sorted by date, so rows of different tickers
# are presumably interleaved; compute the RSI per ticker.
data['RSI'] = RSI(data, group_col='ticker')

In [None]:
# RSI under 30 flags a buy; RSI over 70 flags a sell.
data['Buy_Signal_RSI'] = data['RSI'].lt(30)
data['Sell_Signal_RSI'] = data['RSI'].gt(70)
data.head()

In [None]:
# A row is labelled 'Buy' only when both indicators agree on buying, 'Sell'
# only when both agree on selling, and 'Hold' otherwise. 'Buy' is checked first.
buy_mask = data['Buy_Signal_MACD'] & data['Buy_Signal_RSI']
sell_mask = data['Sell_Signal_MACD'] & data['Sell_Signal_RSI']
data['Signal'] = np.select([buy_mask, sell_mask], ['Buy', 'Sell'], default='Hold')
data.head()

**2. Data Preparation and Splitting**

***Integrate the indicators and the signals into the main dataset. Use the signals computed above as ground-truth labels for the dataset. Split the data into training and testing sets.***

In [None]:
features = ['RSI', 'MACD', 'volume', 'rolling_avg', 'volatility']

# Check the feature columns up front: a column that was never created would
# otherwise surface as a bare KeyError from the indexing below.
missing_features = [col for col in features if col not in data.columns]
if missing_features:
  raise KeyError(
    f"Feature columns missing from data: {missing_features}. "
    "Load the cleaned dataset that contains them or compute them before this cell."
  )

# Replace every remaining NaN with 0 before the frame is handed to the estimators.
data.fillna(0, inplace=True)

# 80/20 split with a fixed seed; the 'Signal' column is the label.
X_train, X_test, y_train, y_test = train_test_split(
  data[features], data['Signal'], test_size=0.2, random_state=42
)

**3. Model Building and Validation**

***Implement (1) Logistic Regression, (2) Random Forests, and (3) Support Vector Machines (SVM). Train the models on the training set and validate their performance. Training can be time-consuming, depending on your computer's processing power.***

In [None]:
# Logistic regression
# The features are on very different scales (e.g. volume vs. RSI), so they are
# standardised first; the scaler is fitted on the training data only because it
# is part of the pipeline. max_iter is raised above the default of 100 to give
# the solver room to converge.
lr_model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

In [None]:
# Random Forests
# A fixed random_state makes the fitted forest (and its predictions) repeatable
# across runs; n_jobs=-1 builds the trees on all available cores.
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

In [None]:
# SVM
# SVMs are not scale invariant, so the features are standardised inside the
# pipeline (scaler fitted on the training data only) before the SVC sees them.
# NOTE(review): SVC training time grows at least quadratically with the number
# of rows; confirm the training set is small enough for this to finish.
svm_model = make_pipeline(StandardScaler(), SVC())
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

**4. Model Evaluation and Optimization**

***Evaluate the models on the test set. Optimize the models based on the evaluation metrics, adjusting hyperparameters as needed to improve performance. Use cross-validation where applicable to ensure the robustness of the evaluation.***

In [None]:
def evaluate_model(predictions, actual):
  """Return the accuracy of ``predictions`` measured against ``actual``.

  Parameters
  ----------
  predictions : array-like
    Labels produced by a model.
  actual : array-like
    True labels, in the same order as ``predictions``.

  Returns
  -------
  float
    Result of ``accuracy_score`` for the two label sequences.
  """
  return accuracy_score(y_true=actual, y_pred=predictions)

# Score each model's predictions against the held-out test labels.
lr_metrics = evaluate_model(lr_pred, y_test)
rf_metrics = evaluate_model(rf_pred, y_test)
svm_metrics = evaluate_model(svm_pred, y_test)

# Show the three scores side by side; the cell previously ended on an
# assignment and therefore displayed nothing.
pd.Series(
  {'Logistic Regression': lr_metrics, 'Random Forest': rf_metrics, 'SVM': svm_metrics},
  name='accuracy',
)


**5. Reporting and Documentation**