<a href="https://colab.research.google.com/github/cepdnaclk/e18-6sp-Finding-the-Outliers-Group20/blob/main/6sp_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Import libraries

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [None]:
# Step 1: Data Collection
# Read 'stock_data.csv' into the working DataFrame used by every later cell.
data = pd.read_csv(filepath_or_buffer='stock_data.csv')


In [None]:
# Step 2: Data Preprocessing
# Parse the 'Date' column into pandas datetime values.
data['Date'] = data['Date'].pipe(pd.to_datetime)

In [None]:
# Step 3: Feature Engineering
# Moving average: mean of 'Price' over a trailing window of rows.
# The first (window - 1) rows have no full window and therefore come out as NaN.
window = 5  # number of rows per moving-average window
price_windows = data['Price'].rolling(window=window)
data['MovingAverage'] = price_windows.mean()

In [None]:
# Price volatility: standard deviation of 'Price' over a trailing window of rows.
# NOTE: this rebinds the notebook-level `window` that the previous cell set.
window = 10  # number of rows per volatility window
volatility_windows = data['Price'].rolling(window=window)
data['PriceVolatility'] = volatility_windows.std()

In [None]:
# Relative Strength Index (RSI), using simple rolling means of gains and losses.
# NOTE: this rebinds the notebook-level `window`; after this cell it stays at 14.
window = 14  # number of price changes averaged per RSI value

# Row-to-row price change; the first entry is NaN because it has no predecessor.
delta = data['Price'].diff()

# Split the changes into non-negative gains and non-negative loss magnitudes.
# Clipping leaves NaN untouched, so the leading NaN is preserved in both series.
gain = delta.clip(lower=0)
loss = -delta.clip(upper=0)

average_gain = gain.rolling(window=window).mean()
average_loss = loss.rolling(window=window).mean()

# Relative strength = average gain / average loss, mapped onto the 0-100 RSI scale.
rs = average_gain / average_loss
data['RSI'] = 100 - (100 / (1 + rs))


In [None]:
# Step 4: Define Outliers
# Z-score cutoff used when flagging outliers: a row is marked as deviating when the
# absolute Z-score of any engineered feature is strictly greater than this value.
threshold = 2  # in standard deviations; adjust to make the outlier flag stricter or looser


In [None]:
# Standardise each engineered feature: (value - column mean) / column std.
zscore_columns = {
    'MovingAverage': 'Zscore_MovingAverage',
    'PriceVolatility': 'Zscore_PriceVolatility',
    'RSI': 'Zscore_RSI',
}
for feature_name, zscore_name in zscore_columns.items():
    feature_values = data[feature_name]
    data[zscore_name] = (feature_values - feature_values.mean()) / feature_values.std()

# A row is an outlier (1) when any of its absolute Z-scores exceeds `threshold`,
# otherwise 0. NaN Z-scores never compare greater, so they do not trigger the flag.
exceeds_threshold = data[list(zscore_columns.values())].abs().gt(threshold)
data['DeviationFromCrowd'] = np.where(exceeds_threshold.any(axis=1), 1, 0)


In [None]:
# Step 5: Model Training
# Columns fed to the classifier and the label it learns to predict.
features = ['MovingAverage', 'PriceVolatility', 'RSI']
target = 'DeviationFromCrowd'

# The rolling-window features are NaN for the first rows of the series (no full
# window yet). Those rows carry no usable signal and are always labelled 0, because
# NaN Z-scores never exceed the threshold. Older scikit-learn releases also reject
# NaN input outright, so train and evaluate only on rows with complete features.
model_data = data.dropna(subset=features)

# Hold out 20% of the complete rows for evaluation; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    model_data[features],
    model_data[target],
    test_size=0.2,
    random_state=42,
)

# Seed the forest as well, so a Restart & Run All reproduces the same model and accuracy.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

In [None]:
# Step 6: Model Evaluation
# Predict labels for the held-out rows.
y_pred = rf_classifier.predict(X_test)

# Accuracy = fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
# Step 7: Model Deployment and Prediction
# Load new, unseen data
new_data = pd.read_csv('new_stock_data.csv')

# Rebuild the training features on the new data.
# The window sizes are written out explicitly (5 / 10 / 14, matching the training
# cells) because the notebook-level `window` only holds the last value assigned to
# it (14, from the RSI cell). Reusing it would compute the moving average and the
# volatility over the wrong window.
new_data['Date'] = pd.to_datetime(new_data['Date'])
new_data['MovingAverage'] = new_data['Price'].rolling(window=5).mean()
new_data['PriceVolatility'] = new_data['Price'].rolling(window=10).std()

# RSI has to be derived from the new prices. Reusing `rs` from the training cell
# would copy the training data's relative strength onto the new rows.
new_delta = new_data['Price'].diff()
new_gain = new_delta.mask(new_delta < 0, 0)
new_loss = -new_delta.mask(new_delta > 0, 0)
new_rs = new_gain.rolling(window=14).mean() / new_loss.rolling(window=14).mean()
new_data['RSI'] = 100 - (100 / (1 + new_rs))

# Rows whose rolling windows are still incomplete have NaN features and cannot be
# scored meaningfully, so predict only on rows with every feature present.
# `new_features.index` maps each prediction back to its row in `new_data`.
new_features = new_data[features].dropna()

# Make predictions on the new data
new_predictions = rf_classifier.predict(new_features)
print("New Predictions:", new_predictions)