## Libraries Used

In [None]:
# System / OS Handling (Standard Library)
import os

# Data Handling
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Model Evaluation
from sklearn.metrics import (
    accuracy_score, 
    precision_score,
    recall_score,
    f1_score,
    classification_report, 
    confusion_matrix, 
    ConfusionMatrixDisplay,
    roc_curve,
    roc_auc_score
)

# Display Formatting
from IPython.display import display, HTML


## Dataset extraction and organization
### Load and combine all datasets
### Data Cleaning and Preparation


In [None]:
# Make sure the data folder exists. NOTE: this only creates an empty folder;
# the two CSV files read below must already be inside it, otherwise
# pd.read_csv raises FileNotFoundError.
os.makedirs("data", exist_ok=True)

# Load the raw price history and the per-company metadata.
prices_df = pd.read_csv("data/prices-split-adjusted.csv")
securities_df = pd.read_csv("data/securities.csv")

# Show the first rows of both raw tables. A bare expression in the middle of a
# cell is evaluated and discarded, so display() is required to render them.
display(prices_df.head())
display(securities_df.head())

# Metadata rows of every company in the Information Technology sector.
it_companies = securities_df[securities_df['GICS Sector'] == 'Information Technology']

# The 10 tickers used throughout the analysis (AAPL is the focus company).
selected_companies = ['AAPL', 'MSFT', 'ORCL', 'IBM', 'INTC', 'CSCO', 'HPQ', 'ADBE', 'NVDA', 'TXN']
print(f"Selected Companies: {', '.join(selected_companies)}")

# Keep only the price rows of the selected companies. The .copy() detaches the
# result from prices_df so the column assignment below works on its own frame.
filtered_prices = prices_df[prices_df['symbol'].isin(selected_companies)].copy()

# Parse the 'date' strings into datetime values.
filtered_prices['date'] = pd.to_datetime(filtered_prices['date'])

# Order by company first, then by date, so that rows are chronological within
# each stock.
filtered_prices = filtered_prices.sort_values(by=['symbol', 'date'])

# Report how many rows are exact duplicates before they are removed.
print("Number of duplicated rows:", filtered_prices.duplicated().sum())

# Remove rows with any missing value, then exact duplicate rows.
filtered_prices = filtered_prices.dropna().drop_duplicates()

# Keep only the columns the rest of the notebook uses.
filtered_prices = filtered_prices[['date', 'symbol', 'open', 'close', 'volume']]

# 'df' is a shorter name for the cleaned frame (same object, not a copy).
df = filtered_prices

# Preview the cleaned data to confirm its structure.
print("=====================================================")
print("Preview of cleaned dataset:")
display(df.head())

# Summary statistics of the numeric columns. Kept as the final expression of
# the cell so that the notebook renders the resulting table.
print("=====================================================")
print("Statistical summary of the dataset:")
df.describe()







## Exploratory Data Analysis Before Feature Engineering


### Distribution Comparison of Closing Prices and Trading Volumes (AAPL vs. IT Sector)

In [None]:
# Model 1 looks at AAPL alone; Model 2 uses every selected company (AAPL included).
model_1_data = df[df['symbol'] == 'AAPL']
model_2_data = df

# One overlaid histogram figure per entry: (column, x-axis label, figure title).
distribution_plots = [
    ('close', "Closing Price", "Distribution of Closing Prices (AAPL vs IT Sector)"),
    ('volume', "Trading Volume", "Distribution of Trading Volume (AAPL vs IT Sector)"),
]

for column, axis_label, figure_title in distribution_plots:
    plt.figure(figsize=(12, 5))
    # AAPL is drawn first; the all-companies histogram is drawn semi-transparent
    # on top so both shapes stay visible.
    sns.histplot(model_1_data[column], bins=50, label='AAPL', kde=True, color='blue')
    sns.histplot(model_2_data[column], bins=50, label='All IT Companies', kde=True, color='red', alpha=0.5)
    plt.xlabel(axis_label)
    plt.ylabel("Frequency")
    plt.title(figure_title)
    plt.legend()
    plt.show()


### Closing Price Trends Over Time (AAPL vs IT Sector)


In [None]:
# Closing price over time: AAPL as a single highlighted line, the other
# companies as one line per symbol.
plt.figure(figsize=(12, 6))

# AAPL only
sns.lineplot(data=model_1_data, x='date', y='close', label='AAPL', color='blue')

# All other IT companies (AAPL excluded so it is not drawn twice)
other_companies = model_2_data[model_2_data['symbol'] != 'AAPL']
sns.lineplot(data=other_companies, x='date', y='close', hue='symbol', alpha=0.6)

plt.xlabel("Date")
plt.ylabel("Closing Price")
plt.title("Stock Price Trends Over Time (AAPL vs IT Sector)")
plt.xticks(rotation=45)

# Create the legend once, anchored just outside the right edge of the axes,
# and only then call tight_layout() so the layout pass can reserve room for it
# instead of leaving it clipped at the figure border.
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.tight_layout()
plt.show()


### Trading Volume Trends Over Time (AAPL vs IT Sector)


In [None]:
# Trading volume over time, drawn on an explicit figure/axes pair.
volume_fig, volume_ax = plt.subplots(figsize=(12, 6))

# AAPL as a single blue line.
sns.lineplot(data=model_1_data, x='date', y='volume', label='AAPL', color='blue', ax=volume_ax)

# Remaining companies: one line per symbol, slightly transparent.
sns.lineplot(data=other_companies, x='date', y='volume', hue='symbol', alpha=0.6, ax=volume_ax)

volume_ax.set_xlabel("Date")
volume_ax.set_ylabel("Trading Volume")
volume_ax.set_title("Trading Volume Trends Over Time (AAPL vs IT Sector)")
plt.xticks(rotation=45)
volume_ax.legend()
volume_fig.tight_layout()
plt.show()


### Heatmap of Pairwise Correlations Between Stock Closing Prices (IT Sector)

In [None]:
# Reshape to a wide table: one row per date, one column per company, cells
# holding that company's closing price on that date.
pivot_data = df.pivot(index='date', columns='symbol', values='close')

# Pairwise correlation between the companies' closing-price columns.
correlation_matrix = pivot_data.corr()

# Annotated heatmap of the correlation matrix, colour scale centred on zero.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Correlation Matrix of Stock Closing Prices (IT Sector)")
heatmap_fig.tight_layout()
plt.show()

### Target Variable Distribution: Price Increase vs. No Increase


### AAPL Closing Price vs. 10-Day Rolling Average

## Feature Engineering

In [None]:
# We initially decided to drop two companies (IBM and HPQ) because they showed
# negative correlations, but the effect on the results was not significant, so
# the exclusion is left disabled:
# df = df[~df['symbol'].isin(['IBM', 'HPQ'])]


def _rolling_mean_10(series):
    """Return the 10-row rolling mean of *series*; the first 9 rows are NaN."""
    return series.rolling(window=10).mean()


# Re-sort by company and date. This was already done during cleaning, but every
# diff/rolling/shift below depends on rows being chronological within a symbol.
df = df.sort_values(by=['symbol', 'date'])

# Relative open-to-close move of each row: (close - open) / open.
intraday_move = df['close'] - df['open']
df['daily_change'] = intraday_move / df['open']

# Change in volume compared with the previous row of the same symbol.
df['volume_change'] = df.groupby('symbol')['volume'].diff()

# 10-row rolling averages of close and volume, computed per symbol so that a
# window never mixes two companies.
df['rolling_close_mean'] = df.groupby('symbol')['close'].transform(_rolling_mean_10)
df['rolling_volume_mean'] = df.groupby('symbol')['volume'].transform(_rolling_mean_10)

# Binary target: 1 when the next row's close (same symbol) is higher than this
# row's close, otherwise 0.
df['next_close'] = df.groupby('symbol')['close'].shift(-1)
df['target'] = (df['next_close'] > df['close']).astype(int)

# Drop the rows left incomplete by the operations above: the first rows of each
# symbol (rolling window / diff) and the last row of each symbol (shift).
df = df.dropna()

# Quick look at the engineered columns.
display(df[['daily_change', 'volume_change', 'rolling_close_mean', 'rolling_volume_mean', 'target']].head())





### EDA Feature Engineering

##### Rolling Volume vs. Raw Volume for 1 Company

In [None]:
# Compare AAPL's raw volume with its 10-day rolling volume mean.
# Indexing by date lets plt.plot use the dates as the x-axis.
sample = df[df['symbol'] == 'AAPL'].set_index('date')

plt.figure(figsize=(12, 6))

# One line per series: (column in `sample`, legend label).
volume_series = (
    ('volume', 'Volume'),
    ('rolling_volume_mean', '10-Day Rolling Average'),
)
for column, legend_label in volume_series:
    plt.plot(sample[column], label=legend_label)

plt.title('AAPL Volume vs 10-Day Rolling Volume')
plt.ylabel('Volume')
plt.grid(True)
plt.legend()

plt.show()


## Train-Test Split

In [None]:
# sklearn.model_selection.train_test_split is deliberately not used: it
# shuffles rows by default, and for time-series data (stock prices) the
# training set must lie strictly before the test set to avoid data leakage.


def _chronological_split(data, train_fraction=0.8):
    """Split *data* into (train, test) frames using a calendar-date cutoff.

    The cutoff is the date found `train_fraction` of the way through the
    sorted unique values of data['date']. Rows dated before the cutoff go to
    the training frame; rows on or after it go to the test frame. Every symbol
    is therefore cut at the same date, and no training row is later than any
    test row.

    A plain positional split (iloc) is only chronological for a single symbol:
    the combined frame is sorted by symbol first, so slicing it by position
    would put whole companies, not later dates, into the test set.

    Args:
        data: DataFrame with a 'date' column.
        train_fraction: Share of the unique dates assigned to training.

    Returns:
        Tuple (train, test) of independent DataFrame copies.
    """
    unique_dates = data['date'].drop_duplicates().sort_values()
    cutoff_position = int(len(unique_dates) * train_fraction)

    # No dates at all, or a fraction that leaves nothing for testing:
    # everything is training data and the test frame is empty.
    if cutoff_position >= len(unique_dates):
        return data.copy(), data.iloc[0:0].copy()

    cutoff_date = unique_dates.iloc[cutoff_position]
    is_train = data['date'] < cutoff_date
    return data[is_train].copy(), data[~is_train].copy()


# Model 1 uses AAPL only; Model 2 uses AAPL plus the other companies.
# Copies are taken so later changes cannot affect `df`.
model_1_data = df[df['symbol'] == 'AAPL'].copy()
model_2_data = df.copy()

# Chronological 80% / 20% split. This must run while 'date' is still present.
model_1_train, model_1_test = _chronological_split(model_1_data)  # Model 1 (AAPL only)
model_2_train, model_2_test = _chronological_split(model_2_data)  # Model 2 (AAPL + others)

# Number of training rows per model. For Model 2 the training rows are not a
# contiguous prefix of model_2_data, because that frame is ordered by symbol.
model_1_split_index = len(model_1_train)
model_2_split_index = len(model_2_train)

# Columns that are identifiers, raw prices, or the look-ahead value used to
# build the target; none of them are model features.
drop_cols = ['symbol', 'next_close', 'date', 'open', 'close', 'volume']
model_1_data = model_1_data.drop(columns=drop_cols)
model_2_data = model_2_data.drop(columns=drop_cols)
model_1_train = model_1_train.drop(columns=drop_cols)
model_1_test = model_1_test.drop(columns=drop_cols)
model_2_train = model_2_train.drop(columns=drop_cols)
model_2_test = model_2_test.drop(columns=drop_cols)

# Separate the features (x) from the binary target (y).
# Features: daily_change, volume_change, rolling_close_mean, rolling_volume_mean.

# Model 1
x__train_1 = model_1_train.drop(columns='target')
y__train_1 = model_1_train['target']
x__test_1 = model_1_test.drop(columns='target')
y__test_1 = model_1_test['target']

# Model 2
x__train_2 = model_2_train.drop(columns='target')
y__train_2 = model_2_train['target']
x__test_2 = model_2_test.drop(columns='target')
y__test_2 = model_2_test['target']

# Shapes of each dataset
print('Model 1 ( AAPL only ):')
print(f'Training set: {x__train_1.shape}, Testing set: {x__test_1.shape}')
print('Model 2 ( AAPL + Others ):')
print(f'Training set: {x__train_2.shape}, Testing set: {x__test_2.shape}')

## Models Implementation

In [None]:
# ---- Model 1: AAPL only ----
# The scaler is fitted on the training features only and then applied to both
# sets, so no test-set statistics are used during training.
scaler1 = StandardScaler().fit(x__train_1)
x_train_1_scaled = scaler1.transform(x__train_1)
x_test_1_scaled = scaler1.transform(x__test_1)

clf1 = LogisticRegression(max_iter=1000).fit(x_train_1_scaled, y__train_1)
y_prediction_1 = clf1.predict(x_test_1_scaled)

print(accuracy_score(y__test_1, y_prediction_1))
print(classification_report(y__test_1, y_prediction_1))


# ---- Model 2: 10 IT sector companies ----
# Same pipeline with its own scaler and classifier.
scaler2 = StandardScaler().fit(x__train_2)
x_train_2_scaled = scaler2.transform(x__train_2)
x_test_2_scaled = scaler2.transform(x__test_2)

clf2 = LogisticRegression(max_iter=1000).fit(x_train_2_scaled, y__train_2)
y_prediction_2 = clf2.predict(x_test_2_scaled)

print(accuracy_score(y__test_2, y_prediction_2))
print(classification_report(y__test_2, y_prediction_2))

### Models Evaluation

In [None]:
def evaluate_model(y_true, y_pred, model_name):
    """Print a labelled classification report for one model.

    The report starts with a header naming the model, followed by accuracy
    and then precision, recall and F1 score, each given for the 'Up' class
    (label 1), the 'Down' class (label 0) and as the macro (unweighted)
    average of the two classes.

    Args:
        y_true: True binary labels (0 or 1).
        y_pred: Predicted binary labels, aligned with ``y_true``.
        model_name: Human-readable model name shown in the report header.

    Returns:
        None. All results are written to standard output.
    """
    # Header so that consecutive reports can be told apart.
    print("=====================================================")
    print(f"Evaluation metrics for {model_name}:")

    # Overall share of correct predictions.
    print("Accuracy:", round(accuracy_score(y_true, y_pred), 4))

    # Each scorer is reported three ways, in the same order for every metric.
    for metric_label, metric in (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    ):
        # Default pos_label is 1, i.e. the 'Up' class.
        print(f"{metric_label} (Up):", round(metric(y_true, y_pred), 4))
        # Same metric with class 0 ('Down') treated as the positive class.
        print(f"{metric_label} (Down):", round(metric(y_true, y_pred, pos_label=0), 4))
        # Unweighted mean of the two per-class values.
        print(f"{metric_label} (Macro Average):", round(metric(y_true, y_pred, average='macro'), 4))

# Run the metric report for each fitted model in turn:
# (model name, true test labels, predicted test labels).
for eval_name, eval_truth, eval_predictions in (
    ("Model 1 (AAPL Only)", y__test_1, y_prediction_1),
    ("Model 2 (AAPL + Others)", y__test_2, y_prediction_2),
):
    evaluate_model(eval_truth, eval_predictions, eval_name)

In [None]:
# Test-set accuracy of each model, computed from its actual predictions.
accuracy_model1 = accuracy_score(y__test_1, y_prediction_1)
accuracy_model2 = accuracy_score(y__test_2, y_prediction_2)

# Bar labels and heights, in matching order.
model_names = ['Model 1 (AAPL Only)', 'Model 2 (AAPL + Others)']
accuracies = [accuracy_model1, accuracy_model2]

# One bar per model.
accuracy_fig, accuracy_ax = plt.subplots(figsize=(7, 5))
bars = accuracy_ax.bar(model_names, accuracies, color=['steelblue', 'seagreen'])

# Write each accuracy as a percentage, centred slightly above its bar.
for bar in bars:
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    accuracy_ax.text(label_x, height + 0.01, f"{height:.2%}", ha='center', fontsize=12)

# Fixed 0-1 scale so the two bars are compared on the full accuracy range.
accuracy_ax.set_ylim(0, 1)
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.set_title('Accuracy Comparison Between Model 1 and Model 2')
accuracy_ax.grid(axis='y', linestyle='--', alpha=0.6)

accuracy_fig.tight_layout()
plt.show()


In [None]:
# ---- Model 1 (AAPL) ----
# Probability of class 1 from the already-fitted classifier (no re-fitting).
proba_1 = clf1.predict_proba(x_test_1_scaled)[:, 1]
fpr1, tpr1, _ = roc_curve(y__test_1, proba_1)
auc_1 = round(roc_auc_score(y__test_1, proba_1), 4)

# ---- Model 2 (All IT) ----
proba_2 = clf2.predict_proba(x_test_2_scaled)[:, 1]
fpr2, tpr2, _ = roc_curve(y__test_2, proba_2)
auc_2 = round(roc_auc_score(y__test_2, proba_2), 4)

# Report the area under each curve.
print("AUC for Model 1 (AAPL):", auc_1)
print("AUC for Model 2 (All IT):", auc_2)

# Both ROC curves on one axes, plus the diagonal of a random classifier.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr1, tpr1, label=f"Model 1 (AAPL) - AUC = {auc_1}")
roc_ax.plot(fpr2, tpr2, label=f"Model 2 (All IT) - AUC = {auc_2}")
roc_ax.plot([0, 1], [0, 1], linestyle="--", color="gray")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve Comparison")
roc_ax.legend()
roc_ax.grid(True)
plt.show()




In [None]:
# Raw-count confusion matrices of both models on their test sets.
cm_model1 = confusion_matrix(y__test_1, y_prediction_1)
cm_model2 = confusion_matrix(y__test_2, y_prediction_2)

# One figure, two side-by-side panels.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

disp1 = ConfusionMatrixDisplay(confusion_matrix=cm_model1, display_labels=[0, 1])
disp2 = ConfusionMatrixDisplay(confusion_matrix=cm_model2, display_labels=[0, 1])

# Per panel: (display object, target axes, colour map, panel title).
panels = (
    (disp1, axes[0], 'Blues', 'Model 1: Confusion Matrix (Raw Counts)'),
    (disp2, axes[1], 'Greens', 'Model 2: Confusion Matrix (Raw Counts)'),
)
for matrix_display, panel_ax, colormap, panel_title in panels:
    matrix_display.plot(ax=panel_ax, cmap=colormap, colorbar=False)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Predicted Label')
    panel_ax.set_ylabel('True Label')

# Adjust layout
plt.tight_layout()
plt.show()
