In [1]:
import os
from dotenv import load_dotenv

In [2]:
api_key = os.getenv('API_KEY')

In [3]:
import pandas as pd
import numpy as np
import simfin as sf
from simfin.names import *

# Set SimFin data directory
sf.set_api_key(api_key)

sf.set_data_dir('/Users/ayushsingh/Desktop/MBD/Python 2/Group Assignment/simfin_data/')

# Load datasets
df_companies = sf.load_companies(market='us')
df_share_prices = sf.load_shareprices(market='us', variant='daily')




Dataset "us-companies" on disk (0 days old).
- Loading from disk ... Done!
Dataset "us-shareprices-daily" on disk (0 days old).
- Loading from disk ... 

  df = pd.read_csv(path, sep=';', header=0,
  df = pd.read_csv(path, sep=';', header=0,


Done!


In [4]:
# Resetting the index and converting columns to lower case

df_share_prices.reset_index(inplace=True)
df_companies.reset_index(inplace=True)
df_share_prices.columns = df_share_prices.columns.str.lower()
df_companies.columns = df_companies.columns.str.lower()



In [5]:

df_share_prices['date'] = pd.to_datetime(df_share_prices['date'], errors='coerce')

# Drop duplicate rows
df_companies.drop_duplicates(inplace=True)
df_share_prices.drop_duplicates(inplace=True)

# Handle missing values
df_companies.fillna(method='ffill', inplace=True)  # Forward fill missing company data
df_share_prices.fillna(method='ffill', inplace=True)  # Forward fill stock prices

# Remove rows where essential data is missing
df_share_prices.dropna(subset=['close'], inplace=True)


  df_companies.fillna(method='ffill', inplace=True)  # Forward fill missing company data
  df_share_prices.fillna(method='ffill', inplace=True)  # Forward fill stock prices


In [8]:
Q1 = df_share_prices['close'].quantile(0.25)
Q3 = df_share_prices['close'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df_share_prices = df_share_prices[(df_share_prices['close'] >= lower_bound) & (df_share_prices['close'] <= upper_bound)]

# Feature Engineering - Create lag and moving average features
df_share_prices['prev_close'] = df_share_prices.groupby('ticker')['close'].shift(1)
df_share_prices['ma7'] = df_share_prices.groupby('ticker')['close'].transform(lambda x: x.rolling(7).mean())
df_share_prices['ma30'] = df_share_prices.groupby('ticker')['close'].transform(lambda x: x.rolling(30).mean())

# Merge company details with share prices
df_cleaned = df_share_prices.merge(df_companies, on='ticker', how='left')

df_cleaned.to_csv('cleaned_stock_data.csv', index=False)
print("✅ Data saved as 'cleaned_stock_data.csv' instead of Excel due to size limitations.")


✅ Data saved as 'cleaned_stock_data.csv' instead of Excel due to size limitations.


In [39]:
df_cleaned

Unnamed: 0,ticker,date,simfinid_x,open,high,low,close,adj. close,volume,dividend,...,simfinid_y,company name,industryid,isin,end of financial year (month),number employees,business summary,market,cik,main currency
0,A,2019-04-11,45846,81.88,81.92,80.89,81.08,77.87,1071479,,...,45846,AGILENT TECHNOLOGIES INC,106001.0,US00846U1016,10.0,16400.0,Agilent Technologies Inc is engaged in life sc...,us,1090872.0,USD
1,A,2019-04-12,45846,81.43,82.06,80.90,80.98,77.77,1249295,,...,45846,AGILENT TECHNOLOGIES INC,106001.0,US00846U1016,10.0,16400.0,Agilent Technologies Inc is engaged in life sc...,us,1090872.0,USD
2,A,2019-04-15,45846,81.00,81.13,79.91,80.40,77.22,1627268,,...,45846,AGILENT TECHNOLOGIES INC,106001.0,US00846U1016,10.0,16400.0,Agilent Technologies Inc is engaged in life sc...,us,1090872.0,USD
3,A,2019-04-16,45846,80.82,80.96,77.19,77.55,74.48,3441597,,...,45846,AGILENT TECHNOLOGIES INC,106001.0,US00846U1016,10.0,16400.0,Agilent Technologies Inc is engaged in life sc...,us,1090872.0,USD
4,A,2019-04-17,45846,78.15,78.32,74.46,75.43,72.44,4471971,,...,45846,AGILENT TECHNOLOGIES INC,106001.0,US00846U1016,10.0,16400.0,Agilent Technologies Inc is engaged in life sc...,us,1090872.0,USD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5023036,ZYXI,2024-03-14,171401,12.69,12.73,12.18,12.19,12.19,313909,0.1,...,7962794,"BurgerFi International, Inc.",106004.0,US98986M1036,12.0,768.0,"Zynex, Inc. engages in the design, manufacture...",us,1705873.0,USD
5023037,ZYXI,2024-03-14,171401,12.69,12.73,12.18,12.19,12.19,313909,0.1,...,689587,"BurgerFi International, Inc.",106004.0,US98986M1036,12.0,768.0,"Zynex, Inc. engages in the design, manufacture...",us,1705873.0,USD
5023038,ZYXI,2024-03-14,171401,12.69,12.73,12.18,12.19,12.19,313909,0.1,...,6480955,"BurgerFi International, Inc.",106004.0,US98986M1036,12.0,768.0,"Zynex, Inc. engages in the design, manufacture...",us,1906324.0,USD
5023039,ZYXI,2024-03-14,171401,12.69,12.73,12.18,12.19,12.19,313909,0.1,...,6481013,"BurgerFi International, Inc.",106004.0,US98986M1036,12.0,768.0,"Zynex, Inc. engages in the design, manufacture...",us,1130464.0,USD


In [12]:
df_cleaned.columns

Index(['ticker', 'date', 'simfinid_x', 'open', 'high', 'low', 'close',
       'adj. close', 'volume', 'dividend', 'shares outstanding', 'prev_close',
       'ma7', 'ma30', 'simfinid_y', 'company name', 'industryid', 'isin',
       'end of financial year (month)', 'number employees', 'business summary',
       'market', 'cik', 'main currency'],
      dtype='object')

In [30]:
df_google = df_cleaned[df_cleaned['ticker'] == 'GOOG'].copy()
# Sort by date
df_google = df_google.sort_values(by='date')


In [32]:
# Create target variable: 1 if next day's close price is higher, else 0
df_google['trend'] = (df_google['close'].shift(-1) > df_google['close']).astype(int)

# Drop last row (it has no future data for prediction)
df_google = df_google[:-1]

# Feature Engineering: Create useful indicators
df_google['daily_return'] = df_google['close'].pct_change()  # Daily return
df_google['volatility'] = df_google['close'].rolling(7).std()  # 7-day Volatility
df_google['ma7'] = df_google['close'].rolling(7).mean()  # 7-day Moving Average
df_google['ma30'] = df_google['close'].rolling(30).mean()  # 30-day Moving Average

# Drop NaN values (from rolling window calculations)
df_google.dropna(inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_google['daily_return'] = df_google['close'].pct_change()  # Daily return
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_google['volatility'] = df_google['close'].rolling(7).std()  # 7-day Volatility
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_google['ma7'] = df_google['close'].rolling(7

In [33]:
from sklearn.model_selection import train_test_split

# Select Features & Target
features = ['close', 'daily_return', 'volatility', 'ma7', 'ma30']
X = df_google[features]
y = df_google['trend']

# Split data into Training (80%) and Testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [36]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees
    'max_depth': [None, 10, 20],  # Tree depth
    'min_samples_split': [2, 5, 10],  # Minimum samples to split
    'min_samples_leaf': [1, 2, 4]  # Minimum leaf size
}

# Initialize model
rf = RandomForestClassifier(random_state=42)

# Grid Search
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

print("✅ Best Model Parameters:", grid_search.best_params_)


✅ Best Model Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}


In [37]:
from sklearn.metrics import accuracy_score, classification_report

# Make Predictions
y_pred = best_model.predict(X_test)

# Check Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Optimized Model Accuracy: {accuracy:.2f}")

# Show Classification Report
print(classification_report(y_test, y_pred))



✅ Optimized Model Accuracy: 0.66
              precision    recall  f1-score   support

           0       0.69      0.35      0.47        31
           1       0.66      0.88      0.75        43

    accuracy                           0.66        74
   macro avg       0.67      0.62      0.61        74
weighted avg       0.67      0.66      0.63        74



In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load cleaned stock data
df = pd.read_csv('cleaned_stock_data.csv')

# 🔹 User Input: Select a Company
company_ticker = input("Enter the company ticker (e.g., GOOGL, AAPL, MSFT): ").strip().upper()

# 🔹 Check if the company exists
if company_ticker not in df['ticker'].unique():
    print(f"❌ Error: Company '{company_ticker}' not found in dataset.")
    exit()

# Filter for selected company
df_company = df[df['ticker'] == company_ticker].copy()

# Convert 'date' to datetime and sort by date
df_company['date'] = pd.to_datetime(df_company['date'])
df_company = df_company.sort_values(by='date')

# 🔹 Create Target Variable: Trend Prediction (1 = Up, 0 = Down)
df_company['trend'] = (df_company['close'].shift(-1) > df_company['close']).astype(int)

# Drop last row (no future data)
df_company = df_company[:-1]

# 🔹 Feature Engineering
df_company['daily_return'] = df_company['close'].pct_change()
df_company['volatility'] = df_company['close'].rolling(7).std()
df_company['ma7'] = df_company['close'].rolling(7).mean()
df_company['ma30'] = df_company['close'].rolling(30).mean()
df_company.dropna(inplace=True)  # Remove NaN values

# 🔹 Prepare Training Data
features = ['close', 'daily_return', 'volatility', 'ma7', 'ma30']
X = df_company[features]
y = df_company['trend']

# Split into training & testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 🔹 Hyperparameter Tuning with GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get Best Model
best_model = grid_search.best_estimator_
print(f"✅ Best Parameters: {grid_search.best_params_}")

# Evaluate Model
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Model Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

# 🔹 Predict Next Day's Trend
latest_data = X.iloc[[-1]]  # Get the most recent day’s features
prediction = best_model.predict(latest_data)

# Show Prediction
trend_prediction = "Up 📈" if prediction[0] == 1 else "Down 📉"
print(f"📊 Prediction for {company_ticker} on the next trading day: {trend_prediction}")

# Save Model
joblib.dump(best_model, f'{company_ticker}_trend_model.pkl')
print(f"✅ Model saved as '{company_ticker}_trend_model.pkl'")


✅ Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
✅ Model Accuracy: 0.66
              precision    recall  f1-score   support

           0       0.69      0.35      0.47        31
           1       0.66      0.88      0.75        43

    accuracy                           0.66        74
   macro avg       0.67      0.62      0.61        74
weighted avg       0.67      0.66      0.63        74

📊 Prediction for GOOG on the next trading day: Up 📈
✅ Model saved as 'GOOG_trend_model.pkl'
