In [96]:
import os
import numpy as np
import pandas as pd



In [97]:
INPUT_FOLDER = "enrich"
OUTPUT_FOLDER = ""

In [98]:
key = "us_shareproce_joined_companies"

data = pd.read_csv(f"data/{INPUT_FOLDER}/{key}.csv")
data.info(show_counts=True)
data['Date'] = pd.to_datetime(data['Date'])

data.set_index('Date', inplace=True)  # Set Date as the index


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5380108 entries, 0 to 5380107
Data columns (total 11 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   Date          5380108 non-null  object 
 1   Industry      5380108 non-null  object 
 2   Sector        5380108 non-null  object 
 3   Ticker        5380108 non-null  object 
 4   Open          5380108 non-null  float64
 5   High          5380108 non-null  float64
 6   Low           5380108 non-null  float64
 7   Close         5380108 non-null  float64
 8   Volume        5380108 non-null  int64  
 9   Dividend      5380108 non-null  float64
 10  Company Name  5380108 non-null  object 
dtypes: float64(5), int64(1), object(5)
memory usage: 451.5+ MB


In [99]:
data.head()


Unnamed: 0_level_0,Industry,Sector,Ticker,Open,High,Low,Close,Volume,Dividend,Company Name
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-04-11,Industrial Products,Industrials,AAON,29.9,30.27,29.75,30.08,164812,0.0,AAON INC
2019-04-11,REITs,Real Estate,ESRT,15.89,15.94,15.74,15.83,354551,0.0,"Empire State Realty Trust, Inc."
2019-04-11,Banks,Financial Services,HWC,42.81,43.09,42.24,42.59,337095,0.0,HANCOCK WHITNEY CORP
2019-04-11,Banks,Financial Services,IBCP,22.15,22.34,22.05,22.09,45879,0.0,Independent Bank Corporation
2019-04-11,Banks,Financial Services,IBN,11.25,11.25,11.16,11.19,3291052,0.0,ICICI Bank Limited


In [100]:
data['Sector'].unique()

array(['Industrials', 'Real Estate', 'Financial Services', 'Technology',
       'Basic Materials', 'Utilities', 'Business Services', 'Energy',
       'Healthcare', 'Consumer Defensive', 'Consumer Cyclical', 'Other'],
      dtype=object)

In [101]:
data=data[data['Sector']=='Technology']

In [102]:
data=data[data['Volume']>100000000]

In [None]:
len(data['Ticker'].unique())
data['Ticker'].unique()

array(['NVDA', 'AAPL', 'QCOM', 'TWTR', 'SNAP', 'GOOG', 'AMD', 'AVGO',
       'MU', 'CSCO', 'WHEN', 'DPLS', 'HMNY', 'NOK', 'ANET', 'MVIS',
       'HYSR', 'INTC', 'SCKT', 'PINS', 'ADT', 'SUNW', 'PLTR', 'PPSI',
       'AITX', 'BB', 'DDD', 'BSQR', 'SPRT', 'NUAN', 'LEDS', 'BLIN',
       'TROO', 'AEHR', 'ZNGA', 'RCAT', 'PALT', 'ATVI', 'META', 'CASA',
       'RBLX', 'AVYA', 'BEAT', 'AI', 'SQ', 'PANW', 'SMCI', 'MARK', 'SOUN',
       'AXTI'], dtype=object)

In [109]:
data=data[data['Ticker'].isin(['NVDA','AAPL','SNAP','GOOG','AMD','META'])]

In [None]:
df_appl=AAPL

In [85]:
data.isna().sum()

Industry        0
Sector          0
Ticker          0
Open            0
High            0
Low             0
Close           0
Volume          0
Dividend        0
Company Name    0
dtype: int64

In [86]:
# Drop unused columns
unused_columns = ['Industry', 'Sector', 'Company Name', 'Number Employees','Day','Dividend']
data = data.drop(columns=unused_columns, errors='ignore')  # 'errors=ignore' avoids errors if a column is missing


In [87]:
# Load your dataset
df = data.copy()  # Assuming 'data' is your stock dataset

# Ensure 'Date' is a datetime object and sort by date
df = df.sort_values(by='Date')

# Define the target variable (Predict if stock goes UP or DOWN the next day)
df['Target'] = (df['Close'].shift(-1) > df['Close']).astype(int)  # 1 = Up, 0 = Down
df.dropna(inplace=True)  # Remove NaN values


In [88]:
# Moving Averages (Trend Detection)
df['MA_10'] = df['Close'].rolling(window=10).mean()
df['MA_50'] = df['Close'].rolling(window=50).mean()

# Volatility (10-day Standard Deviation)
df['Volatility'] = df['Close'].rolling(window=10).std()

# Relative Strength Index (RSI)
def compute_rsi(series, window=14):
    delta = series.diff()
    gain = delta.where(delta > 0, 0).rolling(window=window).mean()
    loss = -delta.where(delta < 0, 0).rolling(window=window).mean()
    rs = gain / (loss + 1e-10)  # Avoid division by zero
    return 100 - (100 / (1 + rs))

df['RSI'] = compute_rsi(df['Close'])

# Bollinger Bands (Upper & Lower Price Ranges)
df['BB_Upper'] = df['MA_10'] + (2 * df['Close'].rolling(10).std())
df['BB_Lower'] = df['MA_10'] - (2 * df['Close'].rolling(10).std())

# Drop NaNs caused by rolling calculations
df.dropna(inplace=True)


In [89]:
features = ['Open', 'High', 'Low', 'Volume', 'MA_10', 'MA_50', 'RSI', 'BB_Upper', 'BB_Lower']
target = 'Target'
X = df[features]
y = df[target]


In [90]:
split = int(len(df) * 0.8)  # 80% Train, 20% Test
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")


Train size: 63, Test size: 16


In [91]:
from xgboost import XGBRegressor

model = XGBRegressor(
    n_estimators=500,  # Number of trees
    max_depth=6,        # Tree depth
    learning_rate=0.01, # Step size shrinkage
    subsample=0.2,      # Use 80% of data per tree (prevents overfitting)
    colsample_bytree=0.8, # Use 80% of features per tree
    random_state=42
)

# Train the model
model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],  # Validation set
    verbose=100,
)



[0]	validation_0-rmse:0.49712	validation_1-rmse:0.51415
[100]	validation_0-rmse:0.40378	validation_1-rmse:0.45875
[200]	validation_0-rmse:0.35732	validation_1-rmse:0.45964
[300]	validation_0-rmse:0.32490	validation_1-rmse:0.45567
[400]	validation_0-rmse:0.29674	validation_1-rmse:0.45423
[499]	validation_0-rmse:0.27243	validation_1-rmse:0.45005


In [92]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score,precision_score, recall_score
import numpy as np
# Make Predictions
y_pred = model.predict(X_test)

# Evaluate Performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("📊 XGBoost Model Performance:")
print(f"✅ Mean Absolute Error: {mae:.4f}")
print(f"✅ Mean Squared Error: {mse:.4f}")
print(f"✅ R² Score: {r2:.4f}")

import numpy as np
from sklearn.metrics import precision_score, recall_score

# Convert actual & predicted stock prices into Up (1) or Down (0)
y_test_binary = (y_test.diff().fillna(0) > 0).astype(int)  # 1 = Up, 0 = Down
y_pred_binary = (np.diff(y_pred, prepend=y_pred[0]) > 0).astype(int)  # 1 = Up, 0 = Down

precision = precision_score(y_test_binary, y_pred_binary)
recall = recall_score(y_test_binary, y_pred_binary)

print(f"📊 Classification Metrics:")
print(f"✅ Precision: {precision:.4f}")
print(f"✅ Recall: {recall:.4f}")



📊 XGBoost Model Performance:
✅ Mean Absolute Error: 0.4148
✅ Mean Squared Error: 0.2025
✅ R² Score: 0.0572
📊 Classification Metrics:
✅ Precision: 0.3750
✅ Recall: 0.6000


Arima Model

In [93]:
key = "us_shareproce_joined_companies"

data = pd.read_csv(f"data/{INPUT_FOLDER}/{key}.csv")
data.info(show_counts=True)
data['Date'] = pd.to_datetime(data['Date'])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5380108 entries, 0 to 5380107
Data columns (total 11 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   Date          5380108 non-null  object 
 1   Industry      5380108 non-null  object 
 2   Sector        5380108 non-null  object 
 3   Ticker        5380108 non-null  object 
 4   Open          5380108 non-null  float64
 5   High          5380108 non-null  float64
 6   Low           5380108 non-null  float64
 7   Close         5380108 non-null  float64
 8   Volume        5380108 non-null  int64  
 9   Dividend      5380108 non-null  float64
 10  Company Name  5380108 non-null  object 
dtypes: float64(5), int64(1), object(5)
memory usage: 451.5+ MB


In [94]:
from sklearn.preprocessing import MinMaxScaler

# Create a scaler
scaler = MinMaxScaler()


df = data.groupby("Date")["Close"].transform(lambda x: scaler.fit_transform(x.values.reshape(-1, 1)).flatten())


In [95]:
df=pd.DataFrame(df)
df['Close']

0          3.008000e-07
1          1.583000e-07
2          4.259000e-07
3          2.209000e-07
4          1.119000e-07
               ...     
5380103    8.106122e-05
5380104    0.000000e+00
5380105    8.163265e-08
5380106    5.110204e-06
5380107    1.784490e-05
Name: Close, Length: 5380108, dtype: float64