<a href="https://colab.research.google.com/github/bisman-sodhi/FT-Market-Anomaly-Detection/blob/main/FT_Market_Anomaly_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Milestone 1

In [95]:
!pip install llama-index
!pip install llama-index-llms-groq



In [97]:
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Flatten, Conv1D
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from keras.models import Sequential
from keras.layers import BatchNormalization
from keras.optimizers import Adam
from llama_index.llms.groq import Groq
from llama_index.core.llms import ChatMessage

In [98]:
data = pd.read_csv('data.csv')

In [99]:
def preprocess(date):
  data.rename(columns={'Data': 'Date'}, inplace=True)
  data['Date'] = pd.to_datetime(data['Date'])
  return data

In [100]:
data = preprocess(data)

In [101]:
def normalize(data):
  for col in data.columns:
    if col not in ['Date', 'Y']:
        # Handle potential errors if a column is not numeric
        if pd.api.types.is_numeric_dtype(data[col]):
            scaler = StandardScaler()
            data[col] = scaler.fit_transform(data[[col]])
        else:
            print(f"Column '{col}' is not numeric and cannot be normalized.")

In [102]:
normalize(data)

# Feature Eng

In [103]:
y = data['Y']
X = data.drop(['Date', 'Y'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [104]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
feature_importance = rf.feature_importances_
feature_names = X_train.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

In [105]:
importance_df

Unnamed: 0,Feature,Importance
8,VIX,0.379644
19,GTITL10YR,0.142152
30,LF98TRUU,0.032104
18,GTITL30YR,0.031341
39,MXRU,0.02482
0,XAU BGNL,0.022435
1,ECSURPUS,0.020241
5,JPY,0.019861
37,MXJP,0.019198
2,BDIY,0.01784


In [106]:
important_features = importance_df[importance_df['Importance'] > 0.006948]['Feature'].tolist()
important_features

['VIX',
 'GTITL10YR',
 'LF98TRUU',
 'GTITL30YR',
 'MXRU',
 'XAU BGNL',
 'ECSURPUS',
 'JPY',
 'MXJP',
 'BDIY',
 'GBP',
 'USGG30YR',
 'GTGBP30Y',
 'GTJPY2YR',
 'DXY',
 'LG30TRUU',
 'EONIA',
 'GTJPY30YR',
 'GTITL2YR',
 'USGG3M',
 'CRY',
 'MXEU',
 'Cl1',
 'GT10',
 'USGG2YR',
 'MXIN',
 'GTGBP2Y',
 'GTGBP20Y',
 'GTDEM2Y',
 'US0001M',
 'MXBR',
 'LP01TREU',
 'MXCN']

In [108]:
X_train, X_test, y_train, y_test = train_test_split(X[important_features], y, test_size=0.2, random_state=42)

In [109]:
X_train.columns

Index(['VIX', 'GTITL10YR', 'LF98TRUU', 'GTITL30YR', 'MXRU', 'XAU BGNL',
       'ECSURPUS', 'JPY', 'MXJP', 'BDIY', 'GBP', 'USGG30YR', 'GTGBP30Y',
       'GTJPY2YR', 'DXY', 'LG30TRUU', 'EONIA', 'GTJPY30YR', 'GTITL2YR',
       'USGG3M', 'CRY', 'MXEU', 'Cl1', 'GT10', 'USGG2YR', 'MXIN', 'GTGBP2Y',
       'GTGBP20Y', 'GTDEM2Y', 'US0001M', 'MXBR', 'LP01TREU', 'MXCN'],
      dtype='object')

# Training


In [112]:
from xgboost import XGBClassifier

def find_best_model(X_train, X_test, y_train, y_test):
    """
    Trains multiple models and identifies the one with the highest F1-score.
    """

    models = {
        "Logistic Regression": LogisticRegression(random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
        "Naive Bayes": GaussianNB(),
        "K-Nearest Neighbors": KNeighborsClassifier(),
        "Support Vector Machine": SVC(random_state=42),
        "XGBoost": XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42)
    }

    best_model = None
    best_f1_score = 0

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        f1 = report['weighted avg']['f1-score'] # Use weighted average for imbalanced datasets

        print(f"{name} F1-score: {f1:.4f}")

        if f1 > best_f1_score:
            best_f1_score = f1
            best_model = model

    print(f"\nBest Model: {best_model.__class__.__name__} with F1-score: {best_f1_score:.4f}")
    return best_model

# Assuming X_train, X_test, y_train, y_test are already defined
best_model = find_best_model(X_train, X_test, y_train, y_test)

Logistic Regression F1-score: 0.8832
Decision Tree F1-score: 0.8723
Random Forest F1-score: 0.9112
Naive Bayes F1-score: 0.8057
K-Nearest Neighbors F1-score: 0.9061
Support Vector Machine F1-score: 0.8762
XGBoost F1-score: 0.9079

Best Model: RandomForestClassifier with F1-score: 0.9112


In [113]:
import joblib

joblib.dump(best_model, 'model.joblib')

['model.joblib']

# Generate Sim data

In [114]:
sim_data = pd.read_csv('data.csv')

In [116]:
with open('model.joblib', 'rb') as file:
    model = joblib.load(file)


In [118]:
def generate_data(df, ml_trained_col, ml_model, n_periods):
    """
    Generate synthetic data based on a given DataFrame, normalize it,
    and use a pre-trained model to make predictions.

    Parameters:
    df (pd.DataFrame): Input DataFrame with historical data, including 'Date' and 'Y' columns.
    ml_trained_col (list): List of important features used by the trained ML model.
    ml_model: Trained ML model for making predictions.
    n_periods (int): Number of periods (e.g., days, weeks) to generate synthetic data for.

    Returns:
    None
    """
    # df = pd.read_csv('data.csv')
    # Preprocess the 'Date' column
    df.rename(columns={'Data': 'Date'}, inplace=True)
    df['Date'] = pd.to_datetime(df['Date'])

    # Step 1: Calculate min and max for each column
    min_max_values = df.agg(['min', 'max'])

    # Create a dictionary to hold synthetic data for each feature
    sim_dict = {}
    for col in df.columns:
        if col not in ['Date', 'Y']:  # Exclude 'Date' and 'Y' from feature processing
            # Generate random data within the min-max range of each column
            sim_dict[col] = np.random.uniform(
                min_max_values[col]['min'],
                min_max_values[col]['max'],
                n_periods  # Generate specified number of periods
            )

    # Convert synthetic data to a DataFrame
    training_data = pd.DataFrame(sim_dict)

    # Step 2: Calculate statistics (mean, std, min, max) for each feature
    scalers = {}
    for col in training_data.columns:
        if pd.api.types.is_numeric_dtype(training_data[col]):  # Process numeric features only
            scaler = StandardScaler()
            scaler.fit(training_data[[col]])
            scalers[col] = {
                'mean': scaler.mean_[0],
                'std': scaler.scale_[0],
                'min': training_data[col].min(),
                'max': training_data[col].max()
            }

    # Step 3: Generate synthetic data based on feature statistics
    generated_data = {}
    for col in training_data.columns:
        # Generate data within the min-max range for each feature
        generated_data[col] = np.random.uniform(
            low=scalers[col]['min'],
            high=scalers[col]['max'],
            size=n_periods
        )

    # Generate a 'Date' column starting after the last date in the input DataFrame
    generated_data['Date'] = pd.date_range(
        start=df['Date'].max() + pd.Timedelta(weeks=1),
        periods=n_periods,
        freq='W'  # Weekly data; change to 'D' for daily if needed
    )

    # Convert synthetic data to a DataFrame
    new_data = pd.DataFrame(generated_data)

    # Step 4: Normalize the generated data
    normalized_data = new_data.copy()
    for col in new_data.columns:
        if col in scalers:  # Normalize only features present in the scalers dictionary
            normalized_data[col] = (
                new_data[col] - scalers[col]['mean']
            ) / scalers[col]['std']

    # Step 5: Select only the important features used by the trained ML model
    normalized_data_important = normalized_data[ml_trained_col]

    # Step 6: Make predictions using the ML model
    predictions = ml_model.predict(normalized_data_important)
    probability_predictions = ml_model.predict_proba(normalized_data_important)
    # print(f"Generated Data for {n_periods} periods (Raw):\n", new_data)
    # print("\nNormalized Data:\n", normalized_data)
    print("\nModel Predictions:\n", predictions)
    positive_class_probabilities = probability_predictions[:, 1]
    print("\nProbability of Crash Class:\n", positive_class_probabilities)

    return positive_class_probabilities

p1 = generate_data(sim_data, important_features, model, n_periods=4)
p2 = generate_data(sim_data, important_features, model, n_periods=7)
p3 = generate_data(sim_data, important_features, model, n_periods=12)
p4 = generate_data(sim_data, important_features, model, n_periods=36)
p5 = generate_data(sim_data, important_features, model, n_periods=50)


Model Predictions:
 [0 0 0 0]

Probability of Crash Class:
 [0.23 0.26 0.34 0.42]

Model Predictions:
 [0 0 0 0 0 0 0]

Probability of Crash Class:
 [0.24 0.5  0.38 0.46 0.35 0.38 0.42]

Model Predictions:
 [0 0 0 0 0 0 0 0 0 0 0 0]

Probability of Crash Class:
 [0.43 0.37 0.36 0.47 0.44 0.5  0.27 0.26 0.49 0.22 0.43 0.46]

Model Predictions:
 [0 1 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0]

Probability of Crash Class:
 [0.31 0.52 0.36 0.48 0.41 0.48 0.5  0.43 0.68 0.34 0.37 0.55 0.12 0.59
 0.34 0.52 0.36 0.58 0.43 0.53 0.43 0.38 0.42 0.46 0.34 0.37 0.27 0.14
 0.43 0.52 0.44 0.62 0.27 0.45 0.43 0.49]

Model Predictions:
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 1 0 1 1 1 0 0 0 0 1 0 0 0 0 0 1
 0 0 0 1 0 1 0 0 0 0 0 0 0]

Probability of Crash Class:
 [0.37 0.27 0.35 0.25 0.18 0.41 0.41 0.37 0.36 0.52 0.25 0.28 0.24 0.45
 0.35 0.46 0.27 0.6  0.45 0.44 0.53 0.51 0.33 0.52 0.53 0.58 0.35 0.27
 0.42 0.44 0.65 0.39 0.45 0.26 0.48 0.43 0.54 0.48 0.42 0.43 0.65 0.4

# MILE STONE 2

In [119]:
!pip install yfinance



In [120]:
def allocate_investment(probabilities, tickers, returns, risk_levels, investment_amount):
    """
    Investment allocation based on crash probabilities, returns, and risk levels with bearish and bullish strategies.

    Args:
        probabilities: List of crash probabilities for each period.
        tickers: List of tickers in the portfolio.
        returns: Dictionary of expected returns per ticker.
        risk_levels: List of risk levels corresponding to each ticker (e.g., 'high', 'moderate', 'low').
        investment_amount: Total amount to invest.

    Returns:
        total_investment: Total invested amount.
        investment_allocation: Dictionary showing how much is invested in each ticker.
    """
    # Define weights for each risk level
    risk_weights = {'high': 0.5, 'moderate': 1.0, 'low': 1.5}
    market_trends = ''

    # Calculate proportional weights based on returns and risk levels
    adjusted_weights = {
        ticker: returns[ticker] * risk_weights[risk]
        for ticker, risk in zip(tickers, risk_levels)
    }

    # Normalize weights to sum to 1
    total_weight = sum(adjusted_weights.values())
    normalized_weights = {ticker: weight / total_weight for ticker, weight in adjusted_weights.items()}

    # Calculate average crash probability
    avg_crash_prob = np.mean(probabilities)

    # Define strategies based on market conditions
    if avg_crash_prob < 0.5:  # Bullish Market
        market_trends = 'Bullish'
        strategy_weights = {'high': 0.6, 'moderate': 0.3, 'low': 0.1}
    else:  # Bearish Market
        market_trends = 'Bearish'
        strategy_weights = {'high': 0.2, 'moderate': 0.3, 'low': 0.5}

    # Adjust normalized weights based on the market strategy
    final_weights = {
        ticker: normalized_weights[ticker] * strategy_weights[risk]
        for ticker, risk in zip(tickers, risk_levels)
    }

    # Normalize the final weights to sum to 1
    total_final_weight = sum(final_weights.values())
    final_weights = {ticker: weight / total_final_weight for ticker, weight in final_weights.items()}

    # Allocate investment based on final weights
    investment_allocation = {
        ticker: final_weights[ticker] * investment_amount
        for ticker in tickers
    }

    # Ensure the total investment matches the investment amount
    total_investment = sum(investment_allocation.values())

    return market_trends, total_investment, investment_allocation


# Example Usage
investment_amount = 10000  # Total amount to invest
tickers = ['AAPL', 'TSLA', 'XLY', 'XLU', 'TLT', 'GLD']
returns = {'AAPL': 0.6, 'TSLA': 1.2, 'XLY': 0.8, 'XLU': 0.3, 'TLT': 0.2, 'GLD': 0.3}
risk_levels = ['high', 'high', 'moderate', 'low', 'low', 'low']
probabilities = generate_data(sim_data, important_features, model, n_periods=4)

market_trends, total_investment, investment_allocation = allocate_investment(
    probabilities, tickers, returns, risk_levels, investment_amount
)
print(f"Market Trend: {market_trends}")
print(f"Total Investment: ${total_investment:.2f}")
print("Investment Allocations per Ticker:")
for ticker, amount in investment_allocation.items():
    print(f"{ticker}: ${amount:.2f}")



Model Predictions:
 [0 1 1 0]

Probability of Crash Class:
 [0.35 0.51 0.53 0.35]
Market Trend: Bullish
Total Investment: $10000.00
Investment Allocations per Ticker:
AAPL: $2000.00
TSLA: $4000.00
XLY: $2666.67
XLU: $500.00
TLT: $333.33
GLD: $500.00


# Milestone 3

In [121]:
from google.colab import userdata
groq_api_key = userdata.get('GROQ_API_KEY')

if groq_api_key:
    print("Groq API key loaded successfully.")
    # Now you can use the groq_api_key variable in your code
else:
    print("Error: Groq API key not found in secrets.")

Groq API key loaded successfully.


In [122]:
llm = Groq(model="llama-3.3-70b-versatile", api_key=groq_api_key)

In [123]:
strategy = """This strategy takes into account market conditions, individual asset risks, and returns to allocate investments intelligently. Here's a step-by-step explanation:
1. Understanding Inputs:
•Probabilities: These are predicted probabilities of a market crash for upcoming periods.
•Tickers: The stock or asset identifiers (e.g., AAPL, TSLA).
•Returns: Expected returns for each asset.
•Risk Levels: Assets are categorized into risk levels (high, moderate, low).
•Investment Amount: The total amount of money you want to invest.
2. Market Trend Analysis:
•The average crash probability is calculated.
•If the average probability is less than 50%, the market is considered bullish (optimistic growth 'outlook').
•If the probability is 50% or more, the market is considered bearish (higher risk of downturn).
3. Define Strategy for Each Market Trend:
•Bullish Market (Optimistic):
•Prioritize high-risk assets for growth potential: 60% to high-risk, 30% to moderate-risk, and 10% to low-risk.
•Bearish Market (Cautious):
•Shift focus to low-risk assets: 20% to high-risk, 30% to moderate-risk, and 50% to low-risk.
4. Adjust Asset Weights:
•Each asset's weight is initially calculated based on its expected returns and risk level:
•Higher returns increase the weight.
•Higher risk decreases the weight by multiplying it with a risk factor:
•High-risk assets are scaled down (factor = 0.5).
•Moderate-risk assets remain neutral (factor = 1.0).
•Low-risk assets are scaled up (factor = 1.5).
•These adjusted weights are normalized so they sum to 1.
5. Apply the Market Strategy:
•The weights are modified further based on the current market trend.
•For example:
•In a bearish market, low-risk assets receive a higher allocation according to the strategy.
6. Allocate Investments:
•Using the final weights, the total investment amount is distributed across the tickers proportionally.
•The strategy ensures that all available funds are allocated while respecting the calculated weights.
Example Walkthrough:
Let's say:
•Investment Amount: $10,000
•Tickers: AAPL, TSLA, XLY, XLU, TLT, GLD
•Returns: {AAPL: 0.6, TSLA: 1.2, XLY: 0.8, XLU: 0.3, TLT: 0.2, GLD: 0.3}
•Risk Levels: [high, high, moderate, low, low, low]
•Probabilities: [0.2, 0.3, 0.4, 0.2] (average crash probability: 0.275 → Bullish market).
Result:
1.Market Trend: Bullish.
2.Investment Allocation:
•High-risk assets (AAPL, TSLA): 60% of investments, distributed proportionally to their weights.
•Moderate-risk asset (XLY): 30% of investments.
•Low-risk assets (XLU, TLT, GLD): 10% of investments.
3.Total Investment: $10,000 distributed as per the above proportions.
"""

system_prompt = f"""You are a financial assistant AI designed to help users understand market crash predictions and data-driven investment strategies. Your goal is to explain complex financial concepts in simple, actionable terms. You should provide clear and concise, responses tailored to the user's level of expertise.
You are using {strategy} this strategy to make investment. Remember to use strategy while explaining how a person should make investments."""

In [124]:
chat_question = f"Explain the how the function allocate_investment chose to make investments "
messages = [
    ChatMessage(
        role="system", content=system_prompt
    ),
    ChatMessage(role="user", content=chat_question),
]
resp = llm.chat(messages)
print(resp)

assistant: The `allocate_investment` function uses a data-driven strategy to allocate investments across different assets. Here's a step-by-step breakdown of how it works:

1. **Understanding Inputs**: The function takes in several inputs, including:
	* `probabilities`: Predicted probabilities of a market crash for upcoming periods.
	* `tickers`: Stock or asset identifiers (e.g., AAPL, TSLA).
	* `returns`: Expected returns for each asset.
	* `risk_levels`: Assets are categorized into risk levels (high, moderate, low).
	* `investment_amount`: The total amount of money to invest.
2. **Market Trend Analysis**: The function calculates the average crash probability and determines the market trend:
	* If the average probability is less than 50%, the market is considered **bullish** (optimistic growth outlook).
	* If the probability is 50% or more, the market is considered **bearish** (higher risk of downturn).
3. **Define Strategy for Each Market Trend**: Based on the market trend, the funct

In [125]:
def follow_up(question):
  messages.append(ChatMessage(role="user", content=question))
  new_resp = llm.chat(messages)
  print(new_resp)

In [126]:
q1 = "what is a bearish trend? How should I make investments during the bearish trend?"

In [127]:
follow_up(q1)

assistant: **Understanding Bearish Trend:**
A bearish trend, also known as a bear market, is a period of time when the overall market is expected to decline or is already declining. This is characterized by a higher probability of a market crash, typically above 50%. During a bearish trend, investors are cautious, and the market sentiment is negative.

**Investment Strategy during Bearish Trend:**
When the market is bearish, our investment strategy shifts focus to low-risk assets to minimize potential losses. According to our strategy, during a bearish trend:

1. **Low-risk assets** receive a higher allocation: 50% of the total investment amount.
2. **Moderate-risk assets** receive a moderate allocation: 30% of the total investment amount.
3. **High-risk assets** receive a lower allocation: 20% of the total investment amount.

**Why this allocation?**
By allocating a larger portion of the investment to low-risk assets, we aim to reduce the overall risk of the portfolio and protect agai

In [128]:
q2 = "Suggest some other stocks to invest in"

In [129]:
follow_up(q2)

assistant: I'd be happy to explain how the `allocate_investment` function works and provide guidance on investing during a bearish trend.

**How the `allocate_investment` function works:**

The `allocate_investment` function takes into account market conditions, individual asset risks, and returns to allocate investments intelligently. Here's a step-by-step breakdown:

1. **Understanding Inputs**: The function receives inputs such as probabilities of a market crash, tickers (stock or asset identifiers), expected returns, risk levels, and the total investment amount.
2. **Market Trend Analysis**: The function calculates the average crash probability and determines the market trend. If the average probability is less than 50%, the market is considered bullish (optimistic growth outlook). If the probability is 50% or more, the market is considered bearish (higher risk of downturn).
3. **Define Strategy for Each Market Trend**: Based on the market trend, the function defines a strategy for