In [7]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, make_scorer, f1_score
from sklearn.preprocessing import StandardScaler
import pickle

In [10]:


def fetch_stock_data(ticker, start_date, end_date):
    stock = yf.Ticker(ticker)
    df = stock.history(start=start_date, end=end_date)
    return df

def calculate_rsi(prices, window=14):
    delta = prices.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

def calculate_macd(prices, slow=26, fast=12, signal=9):
    exp1 = prices.ewm(span=fast, adjust=False).mean()
    exp2 = prices.ewm(span=slow, adjust=False).mean()
    macd = exp1 - exp2
    signal_line = macd.ewm(span=signal, adjust=False).mean()
    return macd - signal_line

def calculate_bollinger_bands(prices, window=20, num_std=2):
    rolling_mean = prices.rolling(window=window).mean()
    rolling_std = prices.rolling(window=window).std()
    upper_band = rolling_mean + (rolling_std * num_std)
    lower_band = rolling_mean - (rolling_std * num_std)
    return upper_band, lower_band

def create_features(df):
    df['SMA_20'] = df['Close'].rolling(window=20).mean()
    df['SMA_50'] = df['Close'].rolling(window=50).mean()
    df['RSI'] = calculate_rsi(df['Close'])
    df['MACD'] = calculate_macd(df['Close'])
    df['BB_high'], df['BB_low'] = calculate_bollinger_bands(df['Close'])
    df['Volatility'] = df['Close'].pct_change().rolling(window=20).std()
    df['Momentum'] = df['Close'].pct_change(periods=10)
    return df

def create_labels(df, future_days=5, threshold=0.02):
    df['Future_Return'] = df['Close'].pct_change(periods=future_days).shift(-future_days)
    df['Label'] = np.where(df['Future_Return'] > threshold, 'buy',
                           np.where(df['Future_Return'] < -threshold, 'sell', 'hold'))
    return df

def get_recommendation(model, latest_data, scaler):
    scaled_data = scaler.transform(latest_data)
    prediction = model.predict(scaled_data)[0]
    confidence = np.max(model.predict_proba(scaled_data)[0])
    return prediction, confidence

# Main execution
if __name__ == "__main__":
    # Fetch data
    ticker = 'AAPL'
    start_date = '2010-01-01'
    end_date = '2023-01-01'
    df = fetch_stock_data(ticker, start_date, end_date)

    # Prepare features and labels
    df = create_features(df)
    df = create_labels(df)
    df = df.dropna()

    # Prepare data for training
    features = ['SMA_20', 'SMA_50', 'RSI', 'MACD', 'BB_high', 'BB_low', 'Volatility', 'Momentum']
    X = df[features]
    y = df['Label']

    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Define parameter grid for GridSearchCV
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Create a custom scorer that uses macro F1-score
    f1_scorer = make_scorer(f1_score, average='macro')

    # Perform GridSearchCV
    grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring=f1_scorer, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Evaluate model
    y_pred = best_model.predict(X_test)
    print(classification_report(y_test, y_pred))

    # Example usage of get_recommendation
    latest_data = X.iloc[-1:] # Get the most recent data point
    recommendation, confidence = get_recommendation(best_model, latest_data, scaler)
    print(f"Recommendation: {recommendation}")
    print(f"Confidence: {confidence:.2f}")

    # Feature importance
    feature_importance = pd.DataFrame({'feature': features, 'importance': best_model.feature_importances_})
    feature_importance = feature_importance.sort_values('importance', ascending=False)
    print("\nFeature Importance:")
    print(feature_importance)

    # Save the model as a pickle file
    with open('stock_prediction_model.pkl', 'wb') as file:
        pickle.dump((best_model, scaler), file)
    print("\nModel saved as 'stock_prediction_model.pkl'")

              precision    recall  f1-score   support

         buy       0.69      0.69      0.69       212
        hold       0.64      0.68      0.66       270
        sell       0.77      0.67      0.72       162

    accuracy                           0.68       644
   macro avg       0.70      0.68      0.69       644
weighted avg       0.69      0.68      0.68       644

Recommendation: hold
Confidence: 0.79

Feature Importance:
      feature  importance
6  Volatility    0.135891
1      SMA_50    0.131752
5      BB_low    0.128209
4     BB_high    0.128030
0      SMA_20    0.125910
3        MACD    0.120908
7    Momentum    0.118293
2         RSI    0.111008

Model saved as 'stock_prediction_model.pkl'
