In [1]:
!pip install imbalanced-learn
!pip install --upgrade pandas
!pip install optuna
!pip install xgboost
!pip install python-dotenv



In [2]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.over_sampling import SMOTENC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
import pickle
import optuna
from xgboost import XGBClassifier
from sklearn.decomposition import PCA

warnings.filterwarnings('ignore')

In [3]:
from all_functions import load_and_analyze_data, rank_columns_by_correlation, remove_correlated_features, \
                           split_data, preprocess_data, perform_stratified_cv_with_smote, \
                           feature_importance_analysis, apply_pca, load_models, evaluate_models

In [10]:
def main_pipeline(stock_ticker, split_year, *, num_data=None, full_data=None):
    """Run the full prediction pipeline and print one ticker's probability.

    Steps, in order: rank correlated columns and drop them, split the data,
    preprocess the categorical columns, select important features, apply PCA,
    load the model stored as ``'lr.pkl'``, evaluate it, and print the
    ``'Predicted_Prob_Logistic Regression'`` value for ``stock_ticker``.

    Parameters
    ----------
    stock_ticker : str
        Symbol looked up in the ``symbol`` column of the test split.
    split_year : int
        Forwarded unchanged to ``split_data``.
    num_data, full_data : optional, keyword-only
        Datasets handed to ``rank_columns_by_correlation`` and
        ``remove_correlated_features`` respectively. When omitted, the
        notebook-level globals ``num_dataset`` / ``full_dataset`` are used,
        so existing two-argument calls behave as before.

    Returns
    -------
    None
        The outcome is reported with ``print`` only.
    """
    # Fall back to the notebook globals so callers that rely on them keep working.
    if num_data is None:
        num_data = num_dataset
    if full_data is None:
        full_data = full_dataset

    # Analyze for correlations and remove highly correlated features
    # (0.8 is the value passed to the ranking helper as its second argument).
    correlation_df = rank_columns_by_correlation(num_data, 0.8)
    reduced_dataset = remove_correlated_features(full_data, correlation_df)

    # Split data
    X_train, y_train, X_test, y_test = split_data(reduced_dataset, split_year)

    # Preprocess data
    categorical_columns = ['industry', 'sector', 'symbol']  # Example categorical columns
    X_train_transformed, X_test_transformed = preprocess_data(X_train, X_test, categorical_columns)

    # Feature importance analysis and PCA; the importance table is not needed here.
    selected_features, _ = feature_importance_analysis(X_train_transformed, y_train, 0.60)
    X_train_pca, X_test_pca, _ = apply_pca(X_train_transformed, X_test_transformed, selected_features)

    # Load models and evaluate. Separate names keep the {label: filename}
    # mapping distinct from whatever ``load_models`` returns.
    model_files = {'Logistic Regression': 'lr.pkl'}
    models = load_models(model_files)
    result_df, _ = evaluate_models(models, X_train_pca, y_train, X_test_pca, y_test)

    # Attach symbols to the results. ``DataFrame.join`` aligns on the index.
    # NOTE(review): assumes ``result_df`` shares ``X_test``'s index - confirm
    # against ``evaluate_models``.
    result_df_with_symbols = result_df.join(X_test['symbol'])
    stock_rows = result_df_with_symbols[result_df_with_symbols['symbol'] == stock_ticker]

    if stock_rows.empty:
        print(f"Stock ticker '{stock_ticker}' is not available. Please enter a valid ticker from the S&P 500 list.")
        return

    probabilities = stock_rows['Predicted_Prob_Logistic Regression']
    # The original code always read position 1, which raised IndexError for a
    # ticker with a single test row. Keep position 1 when it exists and fall
    # back to the only row otherwise.
    # NOTE(review): confirm which row (first / second / latest) is intended.
    row_position = min(1, len(probabilities) - 1)
    logistic_regression_prediction = probabilities.iloc[row_position]
    print(f"The Logistic Regression predicted probability for '{stock_ticker}' to decrease dividend is: {logistic_regression_prediction:.4f}")


In [11]:
# Load Data
# The two returned objects are kept as module-level names because
# `main_pipeline` reads `num_dataset` and `full_dataset` as globals.
num_dataset, full_dataset = load_and_analyze_data('zero_div_recent_years.csv')
if __name__ == "__main__":
    # Both values are loop-invariant, so compute them once before prompting.
    split_year = 2023
    valid_tickers = set(full_dataset['symbol'].unique())

    # Keep asking until the user supplies a symbol present in the dataset.
    while True:
        # Strip stray whitespace so input such as " aapl " is still accepted.
        stock_ticker = input("Enter the stock ticker: ").strip().upper()
        if stock_ticker in valid_tickers:
            main_pipeline(stock_ticker, split_year)
            break
        print("Invalid ticker. Please enter a ticker from the S&P 500 list.")


Enter the stock ticker: AAPL
The Logistic Regression predicted probability for 'AAPL' to decrease dividend is: 0.0104
