## IF Approach

In [None]:
import findspark
import pandas as pd
findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkConf

# Connect to the shared Hive metastore (shared across all users).
# The SparkConf is applied in its own .config() call: when `conf` is supplied,
# builder.config() copies the conf entries and ignores the key/value pair passed
# in the same call, so the metastore URI has to be set in a separate call.
spark = (
    SparkSession.builder
    .appName("Seq")
    .config(conf=SparkConf())
    .config("hive.metastore.uris", "thrift://amok:9083")
    .getOrCreate()
)

# For a local metastore (your private, individual database), extra config would
# have to be added to the builder above; none is set in this notebook.
# Select the database that the rest of the notebook queries.
spark.sql("USE 2023_11_02")

In [None]:
#import pyspark.pandas as ps
from pyspark.sql.functions import lit,col
import pandas as pd
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
#from boruta import BorutaPy
#from fredapi import Fred
from sklearn.linear_model import Lasso
from sklearn.model_selection import TimeSeriesSplit
import csv
from pyspark.sql import functions as F
from functools import reduce
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from CreateDataset import get_full_seqs, get_seq_means

## Aggregating Time Series

In [None]:
# Read the imploded-stock price file, parse both implosion date columns as
# datetimes, and hand the frame to get_seq_means. The result replaces `df`
# (the cells below call .show()/.filter on it, so it is presumably a Spark
# DataFrame -- confirm against CreateDataset).
df = get_seq_means(
    pd.read_csv('imploded_stocks_price.csv', index_col=False).assign(
        Implosion_Start_Date=lambda frame: pd.to_datetime(frame['Implosion_Start_Date']),
        Implosion_End_Date=lambda frame: pd.to_datetime(frame['Implosion_End_Date']),
    )
)

In [None]:
# Preview the first rows of the aggregated dataset.
df.show()

In [None]:
# Total number of rows, followed by the number of rows whose label equals 1.
print(df.count())
print(df.where(F.col('label') == 1).count())

In [None]:
# Count nulls per column in a single Spark pass: each column is turned into a
# 0/1 "is null" flag and summed. The one-row result is pulled to pandas and
# transposed so that every column becomes a row of the printed table.
null_counts = df.select([
    F.sum(F.col(name).isNull().cast("int")).alias(name + "_null_count")
    for name in df.columns
])
result_df = null_counts.toPandas().T
result_df.columns = ['Null Count']
print(result_df)

In [None]:
# Remove a fixed set of feature columns from the dataset before the null-row
# filtering below (presumably the columns with the most nulls in the table
# printed above -- confirm).
df = df.drop(
    'ff_zscore',
    'ff_mkt_val_gr',
    'ff_mkt_val_public',
    'ff_emp_gr',
    'ff_sales_fix_assets',
    'ff_cf_ps_gr',
    'ff_shs_float',
    'ff_sga_oth',
    'ff_fcf_yld',
    'ff_net_inc_per_emp',
)

In [None]:
# Show how many rows (and how many label-1 rows) would survive dropping every
# row that contains a null, without modifying `df` yet.
print(df.count())
print(df.na.drop().count())
print(df.na.drop().where(F.col('label') == 1).count())

In [None]:
# Keep only rows without nulls, then report the remaining total and the
# remaining number of label-1 rows.
df = df.na.drop()
print(df.count())
print(df.where(F.col('label') == 1).count())

### Helper Functions

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

def t_t_split(df, weights=(0.8, 0.2), seed=42):
    """Randomly split a DataFrame into a train and a test partition.

    Args:
        df: Object exposing ``randomSplit(weights, seed=...)`` (a Spark
            DataFrame in this notebook).
        weights: Relative sizes of the (train, test) partitions. Defaults to
            the 80/20 split this helper has always used.
        seed: Seed forwarded to ``randomSplit`` so that re-running the
            notebook reproduces the same partitions. Pass ``None`` for an
            unseeded split.

    Returns:
        A ``(train, test)`` tuple.
    """
    train, test = df.randomSplit(list(weights), seed=seed)
    return train, test


def confusion_matrix_pandas(df):
    """Draw a confusion-matrix heatmap for a frame of labels and predictions.

    Args:
        df: Object exposing ``toPandas()`` whose result has ``'label'`` and
            ``'prediction'`` columns.

    The tick labels are hard-coded for two classes (0 and 1).
    """
    collected = df.toPandas()
    matrix = confusion_matrix(collected['label'], collected['prediction'])

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=[f'Predicted {cls}' for cls in (0, 1)],
        yticklabels=[f'Actual {cls}' for cls in (0, 1)],
    )
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    

In [None]:
from hyperopt import fmin, tpe, hp
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
import csv
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from itertools import chain
    
def model_training(df, classifier, max_evals=3):
    """Tune, train and evaluate a Spark ML classifier on ``df``.

    The first column of ``df`` is treated as an identifier and the last one as
    the label; everything in between is used as a feature (``df.columns[1:-1]``).

    Args:
        df: Spark DataFrame with a ``label`` column.
        classifier: One of ``'LogisticRegression'``, ``'RandomForest'`` or
            ``'GBT'``.
        max_evals: Number of hyperopt trials to run (default 3).

    Returns:
        ``(best_model, train_df, test_df)`` where ``train_df`` carries the
        added ``weight`` and ``features_vector`` columns and ``test_df`` the
        added ``features_vector`` column.

    Raises:
        ValueError: If ``classifier`` is not one of the supported names.
    """
    print("Number of records: ", df.count())

    features = df.columns[1:-1]

    train_df, test_df = t_t_split(df)

    def compute_weights(train_df):
        """Add a 'weight' column equal to total / (n_classes * class_count)."""
        class_counts = {
            row["label"]: row["count"]
            for row in train_df.select("label").groupBy("label").count().collect()
        }
        total = sum(class_counts.values())
        n_classes = len(class_counts)

        # Plain Python floats keep the Spark literals independent of numpy scalar types.
        class_weights_spark = {
            label: float(total / (n_classes * count))
            for label, count in class_counts.items()
        }
        print(class_weights_spark)
        # create_map takes alternating key, value arguments: label, weight, label, weight, ...
        mapping_expr = F.create_map([F.lit(x) for x in chain(*class_weights_spark.items())])
        return train_df.withColumn("weight", mapping_expr.getItem(F.col("label")))

    train_df = compute_weights(train_df)

    vector_assembler = VectorAssembler(inputCols=features, outputCol="features_vector")
    train_df = vector_assembler.transform(train_df)
    test_df = vector_assembler.transform(test_df)

    if classifier == 'LogisticRegression':
        param_space = {
            'regParam': hp.uniform('regParam', 0.01, 1.0),
            'elasticNetParam': hp.uniform('elasticNetParam', 0.0, 1.0)
        }
        classifier_instance = LogisticRegression(featuresCol="features_vector", labelCol="label", weightCol='weight')
    elif classifier == 'RandomForest':
        param_space = {
            'maxBins': hp.quniform('maxBins', 16, 32, 1),
            'maxDepth': hp.quniform('maxDepth', 20, 30, 1)
        }
        classifier_instance = RandomForestClassifier(featuresCol='features_vector', labelCol='label', weightCol='weight')
    elif classifier == 'GBT':
        param_space = {
            'maxDepth' : hp.quniform("maxDepth", 3, 18, 1),
            'maxBins': hp.quniform('maxBins', 16, 32, 1)
        }
        # NOTE(review): no weightCol is passed here, so the class weights computed
        # above are not used by the GBT model -- confirm whether that is intended.
        classifier_instance = GBTClassifier(featuresCol='features_vector', labelCol='label')
    else:
        raise ValueError("Unsupported classifier")

    evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
    # One shared hold-out split, so every hyperparameter trial is scored on the
    # same validation rows and the trial metrics are directly comparable.
    fit_df, val_df = train_df.randomSplit([0.9, 0.1])

    def cross_val_train(params):
        """Fit the classifier with ``params`` and score it on the hold-out split."""
        classifier_instance.setParams(**params)
        curr_model = classifier_instance.fit(fit_df)
        val_metric = evaluator.evaluate(curr_model.transform(val_df))
        return curr_model, val_metric

    def objective(params):
        # hyperopt minimises, so negate the area under the PR curve.
        _, metric = cross_val_train(params)
        return -metric

    # Find the best hyperparameters
    best_params = fmin(fn=objective, space=param_space, algo=tpe.suggest, max_evals=max_evals)
    print("Best hyperparameters: ", best_params)

    # Train the model with the best hyperparameters
    best_model, final_metric = cross_val_train(best_params)

    # Collect labels and predictions in a single job so the two columns are
    # guaranteed to be row-aligned and the test set is only scored once.
    predictions = best_model.transform(test_df)
    results = predictions.select('label', 'prediction').toPandas()
    true = results['label']
    preds = results['prediction']
    print(classification_report(true, preds))

    cm = confusion_matrix(true, preds)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", linewidths=.5)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()

    return best_model, train_df, test_df
    
# Train and evaluate the 'GBT' classifier on the aggregated dataset; the returned
# model and splits are passed to shapley() in a later cell.
model, train_df, test_df = model_training(df, 'GBT')

In [None]:
import shap

def feat_analysis(model, feature_names=None):
    """Plot the model's feature importances as a bar chart, largest first.

    Args:
        model: Fitted model exposing ``featureImportances`` with a
            ``toArray()`` method.
        feature_names: Names matching the importances by position. When
            omitted, falls back to the notebook-level ``df.columns[1:-1]``;
            pass the names explicitly if ``df`` has been reassigned since the
            model was trained.
    """
    if feature_names is None:
        feature_names = df.columns[1:-1]
    features = list(feature_names)

    feature_importances = model.featureImportances.toArray()
    # Indices of the importances in descending order.
    sorted_idx = np.argsort(feature_importances)[::-1]
    sorted_features = [features[i] for i in sorted_idx]

    positions = range(len(feature_importances))
    plt.figure(figsize=(10, 6))
    plt.bar(positions, feature_importances[sorted_idx], align="center")
    plt.xticks(positions, sorted_features, rotation=45, ha="right")
    plt.xlabel("Feature")
    plt.ylabel("Feature Importance")
    plt.title("Feature Importances")
    plt.show()
    
    
def shapley(model, train, test):
    """Compute SHAP values for ``model`` on the test split and draw a summary plot.

    Args:
        model: Fitted model passed to ``shap.Explainer``.
        train: Training split. Kept for interface compatibility; it is not
            used, so it is no longer collected to the driver.
        test: Test split exposing ``toPandas()``. The ``fsym_id``, ``label``
            and ``features_vector`` columns are dropped; every remaining
            column is treated as a feature.
    """
    exclude_columns = ['fsym_id',  'label', 'features_vector']
    X_test = test.toPandas().drop(exclude_columns, axis=1)

    explainer = shap.Explainer(model)
    shap_values = explainer.shap_values(X_test)
    shap.initjs()
    shap.summary_plot(shap_values, X_test)
    
    
# feat_analysis(model)  # feature-importance bar chart, currently disabled
# Explain the trained model on the test split with SHAP.
shapley(model, train_df, test_df)

In [None]:
from sklearn.ensemble import IsolationForest

def anomaly_det(df):
    """Fit an IsolationForest on a train split and report test-set performance.

    The columns ``df.columns[1:-1]`` are used as features. The forest's
    contamination is set to the share of label-1 rows in the training split.
    Rows the forest predicts as ``1`` are mapped to class 0, all other rows to
    class 1. Prints the annotated training frame and a classification report,
    and shows a confusion-matrix heatmap for the test split.

    Args:
        df: Spark DataFrame with a ``label`` column.
    """
    train_split, test_split = t_t_split(df)
    features = df.columns[1:-1]
    train_df = train_split.toPandas()
    test_df = test_split.toPandas()
    print("Converted to Pandas")

    num_pos = int((train_df['label'] == 1).sum())
    isol_for = IsolationForest(contamination=num_pos / len(train_df), random_state=41)
    isol_for.fit(train_df[features])

    # Annotate the training frame first, then the test frame, with the raw
    # score, the forest's prediction and the derived 0/1 class.
    for frame in (train_df, test_df):
        frame['anomaly_scores'] = isol_for.decision_function(frame[features])
        frame['anomaly'] = isol_for.predict(frame[features])
        frame['preds'] = np.where(frame['anomaly'] == 1, 0, 1)

    print(train_df)
    print("Classification Report: ")
    print(classification_report(test_df['label'], test_df['preds']))

    matrix = confusion_matrix(test_df['label'], test_df['preds'])
    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues", linewidths=.5)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()
    
    
    
# Run the IsolationForest baseline on the aggregated dataset.
anomaly_det(df)

In [None]:
# Re-read the imploded-stock price file, parse both implosion date columns as
# datetimes, and hand the frame to get_full_seqs. This REPLACES the aggregated
# `df` used by the cells above with the result of get_full_seqs.
df = get_full_seqs(
    pd.read_csv('imploded_stocks_price.csv', index_col=False).assign(
        Implosion_Start_Date=lambda frame: pd.to_datetime(frame['Implosion_Start_Date']),
        Implosion_End_Date=lambda frame: pd.to_datetime(frame['Implosion_End_Date']),
    )
)

In [None]:
# Preview the first rows of the full-sequence dataset.
df.show()

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report


def _frame_to_seqs(pandas_df, features):
    """Build one ``(sequence_array, label)`` pair per ``fsym_id`` group.

    Only the first row of each group is read for the features. Each feature
    cell is expected to hold a sequence; the sequences are stacked and
    transposed so the array has one row per time step and one column per
    feature. ``label`` is the group's ``'label'`` column, returned as a pandas
    Series (not a scalar), which is why callers flatten it.
    """
    seqs = []
    for _, group in pandas_df.groupby('fsym_id'):
        seq_feats = group[features]
        label = group['label']
        feat_length = len(seq_feats.iloc[0])
        date_length = len(seq_feats.iloc[0, 0])
        seq_array = np.zeros((feat_length, date_length))
        for i, feat_name in enumerate(seq_feats.columns):
            seq_array[i, :] = np.array(seq_feats[feat_name].iloc[0])
        seqs.append((seq_array.T, label))
    return seqs


def nn_prepare_seqs(df, seq_len=22):
    """Turn the full-sequence Spark DataFrame into train/test sequence lists.

    Rows are kept only if every feature column (``df.columns[1:-1]``) holds an
    array of exactly ``seq_len`` elements. The remaining rows are split with
    ``t_t_split``, collected to pandas and converted group by group.

    Args:
        df: Spark DataFrame with an ``fsym_id`` column, array-valued feature
            columns and a ``label`` column.
        seq_len: Required length of every feature array (default 22).

    Returns:
        ``(train_seqs, test_seqs)``, each a list of ``(array, label)`` pairs
        where ``array`` has shape ``(sequence length, number of features)``.
    """
    print("Number of records: ", df.count())
    features = df.columns[1:-1]

    # AND together one "array has seq_len elements" condition per feature column.
    has_full_length = reduce(
        lambda acc, column: acc & (F.size(col(column)) == seq_len),
        features,
        F.lit(True),
    )
    df = df.filter(has_full_length)
    print("Number of records: ", df.count())
    print(features)

    train_df, test_df = t_t_split(df)
    train_df = train_df.toPandas()
    test_df = test_df.toPandas()

    train_seqs = _frame_to_seqs(train_df, features)
    test_seqs = _frame_to_seqs(test_df, features)

    return train_seqs, test_seqs

def plot_model_performance(mdl, loss, metric):
    """Plot training and validation curves from a fit-history object.

    Args:
        mdl: Object whose ``history`` attribute can be turned into a
            DataFrame with one column per tracked quantity and one row per
            epoch.
        loss: Panel title used for columns whose name ends in ``'loss'``.
        metric: Panel title used for every other column.

    Returns:
        The seaborn ``FacetGrid``: one panel per ``loss``/``metric`` title,
        with separate lines for training and validation (columns whose name
        starts with ``'val_'``).
    """
    history_long = (
        pd.DataFrame(mdl.history)
        .reset_index()
        .melt(id_vars='index')
    )

    is_validation = history_long['variable'].str[:4] == 'val_'
    is_loss = history_long['variable'].str[-4:] == 'loss'
    history_long['validation'] = is_validation.map({True: 'validation', False: 'training'})
    history_long['loss'] = is_loss.map({True: loss, False: metric})

    grid = sns.FacetGrid(history_long, col='loss', hue='validation', sharey=False)
    grid.map(sns.lineplot, 'index', 'value')
    grid.add_legend()
    return grid

def nn_training(train_seqs, test_seqs):
    """Train a dense binary classifier on the sequences and report test results.

    Args:
        train_seqs: List of ``(array, label)`` pairs used for fitting; 10% is
            held out by Keras as validation data.
        test_seqs: List of ``(array, label)`` pairs used for the final
            evaluation.

    Prints the number of positive test labels, the class weights, the test
    accuracy and a classification report, and shows the training curves and a
    confusion-matrix heatmap. Returns nothing.
    """
    train_X, train_y = zip(*train_seqs)
    test_X, test_y = zip(*test_seqs)

    # Convert lists to numpy arrays
    train_X = np.array(train_X)
    train_y = np.array(train_y)
    test_X = np.array(test_X)
    test_y = np.array(test_y)
    print(np.sum(test_y==1))

    # 'balanced' weights counteract the class imbalance during fitting.
    class_labels = np.unique(train_y)
    class_weights = compute_class_weight('balanced', classes=class_labels, y=train_y.flatten())
    class_weight_dict = dict(zip(class_labels, class_weights))
    print(class_weight_dict)

    # Define the neural network model
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(train_X.shape[1], train_X.shape[2])),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),  # Additional Dense layer
        tf.keras.layers.Dropout(0.5),  # Dropout layer for regularization
        tf.keras.layers.Dense(16, activation='relu'),  # Another Dense layer
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # Compile the model
    loss_fn = keras.losses.BinaryCrossentropy()
    optimizer = keras.optimizers.Adam(
        learning_rate=0.01
    )

    model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

    # Train the model
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    fit_model = model.fit(train_X, train_y, epochs=50, batch_size=32, validation_split=0.1, class_weight = class_weight_dict, callbacks=[early_stopping])
    plot_model_performance(fit_model, 'bin_cross_entropy','accuracy')

    # Evaluate the model on the test set
    test_loss, test_acc = model.evaluate(test_X, test_y)
    print(f'Test accuracy: {test_acc}')

    # Threshold the sigmoid outputs at 0.5 and flatten both arrays to 1-D.
    probabilities = model.predict(test_X)
    predictions = (probabilities >= 0.5).astype(int).flatten()
    y_true = test_y.flatten()

    # classification_report expects (y_true, y_pred); passing them the other way
    # round swaps precision and recall in the printed table.
    print(classification_report(y_true, predictions))

    cm = confusion_matrix(y_true, predictions)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['Predicted 0', 'Predicted 1'], yticklabels=['Actual 0', 'Actual 1'])
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    

# Build the train/test sequence lists from the full-sequence dataset, then
# train and evaluate the neural network on them.
train_seqs, test_seqs = nn_prepare_seqs(df)
nn_training(train_seqs, test_seqs)

In [None]:
# spark.stop()