## Building the dataset that will be input into the model

In [1]:
import findspark
import pandas as pd
findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkConf

# Connect to the shared Hive metastore (shared across all users).
# NOTE: SparkSession.Builder.config() ignores `key`/`value` whenever `conf=` is
# passed in the same call, so the SparkConf defaults and the metastore URI are
# applied in two separate config() calls; otherwise the URI is silently dropped.
spark = (
    SparkSession.builder
    .appName("Building dataset")
    .config(conf=SparkConf())
    .config("hive.metastore.uris", "thrift://bialobog:9083")
    .enableHiveSupport()  # make the Hive catalog explicit rather than relying on cluster defaults
    .getOrCreate()
)

# For a local metastore (your private, individual database) extra session
# config is needed; that config is not included in this notebook.
spark.sql("USE 2023_04_01")

AnalysisException: org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient

In [None]:
import pyspark.pandas as ps
from pyspark.sql.functions import lit,col
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
#from boruta import BorutaPy
#from fredapi import Fred
from sklearn.linear_model import Lasso
from sklearn.model_selection import TimeSeriesSplit
import csv
from pyspark.sql import functions as F
from functools import reduce
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
from CreateDataset import get_features_all_stocks_seq, get_full_series_stocks

In [None]:
# Read the raw price table with pandas; it is kept under its own name so the
# final `df` below only ever refers to the feature dataset.
raw_prices = pd.read_csv('imploded_stocks_price.csv', index_col=False)

# Disabled alternative: restrict to stocks that have data since 2001.
#full_series_stocks = get_full_series_stocks(raw_prices)
#filtered_df = raw_prices[raw_prices['fsym_id'].isin(full_series_stocks)]

# Author's note: get stocks that have data in ff_advanced_der_af for all years,
# not just prices (or not even).
# Open question for Boruta: maybe treat each column separately (22*10 features)?
df = get_features_all_stocks_seq(raw_prices)
#df.show()
print(df.count())

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

def train_test_split(df, weights=(0.7, 0.3), seed=22):
    """Randomly split ``df`` into a train part and a test part.

    NOTE: this definition shadows ``sklearn.model_selection.train_test_split``,
    which this notebook also imports; after this cell runs, the bare name
    refers to this helper.

    Parameters
    ----------
    df : object exposing ``randomSplit(weights, seed)``
        In this notebook, a Spark DataFrame.
    weights : sequence of two numbers, default ``(0.7, 0.3)``
        Relative sizes of the train and test parts, passed to ``randomSplit``.
    seed : int, default ``22``
        Seed passed to ``randomSplit``.

    Returns
    -------
    tuple
        ``(train, test)``.

    Raises
    ------
    ValueError
        If ``weights`` does not contain exactly two entries.
    """
    weights = list(weights)
    if len(weights) != 2:
        raise ValueError(f"weights must have exactly two entries, got {len(weights)}")
    train, test = df.randomSplit(weights, seed)
    return train, test


def confusion_matrix_pandas(df):
    """Plot a 2x2 confusion-matrix heatmap for binary (0/1) predictions.

    Parameters
    ----------
    df : object exposing ``toPandas()``
        Must convert to a pandas frame with ``label`` and ``prediction``
        columns holding the classes 0 and 1 (callers in this notebook pass a
        Spark DataFrame).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    pdf = df.toPandas()

    # Pin the class order to [0, 1]: without `labels`, the matrix only contains
    # the classes that actually occur, so a test set (or a model) with a single
    # class would yield a 1x1 matrix that no longer matches the fixed tick labels.
    y_true = pdf['label'].astype(int)
    y_pred = pdf['prediction'].astype(int)
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=['Predicted 0', 'Predicted 1'],
        yticklabels=['Actual 0', 'Actual 1'],
        ax=ax,
    )
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()


def model_testing(df, model_names=('random forest',), parallelism=1):
    """Grid-search Spark ML classifiers with 5-fold CV and plot test confusion matrices.

    Every column of ``df`` except the first and the last is treated as a
    feature column holding an array; rows are kept only when every feature
    array has exactly 22 elements. The estimators read the ``label`` column.
    (Presumably the first column is the stock id and the last is the label —
    TODO confirm against ``get_features_all_stocks_seq``.)

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Feature dataset as described above.
    model_names : iterable of str, default ``('random forest',)``
        Which models to tune. Allowed values: ``'logistic regression'``,
        ``'random forest'``, ``'boosted trees'``.
    parallelism : int, default ``1``
        Passed to ``CrossValidator(parallelism=...)``.

    Raises
    ------
    ValueError
        If ``model_names`` contains an unknown model name.
    """
    # Imported locally because a later cell rebinds `RandomForestClassifier`
    # to the scikit-learn class at module level.
    from pyspark.ml.classification import (
        GBTClassifier,
        LogisticRegression,
        RandomForestClassifier,
    )

    lr = LogisticRegression(featuresCol="features_vector", labelCol="label")
    rf = RandomForestClassifier(featuresCol='features_vector', labelCol='label')
    gbt = GBTClassifier(featuresCol='features_vector', labelCol='label')

    # Each name maps to its own estimator AND its own grid, so an estimator can
    # never be paired with another model's hyper-parameters.
    candidates = {
        'logistic regression': (
            lr,
            ParamGridBuilder()
            .addGrid(lr.regParam, [0.01, 0.1, 1.0])
            .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
            .addGrid(lr.maxIter, [50, 100, 150])
            .build(),
        ),
        'random forest': (
            rf,
            ParamGridBuilder()
            .addGrid(rf.numTrees, [50, 100, 150])
            .addGrid(rf.maxDepth, [5, 10, 15])
            .addGrid(rf.minInstancesPerNode, [1, 5, 10])
            .build(),
        ),
        'boosted trees': (
            gbt,
            ParamGridBuilder()
            .addGrid(gbt.maxDepth, [5, 10, 15])
            .addGrid(gbt.maxBins, [16, 32])
            .addGrid(gbt.maxIter, [10, 20])
            .build(),
        ),
    }
    model_names = list(model_names)
    unknown = [name for name in model_names if name not in candidates]
    if unknown:
        raise ValueError(f"Unknown model name(s) {unknown}; expected one of {sorted(candidates)}")

    features = df.columns[1:-1]

    # Keep only rows whose feature arrays all have exactly 22 elements.
    # TODO: decide whether to only include stocks that started from 2000, or
    #       to also include shorter histories (e.g. from 2019); alternatives to
    #       dropping rows: replace with 0, an imputer, or masking.
    full_length = reduce(
        lambda acc, column: acc & (F.size(col(column)) == 22),
        features,
        lit(True),
    )
    df = df.filter(full_length)
    print("Number of records: ", df.count())

    # VectorAssembler needs vector (not array) columns.
    list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())
    for f in features:
        df = df.withColumn(f, list_to_vector_udf(f))
    df.show(2)

    train_df, test_df = train_test_split(df)

    vector_assembler = VectorAssembler(inputCols=features, outputCol="features_vector")
    train_df = vector_assembler.transform(train_df)
    test_df = vector_assembler.transform(test_df)

    evaluator = BinaryClassificationEvaluator(
        labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC"
    )

    for model_name in model_names:
        model, paramGrid = candidates[model_name]

        crossval = CrossValidator(
            estimator=model,
            estimatorParamMaps=paramGrid,
            evaluator=evaluator,
            numFolds=5,
            parallelism=parallelism,
        )
        cvModel = crossval.fit(train_df)

        print(model_name.upper())

        # avgMetrics holds one value per parameter combination (the metric
        # averaged over the folds), not one value per fold.
        for i, auc in enumerate(cvModel.avgMetrics):
            print(f"Param set {i + 1} - mean CV areaUnderROC: {auc}")

        best_model = cvModel.bestModel
        predictions = best_model.transform(test_df)

        confusion_matrix_pandas(predictions.select('label', 'prediction'))
        
def basic_test(df, model_names=('logistic regression',)):
    """Quick single-configuration check of Spark ML classifiers on ``df``.

    Every column of ``df`` except the first and the last is treated as a
    feature column holding an array; rows are kept only when every feature
    array has exactly 22 elements. The estimators read the ``label`` column.
    (Presumably the first column is the stock id and the last is the label —
    TODO confirm against ``get_features_all_stocks_seq``.)

    For each requested model a one-point parameter grid is cross-validated
    (5 folds, areaUnderROC), the best model is applied to the test split, a
    confusion matrix is plotted and the test areaUnderROC is printed.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Feature dataset as described above.
    model_names : iterable of str, default ``('logistic regression',)``
        Which models to run. Allowed values: ``'logistic regression'``,
        ``'random forest'``, ``'boosted trees'``.

    Raises
    ------
    ValueError
        If ``model_names`` contains an unknown model name.
    """
    # Imported locally because a later cell rebinds `RandomForestClassifier`
    # to the scikit-learn class at module level.
    from pyspark.ml.classification import (
        GBTClassifier,
        LogisticRegression,
        RandomForestClassifier,
    )

    lr = LogisticRegression(featuresCol="features_vector", labelCol="label")
    rf = RandomForestClassifier(featuresCol='features_vector', labelCol='label')
    gbt = GBTClassifier(featuresCol='features_vector', labelCol='label')

    # Name -> (estimator, grid). Looking models up by name (instead of zipping
    # two lists by position) guarantees each estimator gets its own grid.
    candidates = {
        'boosted trees': (
            gbt,
            ParamGridBuilder()
            .addGrid(gbt.maxDepth, [10])
            .addGrid(gbt.maxIter, [50])
            .build(),
        ),
        'random forest': (
            rf,
            ParamGridBuilder()
            .addGrid(rf.maxDepth, [10])   # candidates: 5, 10, 15
            .addGrid(rf.numTrees, [50])   # candidates: 20, 50, 100
            .build(),
        ),
        'logistic regression': (
            lr,
            ParamGridBuilder()
            .addGrid(lr.maxIter, [10])           # number of iterations; candidates: 10, 50, 100
            .addGrid(lr.regParam, [0.01])        # regularization; candidates: 0.01, 0.1, 0.5
            .addGrid(lr.elasticNetParam, [0.0])  # 0 for L2, 1 for L1; candidates: 0.0, 0.5, 1.0
            .build(),
        ),
    }
    model_names = list(model_names)
    unknown = [name for name in model_names if name not in candidates]
    if unknown:
        raise ValueError(f"Unknown model name(s) {unknown}; expected one of {sorted(candidates)}")

    features = df.columns[1:-1]

    # Keep only rows whose feature arrays all have exactly 22 elements.
    full_length = reduce(
        lambda acc, column: acc & (F.size(col(column)) == 22),
        features,
        lit(True),
    )
    df = df.filter(full_length)
    print("Number of records: ", df.count())

    # VectorAssembler needs vector (not array) columns.
    list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())
    for f in features:
        df = df.withColumn(f, list_to_vector_udf(f))
    df.show(2)

    train_df, test_df = train_test_split(df)

    vector_assembler = VectorAssembler(inputCols=features, outputCol="features_vector")
    train_df = vector_assembler.transform(train_df)
    test_df = vector_assembler.transform(test_df)

    evaluator = BinaryClassificationEvaluator(
        labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC"
    )

    for model_name in model_names:
        estimator, paramGrid = candidates[model_name]

        crossval = CrossValidator(
            estimator=estimator,
            estimatorParamMaps=paramGrid,
            evaluator=evaluator,
            numFolds=5,
        )
        cv_model = crossval.fit(train_df)
        best_model = cv_model.bestModel

        print(model_name.upper())

        predictions = best_model.transform(test_df)
        confusion_matrix_pandas(predictions.select('label', 'prediction'))

        # The evaluator is configured with metricName="areaUnderROC", so this
        # value is the test-set AUC (it was previously printed as "Recall").
        test_auc = evaluator.evaluate(predictions)
        print(f"Test areaUnderROC: {test_auc}")
    


# Run the quick single-configuration evaluation on the feature dataset `df`
# built in the loading cell above.
basic_test(df)
# NOTE(review): no function named `test_pandas` is defined in this notebook;
# presumably `test_model_pandas` was meant — confirm before re-enabling.
#test_pandas(df)

In [None]:
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_auc_score

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

def test_model_pandas(df1, model_name):
    """Grid-search a scikit-learn classifier on ``df1`` and report its test AUC.

    Parameters
    ----------
    df1 : object exposing ``toPandas()``
        Must convert to a pandas frame containing the columns ``fsym_id`` and
        ``label``; every other column is used as a feature.
    model_name : str
        One of ``'LogisticRegression'``, ``'SVM'``, ``'RandomForest'``.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    # Local, aliased imports: at module level these names are either never
    # imported (GridSearchCV, roc_auc_score) or bound to something else
    # (`train_test_split` is redefined in this notebook as a Spark helper and
    # `LogisticRegression` is the pyspark.ml class).
    from sklearn.ensemble import RandomForestClassifier as SkRandomForestClassifier
    from sklearn.linear_model import LogisticRegression as SkLogisticRegression
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import train_test_split as sk_train_test_split
    from sklearn.svm import SVC

    if model_name == 'LogisticRegression':
        # liblinear supports both the 'l1' and 'l2' penalties in the grid;
        # the default solver rejects 'l1'.
        model = SkLogisticRegression(solver='liblinear')
        param_grid = {
            'penalty': ['l1', 'l2'],
            'C': [0.01, 0.1, 1.0, 10.0],
            'max_iter': [100, 200, 300]
        }
    elif model_name == 'SVM':
        model = SVC()
        param_grid = {
            'C': [0.1, 1.0, 10.0],
            'kernel': ['linear', 'rbf', 'poly'],
            'gamma': ['scale', 'auto']
        }
    elif model_name == 'RandomForest':
        model = SkRandomForestClassifier(random_state=42)
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 5, 10, 20]
        }
    else:
        raise ValueError("Invalid model name")

    # Work on the converted argument; the previous code read the global `df`
    # here and ignored `df1` entirely.
    pdf = df1.toPandas()
    exclude_columns = ['fsym_id', 'label']

    print("Number of records: ", len(pdf))
    # NOTE(review): if the feature columns hold per-row arrays (as the
    # size(...) == 22 filter used elsewhere in this notebook suggests), they
    # must be flattened into numeric columns before fitting — TODO confirm.
    X = pdf.drop(exclude_columns, axis=1)
    y = pdf['label']
    X_train, X_test, y_train, y_test = sk_train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    print("Training set shape: ", X_train.shape)

    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc')
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    # AUC needs continuous scores; hard 0/1 predictions collapse the ROC curve
    # to a single operating point.
    if hasattr(best_model, "predict_proba"):
        scores = best_model.predict_proba(X_test)[:, 1]
    else:
        scores = best_model.decision_function(X_test)

    auc = roc_auc_score(y_test, scores)

    print(f"Area under the ROC curve (AUC) for {model_name}: {auc}")
    print(f"Best model hyperparameters for {model_name}:")
    print(grid_search.best_params_)
    
#test_model_pandas(df, 'LogisticRegression')

In [None]:
# Count the rows of `filtered_df` whose label equals 1.
# NOTE(review): `filtered_df` is only assigned in commented-out code in the
# loading cell of this notebook — confirm it exists before running this cell.
positive_label = F.col('label') == 1
positive_rows = filtered_df.filter(positive_label)
print(positive_rows.count())

In [None]:
# Number of rows in `df` whose label equals 1 (shown as the cell's output).
is_positive = F.col('label') == 1
df.filter(is_positive).count()