## SVM, Linear Regression, Random Forest, Gradient Boosting,

In [98]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import *
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator


In [3]:
spark = SparkSession.builder.appName("training").getOrCreate()

24/05/14 18:00:38 WARN Utils: Your hostname, alber-victus resolves to a loopback address: 127.0.1.1; using 192.168.1.25 instead (on interface wlp4s0)
24/05/14 18:00:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/14 18:00:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/14 18:00:39 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [122]:
def split_data_for_ml(stock,train_size=0.8,spark=None,emotion=False,classification=False):



    schema = StructType([

    StructField('date',StringType(),True),
    StructField('afinn_sentiment',FloatType(),True),
    StructField('pnn_sentiment',FloatType(),True),
    StructField('price_percent_change',FloatType(),True),
    StructField('volume_percent_change',FloatType(),True),
    StructField('next_day_price_percent_change_shifted',FloatType(),True)
    
    ])
    
    # 'date', 'afinn_sentiment', 'pnn_sentiment', 'price_percent_change', 'volume_percent_change', 'next_day_price_percent_change_shifted'
    df = spark.read.schema(schema).csv("../data/csv/"+stock+"/")

    # scale volume

        # scale volumne 
    assembler = VectorAssembler(inputCols=['volume_percent_change'], outputCol="features")

    # Transform the data
    data = assembler.transform(df)
    
    # Initialize the StandardScaler
    scaler = StandardScaler(inputCol="features", outputCol="scaled_volume_percent_change", withMean=True, withStd=True)
    
    # Compute summary statistics by fitting the StandardScaler
    scaler_model = scaler.fit(data)
    
    # Scale features
    scaled_data = scaler_model.transform(data)
    
    firstelement = F.udf(lambda v:float(v[0]),FloatType())
    df = scaled_data.withColumn("volume_percent_change", firstelement("scaled_volume_percent_change"))
    
    df = df.select('date',
     'afinn_sentiment',
     'pnn_sentiment',
     'price_percent_change',
     'volume_percent_change',
     'next_day_price_percent_change_shifted').withColumnRenamed("next_day_price_percent_change_shifted","label")

    
    
    n = df.count()
    train_size = int(n*train_size)

    train_data = df.limit(train_size)
    test_data = df.subtract(train_data)

    if emotion:

        
        assembler = VectorAssembler(inputCols=['afinn_sentiment', 'pnn_sentiment', 'price_percent_change', 'volume_percent_change'], \
                                    outputCol="features")

    else:
      
        assembler = VectorAssembler(inputCols=['price_percent_change', 'volume_percent_change'], \
                                    outputCol="features")


    train_data = Pipeline(stages=[assembler]).fit(train_data).transform(train_data)
    test_data = Pipeline(stages=[assembler]).fit(test_data).transform(test_data)

    
    # X_train = train_data.select("features")
    # y_train = train_data.select("label")

    # y_train_cat = y_train.withColumn("label", \
    #                                  F.when(F.col("label") >0, 1).otherwise(0))
    # X_test = test_data.select("features")
    # y_test = test_data.select("label")

    # y_test_cat = y_test.withColumn("label", \
    #                                  F.when(F.col("label") >0, 1).otherwise(0))

    train_data = train_data.select("features","label")
    test_data = test_data.select("features","label")

    if classification:
        train_data = train_data.withColumn("label", \
                                     F.when(F.col("label") >0, 1).otherwise(0))
        
        test_data = test_data.withColumn("label", \
                                     F.when(F.col("label") >0, 1).otherwise(0))
        
    return train_data,test_data
    

In [136]:
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.feature import StringIndexer
from pyspark.mllib.tree import RandomForest, LabeledPoint


In [137]:
train_rdd = train_data.rdd.map(lambda x: LabeledPoint(x[-1],x[:-1]))


## Linear regression

In [138]:
def fit_lr(stock,emotion,spark):

    train_data,test_data = split_data_for_ml(stock,spark=spark,emotion=emotion,classification=False)
    
    ln_model = LinearRegression(maxIter=1000)
    lr_model = ln_model.fit(train_data)
    
    # Step 6: Evaluate Model
    predictions = lr_model.transform(test_data)
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
    r2 = evaluator.evaluate(predictions)

    print(stock,str(emotion),rmse,r2)


In [139]:
for stock in ['Apple','NVIDIA']:


    for emotion in [False,True]:

        # model_path = "../models/" + "_binary_"+ stock + "_emotion_" + str(emotion) + ".h5"
        # results_path = "../results/" +"_binary_"+ stock + "_emotion_" + str(emotion) + ".json"

        # key = stock + "_emotion_" + str(emotion)

        fit_lr(stock,emotion,spark)
        
        




24/05/14 21:05:16 WARN Instrumentation: [bc706089] regParam is zero, which might cause numerical instability and overfitting.


Apple False 1.5498124398532114 -0.020396215477227697


24/05/14 21:05:18 WARN Instrumentation: [e06f8c30] regParam is zero, which might cause numerical instability and overfitting.


Apple True 1.5495442885322075 -0.020043144466917573


24/05/14 21:05:20 WARN Instrumentation: [ee6a1543] regParam is zero, which might cause numerical instability and overfitting.


NVIDIA False 3.254319067465494 -0.03674772912942226


24/05/14 21:05:23 WARN Instrumentation: [12ffd5bf] regParam is zero, which might cause numerical instability and overfitting.


NVIDIA True 3.254015843086086 -0.0365545381518233


## SVM

In [140]:
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [141]:
train_data,test_data = split_data_for_ml("Apple",spark=spark,emotion=True,classification=True)

In [142]:
def fit_svm(stock,emotion,spark):

    train_data,test_data = split_data_for_ml(stock,spark=spark,emotion=emotion,classification=True)
    

    svm = LinearSVC(featuresCol="features", labelCol="label",maxIter=200)
    svm_model = svm.fit(train_data)
    
    # Step 6: Evaluate Model
    predictions = svm_model.transform(test_data)
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    
    print(f"{stock},{str(emotion)}, Accuracy on test data:, {accuracy}")

In [143]:
for stock in ['Apple','NVIDIA']:


    for emotion in [False,True]:

        # model_path = "../models/" + "_binary_"+ stock + "_emotion_" + str(emotion) + ".h5"
        # results_path = "../results/" +"_binary_"+ stock + "_emotion_" + str(emotion) + ".json"

        # key = stock + "_emotion_" + str(emotion)

        fit_svm(stock,emotion,spark)
        
        




24/05/14 21:05:28 ERROR OWLQN: Failure! Resetting history: breeze.optimize.NaNHistory: 
24/05/14 21:05:29 ERROR OWLQN: Failure! Resetting history: breeze.optimize.NaNHistory: 


Apple,False, Accuracy on test data:, 0.5109289617486339
Apple,True, Accuracy on test data:, 0.5109289617486339
NVIDIA,False, Accuracy on test data:, 0.574585635359116
NVIDIA,True, Accuracy on test data:, 0.574585635359116


In [None]:
0,1

In [144]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

def fit_forest(stock,emotion,spark):

    train_data,test_data = split_data_for_ml(stock,spark=spark,emotion=emotion,classification=True)
    
    # Step 5: Train Random Forest Model
    rf = RandomForestClassifier(featuresCol="features", labelCol="label")
    rf_model = rf.fit(train_data)
    
    # Step 6: Evaluate Model
    predictions = rf_model.transform(test_data)
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    
    print(f"{stock},{str(emotion)}, Accuracy on test data:, {accuracy}")




In [145]:
for stock in ['Apple','NVIDIA']:


    for emotion in [False,True]:

        # model_path = "../models/" + "_binary_"+ stock + "_emotion_" + str(emotion) + ".h5"
        # results_path = "../results/" +"_binary_"+ stock + "_emotion_" + str(emotion) + ".json"

        # key = stock + "_emotion_" + str(emotion)

        fit_forest(stock,emotion,spark)
        
        




Apple,False, Accuracy on test data:, 0.5136612021857924
Apple,True, Accuracy on test data:, 0.5218579234972678
NVIDIA,False, Accuracy on test data:, 0.5386740331491713
NVIDIA,True, Accuracy on test data:, 0.569060773480663


In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import GBTClassifier

# Prepare training data
data = [(0.0, 0.0, 0.0), (1.0, 1.0, 1.0), (1.0, 2.0, 0.5), (0.0, 0.5, 2.0)]
df = spark.createDataFrame(data, ["label", "feature1", "feature2"])

# Assemble features into a vector
assembler = VectorAssembler(inputCols=["feature1", "feature2"], outputCol="features")
data = assembler.transform(df)

# Split data into training and test sets
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

# Train the Gradient Boosted Trees model
gbt = GBTClassifier(maxDepth=3, maxBins=32)
model = gbt.fit(train_data)

# Make predictions and evaluate the model
predictions = model.transform(test_data)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(predictions.count())
print("Accuracy = " + str(accuracy))