## SVM, Linear Regression, Random Forest, Gradient Boosting,

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import *
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator


In [4]:
spark = SparkSession.builder.appName("training").getOrCreate()

24/05/15 15:57:00 WARN Utils: Your hostname, alber-victus resolves to a loopback address: 127.0.1.1; using 10.251.208.55 instead (on interface wlp4s0)
24/05/15 15:57:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/15 15:57:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/15 15:57:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [26]:
def split_data_for_ml(stock=None,df=None,train_size=0.8,spark=None,emotion=False,classification=False):



    if df is None:
        
        schema = StructType([
    
        StructField('date',StringType(),True),
        StructField('afinn_sentiment',FloatType(),True),
        StructField('pnn_sentiment',FloatType(),True),
        StructField('price_percent_change',FloatType(),True),
        StructField('volume_percent_change',FloatType(),True),
        StructField('next_day_price_percent_change_shifted',FloatType(),True)
        
        ])
        
        # 'date', 'afinn_sentiment', 'pnn_sentiment', 'price_percent_change', 'volume_percent_change', 'next_day_price_percent_change_shifted'
        df = spark.read.schema(schema).csv("../data/csv/"+stock+"/")
    
        # scale volume

        # scale volumne 

    df = df.sort(df["date"])
    assembler = VectorAssembler(inputCols=['volume_percent_change'], outputCol="features")

    # Transform the data
    data = assembler.transform(df)
    
    # Initialize the StandardScaler
    scaler = StandardScaler(inputCol="features", outputCol="scaled_volume_percent_change", withMean=True, withStd=True)
    
    # Compute summary statistics by fitting the StandardScaler
    scaler_model = scaler.fit(data)
    
    # Scale features
    scaled_data = scaler_model.transform(data)
    
    firstelement = F.udf(lambda v:float(v[0]),FloatType())
    df = scaled_data.withColumn("volume_percent_change", firstelement("scaled_volume_percent_change"))
    
    df = df.select('date',
     'afinn_sentiment',
     'pnn_sentiment',
     'price_percent_change',
     'volume_percent_change',
     'next_day_price_percent_change_shifted').withColumnRenamed("next_day_price_percent_change_shifted","label")


    # return df
    
    n = df.count()
    train_size = int(n*train_size)

    train_data = df.limit(train_size)
    test_data = df.subtract(train_data)

    if emotion:

        
        assembler = VectorAssembler(inputCols=['afinn_sentiment', 'pnn_sentiment', 'price_percent_change', 'volume_percent_change'], \
                                    outputCol="features")

    else:
      
        assembler = VectorAssembler(inputCols=['price_percent_change', 'volume_percent_change'], \
                                    outputCol="features")


    train_data = Pipeline(stages=[assembler]).fit(train_data).transform(train_data)
    test_data = Pipeline(stages=[assembler]).fit(test_data).transform(test_data)

    
    # X_train = train_data.select("features")
    # y_train = train_data.select("label")

    # y_train_cat = y_train.withColumn("label", \
    #                                  F.when(F.col("label") >0, 1).otherwise(0))
    # X_test = test_data.select("features")
    # y_test = test_data.select("label")

    # y_test_cat = y_test.withColumn("label", \
    #                                  F.when(F.col("label") >0, 1).otherwise(0))

    train_data = train_data.select("features","label")
    test_data = test_data.select("features","label")

    if classification:
        train_data = train_data.withColumn("label", \
                                     F.when(F.col("label") >0, 1).otherwise(0))
        
        test_data = test_data.withColumn("label", \
                                     F.when(F.col("label") >0, 1).otherwise(0))
        
    return train_data,test_data
    

In [6]:
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.feature import StringIndexer
from pyspark.mllib.tree import RandomForest, LabeledPoint


In [7]:
# train_rdd = train_data.rdd.map(lambda x: Labeled<Point(x[-1],x[:-1]))


## Linear regression

In [6]:
def fit_lr(stock,emotion,spark):

    train_data,test_data = split_data_for_ml(stock,spark=spark,emotion=emotion,classification=False)
    
    ln_model = LinearRegression(maxIter=1000)
    lr_model = ln_model.fit(train_data)
    
    # Step 6: Evaluate Model
    predictions = lr_model.transform(test_data)
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
    r2 = evaluator.evaluate(predictions)

    print(stock,str(emotion),rmse,r2)


In [7]:
for stock in ['Apple','NVIDIA']:


    for emotion in [False,True]:

        # model_path = "../models/" + "_binary_"+ stock + "_emotion_" + str(emotion) + ".h5"
        # results_path = "../results/" +"_binary_"+ stock + "_emotion_" + str(emotion) + ".json"

        # key = stock + "_emotion_" + str(emotion)

        fit_lr(stock,emotion,spark)
        
        




24/05/15 13:48:05 WARN Instrumentation: [92284651] regParam is zero, which might cause numerical instability and overfitting.
24/05/15 13:48:06 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
                                                                                

Apple False 1.5498124398532114 -0.020396215477227697


24/05/15 13:48:10 WARN Instrumentation: [02b8aaa8] regParam is zero, which might cause numerical instability and overfitting.


Apple True 1.5495442885322075 -0.020043144466917573


24/05/15 13:48:13 WARN Instrumentation: [e727771c] regParam is zero, which might cause numerical instability and overfitting.


NVIDIA False 3.254319067465494 -0.03674772912942226


24/05/15 13:48:16 WARN Instrumentation: [38e394cc] regParam is zero, which might cause numerical instability and overfitting.


NVIDIA True 3.254015843086086 -0.0365545381518233


## SVM

In [16]:
from pyspark.ml.classification import LinearSVC,LinearSVCModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [9]:
stocks = []
emotions = []
accuracies = []
models = []

In [10]:
train_data,test_data = split_data_for_ml("Apple",spark=spark,emotion=True,classification=True)

In [29]:
def fit_svm(stock,emotion,spark):

    train_data,test_data = split_data_for_ml(stock,spark=spark,emotion=emotion,classification=True)


    svm = LinearSVC(featuresCol="features", labelCol="label",maxIter=200)
    svm_model = svm.fit(train_data)
    
    # Step 6: Evaluate Model
    predictions = svm_model.transform(test_data)
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    
    print(f"{stock},{str(emotion)}, Accuracy on test data:, {accuracy}")

    return accuracy,svm_model

In [30]:
for stock in ['Apple','NVIDIA']:


    for emotion in [False,True]:

        # model_path = "../models/" + "_binary_"+ stock + "_emotion_" + str(emotion) + ".h5"
        # results_path = "../results/" +"_binary_"+ stock + "_emotion_" + str(emotion) + ".json"

        # key = stock + "_emotion_" + str(emotion)

        accuracy,model = fit_svm(stock,emotion,spark)

        # model_path = "../models/" + "_binary_smv"+stock+ "_emotion_" + str(emotion)
        # model.write().overwrite().save(model_path)

        models.append("SVM")
        stocks.append(stock)
        emotions.append(str(emotion))
        accuracies.append(accuracy)
        




24/05/15 16:45:03 ERROR OWLQN: Failure! Resetting history: breeze.optimize.NaNHistory: 
24/05/15 16:45:03 ERROR OWLQN: Failure! Resetting history: breeze.optimize.NaNHistory: 


Apple,False, Accuracy on test data:, 0.5109289617486339
Apple,True, Accuracy on test data:, 0.5109289617486339
NVIDIA,False, Accuracy on test data:, 0.574585635359116
NVIDIA,True, Accuracy on test data:, 0.574585635359116


In [62]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

def fit_forest(stock,emotion,spark):

    train_data,test_data = split_data_for_ml(stock,spark=spark,emotion=emotion,classification=True)
    
    # Step 5: Train Random Forest Model
    rf = RandomForestClassifier(featuresCol="features", labelCol="label")
    rf_model = rf.fit(train_data)
    
    # Step 6: Evaluate Model
    predictions = rf_model.transform(test_data)
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    
    print(f"{stock},{str(emotion)}, Accuracy on test data:, {accuracy}")

    return accuracy



In [63]:
for stock in ['Apple','NVIDIA']:


    for emotion in [False,True]:

        # model_path = "../models/" + "_binary_"+ stock + "_emotion_" + str(emotion) + ".h5"
        # results_path = "../results/" +"_binary_"+ stock + "_emotion_" + str(emotion) + ".json"

        # key = stock + "_emotion_" + str(emotion)

        accuracy = fit_forest(stock,emotion,spark)

        models.append("RandomForest")
        stocks.append(stock)
        emotions.append(str(emotion))
        accuracies.append(accuracy)
        




Apple,False, Accuracy on test data:, 0.5245901639344263
Apple,True, Accuracy on test data:, 0.5081967213114754
NVIDIA,False, Accuracy on test data:, 0.505524861878453
NVIDIA,True, Accuracy on test data:, 0.569060773480663


In [64]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

def fit_gb(stock,emotion,spark):

    train_data,test_data = split_data_for_ml(stock,spark=spark,emotion=emotion,classification=True)
    
    gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=100)
    
    # Train the model
    gbt_model = gbt.fit(train_data)
    
    # Make predictions on the test data
    predictions = gbt_model.transform(test_data)
    
    # Evaluate model
    evaluator = BinaryClassificationEvaluator(labelCol="label")
    accuracy = evaluator.evaluate(predictions)
    print(f"{stock},{str(emotion)}, Accuracy on test data:, {accuracy}")
    
    # Stop SparkSession
    return accuracy


In [66]:
for stock in ['Apple','NVIDIA']:


    for emotion in [False,True]:

        # model_path = "../models/" + "_binary_"+ stock + "_emotion_" + str(emotion) + ".h5"
        # results_path = "../results/" +"_binary_"+ stock + "_emotion_" + str(emotion) + ".json"

        # key = stock + "_emotion_" + str(emotion)

        accuracy = fit_gb(stock,emotion,spark)

        
        models.append("GradientBoosting")
        stocks.append(stock)
        emotions.append(str(emotion))
        accuracies.append(accuracy)



Apple,False, Accuracy on test data:, 0.5041974128401995
Apple,True, Accuracy on test data:, 0.523586173931228
NVIDIA,False, Accuracy on test data:, 0.5159996253746254
NVIDIA,True, Accuracy on test data:, 0.5127684815184815


In [67]:
accuracies

[0.5109289617486339,
 0.5109289617486339,
 0.574585635359116,
 0.574585635359116,
 0.5245901639344263,
 0.5081967213114754,
 0.505524861878453,
 0.569060773480663,
 0.5041974128401995,
 0.523586173931228,
 0.5159996253746254,
 0.5127684815184815]

In [70]:
lists = [(models[i],stocks[i],emotions[i],accuracies[i]) for i in range(len(accuracies))]

In [93]:
lists

[('SVM', 'Apple', 'False', 0.5109289617486339),
 ('SVM', 'Apple', 'True', 0.5109289617486339),
 ('SVM', 'NVIDIA', 'False', 0.574585635359116),
 ('SVM', 'NVIDIA', 'True', 0.574585635359116),
 ('RandomForest', 'Apple', 'False', 0.5245901639344263),
 ('RandomForest', 'Apple', 'True', 0.5081967213114754),
 ('RandomForest', 'NVIDIA', 'False', 0.505524861878453),
 ('RandomForest', 'NVIDIA', 'True', 0.569060773480663),
 ('GradientBoosting', 'Apple', 'False', 0.5041974128401995),
 ('GradientBoosting', 'Apple', 'True', 0.523586173931228),
 ('GradientBoosting', 'NVIDIA', 'False', 0.5159996253746254),
 ('GradientBoosting', 'NVIDIA', 'True', 0.5127684815184815)]

In [94]:
schema = StructType([

    StructField('model',StringType(),True),
    StructField('stock',StringType(),True),
    StructField('emotion',StringType(),True),
    StructField('accuracy',FloatType(),True),
    
    ])

In [95]:
df = spark.createDataFrame(lists,schema=schema)

In [96]:
df.show()

+----------------+------+-------+----------+
|           model| stock|emotion|  accuracy|
+----------------+------+-------+----------+
|             SVM| Apple|  False|  0.510929|
|             SVM| Apple|   True|  0.510929|
|             SVM|NVIDIA|  False| 0.5745856|
|             SVM|NVIDIA|   True| 0.5745856|
|    RandomForest| Apple|  False|0.52459013|
|    RandomForest| Apple|   True| 0.5081967|
|    RandomForest|NVIDIA|  False| 0.5055249|
|    RandomForest|NVIDIA|   True| 0.5690608|
|GradientBoosting| Apple|  False| 0.5041974|
|GradientBoosting| Apple|   True|0.52358615|
|GradientBoosting|NVIDIA|  False| 0.5159996|
|GradientBoosting|NVIDIA|   True| 0.5127685|
+----------------+------+-------+----------+



In [75]:
df.toPandas().to_csv("../results/ml_classificatons.csv",index=False)

In [77]:
train_data.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.86353588104248...|    1|
|[1.28221988677978...|    1|
|[0.97755247354507...|    1|
|[1.20787966251373...|    1|
|[1.33455538749694...|    1|
|[1.17594683170318...|    0|
|[0.72665840387344...|    0|
|[1.90191984176635...|    1|
|[1.43675780296325...|    0|
|[1.26209092140197...|    0|
|[1.17231535911560...|    1|
|[1.74015557765960...|    1|
|[0.88406747579574...|    0|
|[0.68886607885360...|    1|
|[1.45449090003967...|    1|
|[0.55309116840362...|    1|
|[1.34638464450836...|    0|
|[0.48672020435333...|    0|
|[1.31998777389526...|    1|
|[1.43784916400909...|    0|
+--------------------+-----+
only showing top 20 rows



In [78]:
    schema = StructType([

    StructField('date',StringType(),True),
    StructField('afinn_sentiment',FloatType(),True),
    StructField('pnn_sentiment',FloatType(),True),
    StructField('price_percent_change',FloatType(),True),
    StructField('volume_percent_change',FloatType(),True),
    StructField('next_day_price_percent_change_shifted',FloatType(),True)
    
    ])
    
    # 'date', 'afinn_sentiment', 'pnn_sentiment', 'price_percent_change', 'volume_percent_change', 'next_day_price_percent_change_shifted'
    df = spark.read.schema(schema).csv("../data/csv/"+stock+"/")

In [82]:
df = df.withColumnRenamed("next_day_price_percent_change_shifted","next_day_price_percent_change")
df = df.withColumn("label",F.col)

+----------+---------------+-------------+--------------------+---------------------+-----------------------------+
|      date|afinn_sentiment|pnn_sentiment|price_percent_change|volume_percent_change|next_day_price_percent_change|
+----------+---------------+-------------+--------------------+---------------------+-----------------------------+
|2017-01-04|      0.9344968|          1.0|           2.3331146|           -20.158243|                   -2.5385644|
|2017-01-05|      1.1022971|    0.9130435|          -2.5385644|           -17.921982|                    1.3367225|
|2017-01-06|      0.8313522|          1.0|           1.3367225|            -16.40157|                     4.054336|
|2017-01-09|      1.1039267|          1.0|            4.054336|            11.349738|                  -0.75503397|
|2017-01-10|     0.95401067|          0.6|         -0.75503397|           -3.8557246|                   -1.2303843|
|2017-01-11|      1.0971123|          1.0|          -1.2303843|         