In [12]:
import pyspark
import librosa
import math
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

In [13]:
spark = SparkSession.builder \
    .appName("LogisticRegression") \
    .getOrCreate()
spark

In [14]:
data = spark.read.csv("wavData.csv", header=True, inferSchema=True)
data.show()

+-------+---------------+------------+-----------------+-----------+-------------------+-----------+-----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------+----------+----------+----------+----------+--------------------+-------------------+-------------------+
|frameID|       filename|intelligence|amplitudeEnvelope|       RMSE|                ZCR|      MFCC1|      MFCC2|     MFCC3|     MFCC4|     MFCC5|     MFCC6|     MFCC7|     MFCC8|     MFCC9|    MFCC10|    MFCC11|    MFCC12|    MFCC13|    MFCC14|    MFCC15|     MFCC16|    MFCC17|    MFCC18|    MFCC19|    MFCC20|    spectralCentroid|  spectralBandwidth|    spectralRolloff|
+-------+---------------+------------+-----------------+-----------+-------------------+-----------+-----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------

In [15]:
#grab and process categorical columns 
categorical_column = data.columns[2] #intelligence column 

feature_columns = [
    'amplitudeEnvelope', 'RMSE', 'ZCR',
    'MFCC1', 'MFCC2', 'MFCC3', 'MFCC4', 'MFCC5', 'MFCC6',
    'MFCC7', 'MFCC8', 'MFCC9', 'MFCC10', 'MFCC11', 'MFCC12',
    'MFCC13', 'MFCC14', 'MFCC15', 'MFCC16', 'MFCC17', 'MFCC18',
    'MFCC19', 'MFCC20', 'spectralCentroid', 'spectralBandwidth', 'spectralRolloff'
]

indexer = StringIndexer(inputCol='intelligence', outputCol='label')

In [16]:
vectorassem_stage = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [17]:
#now build pipeline since stages are built
pipeline = Pipeline(stages=[indexer, vectorassem_stage])

In [18]:
#fit pipeline 
pipeline_model = pipeline.fit(data)

In [19]:
f_columns = feature_columns + ['features', 'label']
data_df = pipeline_model.transform(data).select(f_columns)
data_df.show()

+-----------------+-----------+-------------------+-----------+-----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------+----------+----------+----------+----------+--------------------+-------------------+-------------------+--------------------+-----+
|amplitudeEnvelope|       RMSE|                ZCR|      MFCC1|      MFCC2|     MFCC3|     MFCC4|     MFCC5|     MFCC6|     MFCC7|     MFCC8|     MFCC9|    MFCC10|    MFCC11|    MFCC12|    MFCC13|    MFCC14|    MFCC15|     MFCC16|    MFCC17|    MFCC18|    MFCC19|    MFCC20|    spectralCentroid|  spectralBandwidth|    spectralRolloff|            features|label|
+-----------------+-----------+-------------------+-----------+-----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------+----------+----------+----------+----------+

In [20]:
#split data into sets 
(training_data, test_data) = data_df.randomSplit([0.8, 0.2], seed=1234)

In [21]:
#apply the Logistic regression model
LR = LogisticRegression(maxIter=10, featuresCol='features', labelCol='label')
ParamGrid = ParamGridBuilder()\
    .addGrid(LR.regParam, [0,0.5,1, 2]) \
    .addGrid(LR.elasticNetParam, [0.0, 0.5,1])\
    .build()

eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')

In [22]:
#cross validation model 
cv = CrossValidator(estimator=LR, estimatorParamMaps=ParamGrid, evaluator=eval, numFolds=3)
cv_model = cv.fit(data_df)

25/04/11 18:29:32 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
                                                                                

In [25]:
#predictions 
#training data 
pred_df = cv_model.transform(training_data)
pred_df.select(feature_columns).show()

#test data
test_df = cv_model.transform(test_data)
test_df.select(feature_columns).show()

+-----------------+------------+--------------------+------------+----------+----------+----------+-----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-------------------+--------------------+-------------------+
|amplitudeEnvelope|        RMSE|                 ZCR|       MFCC1|     MFCC2|     MFCC3|     MFCC4|      MFCC5|     MFCC6|     MFCC7|     MFCC8|     MFCC9|    MFCC10|    MFCC11|    MFCC12|    MFCC13|    MFCC14|    MFCC15|    MFCC16|    MFCC17|    MFCC18|    MFCC19|    MFCC20|   spectralCentroid|   spectralBandwidth|    spectralRolloff|
+-----------------+------------+--------------------+------------+----------+----------+----------+-----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-------------------+--------------------+-------

In [26]:
#predictions
#test data
test_df = cv_model.transform(test_data)
test_df.select(feature_columns).show()

+-----------------+------------+--------------------+------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+--------------------+--------------------+--------------------+
|amplitudeEnvelope|        RMSE|                 ZCR|       MFCC1|     MFCC2|     MFCC3|     MFCC4|     MFCC5|     MFCC6|     MFCC7|     MFCC8|     MFCC9|    MFCC10|    MFCC11|    MFCC12|    MFCC13|    MFCC14|    MFCC15|    MFCC16|    MFCC17|    MFCC18|    MFCC19|    MFCC20|    spectralCentroid|   spectralBandwidth|     spectralRolloff|
+-----------------+------------+--------------------+------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+--------------------+--------------------+-----

In [27]:
#model accuracy
accuracy = eval.evaluate(test_df)
print(f"Model accuracy: {accuracy}")

Model accuracy: 0.7151057874515404
