In [81]:
import pyspark
import librosa
import math
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator


In [None]:
# Get (or reuse) the Spark session for this notebook; the app name only
# applies when a new session is actually created.
spark = (
    SparkSession.builder
    .appName("gradientBoost")
    .getOrCreate()
)
spark

25/04/11 19:18:38 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [83]:
# Load the per-frame audio feature table. The first CSV row is the header,
# and column types are inferred by Spark rather than declared up front.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("wavData.csv")
)
data.show()

+-------+---------------+------------+-----------------+-----------+-------------------+-----------+-----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------+----------+----------+----------+----------+--------------------+-------------------+-------------------+
|frameID|       filename|intelligence|amplitudeEnvelope|       RMSE|                ZCR|      MFCC1|      MFCC2|     MFCC3|     MFCC4|     MFCC5|     MFCC6|     MFCC7|     MFCC8|     MFCC9|    MFCC10|    MFCC11|    MFCC12|    MFCC13|    MFCC14|    MFCC15|     MFCC16|    MFCC17|    MFCC18|    MFCC19|    MFCC20|    spectralCentroid|  spectralBandwidth|    spectralRolloff|
+-------+---------------+------------+-----------------+-----------+-------------------+-----------+-----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------

In [84]:
# Print the column names and the types Spark inferred for them, to confirm
# that the feature columns were parsed as numeric before assembling vectors.
data.printSchema()

root
 |-- frameID: integer (nullable = true)
 |-- filename: string (nullable = true)
 |-- intelligence: string (nullable = true)
 |-- amplitudeEnvelope: double (nullable = true)
 |-- RMSE: double (nullable = true)
 |-- ZCR: double (nullable = true)
 |-- MFCC1: double (nullable = true)
 |-- MFCC2: double (nullable = true)
 |-- MFCC3: double (nullable = true)
 |-- MFCC4: double (nullable = true)
 |-- MFCC5: double (nullable = true)
 |-- MFCC6: double (nullable = true)
 |-- MFCC7: double (nullable = true)
 |-- MFCC8: double (nullable = true)
 |-- MFCC9: double (nullable = true)
 |-- MFCC10: double (nullable = true)
 |-- MFCC11: double (nullable = true)
 |-- MFCC12: double (nullable = true)
 |-- MFCC13: double (nullable = true)
 |-- MFCC14: double (nullable = true)
 |-- MFCC15: double (nullable = true)
 |-- MFCC16: double (nullable = true)
 |-- MFCC17: double (nullable = true)
 |-- MFCC18: double (nullable = true)
 |-- MFCC19: double (nullable = true)
 |-- MFCC20: double (nullable = true)


In [85]:
# Column roles for the model.
# The label column is referenced by name rather than by position
# (previously `data.columns[2]`), so reordering the CSV columns cannot
# silently change which column is used as the target.
categorical_column = 'intelligence'

# Numeric per-frame audio descriptors that are packed into the feature vector.
feature_columns = [
    'amplitudeEnvelope', 'RMSE', 'ZCR',
    'MFCC1', 'MFCC2', 'MFCC3', 'MFCC4', 'MFCC5', 'MFCC6',
    'MFCC7', 'MFCC8', 'MFCC9', 'MFCC10', 'MFCC11', 'MFCC12',
    'MFCC13', 'MFCC14', 'MFCC15', 'MFCC16', 'MFCC17', 'MFCC18',
    'MFCC19', 'MFCC20', 'spectralCentroid', 'spectralBandwidth', 'spectralRolloff'
]

# Fail fast with a readable message if the CSV does not contain what the
# pipeline expects, instead of erroring later inside pipeline.fit().
missing_columns = [
    name for name in [categorical_column] + feature_columns
    if name not in data.columns
]
if missing_columns:
    raise ValueError(f"wavData.csv is missing expected columns: {missing_columns}")

# StringIndexer maps the string class names to numeric indices in 'label';
# VectorAssembler concatenates the numeric columns into a single 'features' vector.
indexer = StringIndexer(inputCol=categorical_column, outputCol='label')
vectorassem_stage = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [86]:
# Chain the label indexer and the feature assembler into one pipeline,
# fit it on the raw table, and keep only the model inputs and outputs.
pipeline = Pipeline(stages=[indexer, vectorassem_stage])
pipeline_model = pipeline.fit(data)

# Original numeric columns, followed by the assembled vector and indexed label.
f_columns = [*feature_columns, 'features', 'label']
data_df = pipeline_model.transform(data).select(*f_columns)
data_df.show()

+-----------------+-----------+-------------------+-----------+-----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------+----------+----------+----------+----------+--------------------+-------------------+-------------------+--------------------+-----+
|amplitudeEnvelope|       RMSE|                ZCR|      MFCC1|      MFCC2|     MFCC3|     MFCC4|     MFCC5|     MFCC6|     MFCC7|     MFCC8|     MFCC9|    MFCC10|    MFCC11|    MFCC12|    MFCC13|    MFCC14|    MFCC15|     MFCC16|    MFCC17|    MFCC18|    MFCC19|    MFCC20|    spectralCentroid|  spectralBandwidth|    spectralRolloff|            features|label|
+-----------------+-----------+-------------------+-----------+-----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------+----------+----------+----------+----------+

In [87]:
# 80/20 random split into training and test sets; the fixed seed keeps the
# split repeatable across runs.
# NOTE(review): rows look like per-frame records with a `filename` column, so
# a row-level random split may put frames of one recording on both sides —
# confirm whether a per-file split is needed.
training_data, test_data = data_df.randomSplit(weights=[0.8, 0.2], seed=1234)

In [88]:
# Gradient-boosted tree classifier reading the assembled vector and indexed label.
gradient_boost_t = GBTClassifier(labelCol='label', featuresCol='features')

# Hyper-parameter search space: 3 depths x 4 info-gain thresholds x 4 step
# sizes = 48 candidate settings.
ParamGrid = (
    ParamGridBuilder()
    .addGrid(gradient_boost_t.maxDepth, [2, 3, 4])
    .addGrid(gradient_boost_t.minInfoGain, [0.0, 0.1, 0.2, 0.3])
    .addGrid(gradient_boost_t.stepSize, [0.05, 0.1, 0.2, 0.4])
    .build()
)

In [89]:
# Evaluator used both for model selection and for the final score.
# ROC/PR metrics need a continuous score, so read the model's 'rawPrediction'
# column; feeding the hard 0/1 'prediction' column collapses the curve to a
# single operating point and understates ranking quality.
# The metric is stated explicitly so it is clear this is AUC, not accuracy.
# NOTE: the name `eval` shadows Python's built-in eval(); it is kept because
# other cells reference it.
eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)

In [None]:
# 3-fold cross-validated grid search over ParamGrid.
# The seed fixes the fold assignment so the search is repeatable.
cv = CrossValidator(
    estimator=gradient_boost_t,
    estimatorParamMaps=ParamGrid,
    evaluator=eval,
    numFolds=3,
    seed=1234,
)

# Fit on the training split only. Fitting on the full `data_df` would let the
# model (and the hyper-parameter selection) see the held-out test rows, making
# the test score an over-optimistic estimate.
cv_model = cv.fit(training_data)

In [None]:
# Predictions on the training split (a sanity check on how well the model fits).
pred_df = cv_model.transform(training_data)

# Show the true label next to the model output; selecting only the input
# feature columns would hide the predictions this cell is meant to inspect.
pred_df.select('label', 'prediction', 'probability').show()

In [None]:
# Predictions on the held-out test split.
test_df = cv_model.transform(test_data)

# Show the true label next to the model output; selecting only the input
# feature columns would hide the predictions this cell is meant to inspect.
test_df.select('label', 'prediction', 'probability').show()


In [None]:
# Score the test-set predictions with the evaluator.
# BinaryClassificationEvaluator reports area under ROC by default, not
# classification accuracy, so the printed label is taken from the evaluator
# itself instead of being hard-coded as "accuracy".
# The variable name `accuracy` is kept in case later cells reference it.
accuracy = eval.evaluate(test_df)
print(f"Model {eval.getMetricName()}: {accuracy}")