In [1]:
import findspark 
# Initialise findspark before any pyspark import runs in the cells below.
# NOTE(review): presumably this makes the local Spark installation importable — confirm against the findspark docs.
findspark.init()

In [10]:
from pyspark.sql import  SparkSession
from pyspark.ml.feature import  VectorAssembler, StandardScaler
from pyspark.ml.pipeline import PipelineModel 
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression


In [3]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('Machine Learning task')
    .getOrCreate()
)

In [4]:
# Load the cleaned airfoil-noise dataset from Parquet.
# Parquet files store their own schema and have no header row, so the CSV-only
# `header` / `inferSchema` reader options that were passed here have been removed.
cleaned_data = spark.read.parquet('NASA_airfoil_noise_cleaned.parquet')

In [5]:
# Count the rows of the loaded DataFrame and report the total.
rowCount = cleaned_data.count()
row_count_label = 'Total number of rows in the cleaned dataset: '
print(row_count_label, rowCount)

Total number of rows in the cleaned dataset:  1499


In [6]:
# Preview the first five rows without truncating wide column values.
cleaned_data.show(n=5, truncate=False)

+---------+-------------+-----------+------------------+-----------------------+------------------+
|Frequency|AngleOfAttack|ChordLength|FreeStreamVelocity|SuctionSideDisplacement|SoundLevelDecibels|
+---------+-------------+-----------+------------------+-----------------------+------------------+
|4000     |3.0          |0.3048     |31.7              |0.00529514             |115.608           |
|3150     |2.0          |0.2286     |31.7              |0.00372371             |121.527           |
|2000     |7.3          |0.2286     |31.7              |0.0132672              |115.309           |
|2000     |5.4          |0.1524     |71.3              |0.00401199             |131.111           |
|500      |9.9          |0.1524     |71.3              |0.0193001              |131.279           |
+---------+-------------+-----------+------------------+-----------------------+------------------+
only showing top 5 rows



In [7]:
# Stage 1: pack the five predictor columns into a single vector column named 'features'.
assembler = VectorAssembler(
    inputCols=[
        'Frequency',
        'AngleOfAttack',
        'ChordLength',
        'FreeStreamVelocity',
        'SuctionSideDisplacement',
    ],
    outputCol='features',
)

In [8]:
# Stage 2: standardise the assembled vector (mean-centring and unit-std scaling both
# enabled) and write the result to 'scaledFeatures'.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withMean=True,
    withStd=True,
)

In [9]:
# Stage 3: linear regression that predicts 'SoundLevelDecibels' from the scaled features.
lr = LinearRegression(
    featuresCol='scaledFeatures',
    labelCol='SoundLevelDecibels',
)

In [11]:
# Chain the stages in execution order: assemble -> scale -> regress.
pipeline = Pipeline(
    stages=[
        assembler,
        scaler,
        lr,
    ]
)

In [12]:
# Split the data with 70/30 weights into training and testing sets; the seed is fixed at 42.
training_data, testing_data = cleaned_data.randomSplit(weights=[0.7, 0.3], seed=42)

In [13]:
# Fit every pipeline stage on the training split only; the result is the fitted pipeline model.
model = pipeline.fit(training_data)

In [14]:
# Run the fitted pipeline over the held-out testing split; the output shown in the next
# cell includes the 'features', 'scaledFeatures' and 'prediction' columns.
predictions = model.transform(testing_data)

In [15]:
# Inspect the first five scored rows, leaving the vector columns untruncated.
predictions.show(n=5, truncate=False)

+---------+-------------+-----------+------------------+-----------------------+------------------+----------------------------------+-----------------------------------------------------------------------------------------------------+------------------+
|Frequency|AngleOfAttack|ChordLength|FreeStreamVelocity|SuctionSideDisplacement|SoundLevelDecibels|features                          |scaledFeatures                                                                                       |prediction        |
+---------+-------------+-----------+------------------+-----------------------+------------------+----------------------------------+-----------------------------------------------------------------------------------------------------+------------------+
|200      |7.3          |0.2286     |31.7              |0.0132672              |128.679           |[200.0,7.3,0.2286,31.7,0.0132672] |[-0.8492530428836075,0.08701196802079204,0.9780751308754603,-1.2352001929432386,0.1617088383377834

In [16]:
# Mean squared error of the predictions against the true sound level on the testing split.
evaluator = RegressionEvaluator(
    labelCol='SoundLevelDecibels',
    predictionCol='prediction',
    metricName='mse',
)
mse = evaluator.evaluate(predictions)
print("MSE: ", mse)

MSE:  24.997666255020174


In [17]:
# Mean absolute error of the predictions against the true sound level on the testing split.
evaluator = RegressionEvaluator(
    labelCol='SoundLevelDecibels',
    predictionCol='prediction',
    metricName='mae',
)
mae = evaluator.evaluate(predictions)
print('MAE: ', mae)

MAE:  3.913679095881314


In [18]:
# R-squared of the predictions against the true sound level on the testing split.
evaluator = RegressionEvaluator(
    labelCol='SoundLevelDecibels',
    predictionCol='prediction',
    metricName='r2',
)
r2 = evaluator.evaluate(predictions)
print('R2 Score: ', r2)

R2 Score:  0.4959688408975431


In [19]:
# Persist the fitted pipeline model to 'project_2_model', replacing any existing copy at that path.
(
    model.write()
    .overwrite()
    .save('project_2_model')
)

In [20]:
# Shut down the Spark session now that the model has been saved; no later cell uses it.
spark.stop()