# Airfoil Self-Noise Prediction using PySpark

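This project uses the NASA Airfoil Self-Noise dataset to predict the sound pressure level from five input features (frequency, angle of attack, chord length, free-stream velocity, and suction thickness) by training a linear regression model with Apache Spark (PySpark).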

In [None]:
# Install the pinned PySpark release (3.1.2) and findspark; -q keeps pip's output quiet.
!pip install pyspark==3.1.2 -q
!pip install findspark -q

In [None]:
import warnings

# Silence every warning category for the rest of the session. This is installed
# before findspark is imported so warnings raised during that import are hidden too.
warnings.simplefilter('ignore')

import findspark

# Locate the Spark installation and make pyspark importable.
findspark.init()

In [None]:
from pyspark.sql import SparkSession

# Obtain the SparkSession for this notebook: an already-running session is
# reused, otherwise a new one is started under the given application name.
spark = (
    SparkSession.builder
    .appName("AirfoilNoisePrediction")
    .getOrCreate()
)

In [None]:
import os

# Make sure the directory that holds the input CSV exists; exist_ok=True makes
# this a no-op when the directory is already there.
data_dir = "/tmp"
os.makedirs(data_dir, exist_ok=True)

In [None]:
from pyspark.sql.types import StructType, StructField, DoubleType

# Column names in schema order; every column is declared as a nullable double.
column_names = [
    "Frequency",
    "Angle_of_attack",
    "Chord_length",
    "Free_stream_velocity",
    "Suction_thickness",
    "Sound_pressure_level",
]
schema = StructType(
    [StructField(column, DoubleType(), True) for column in column_names]
)

# Read the CSV with the explicit schema rather than inferring types.
# NOTE(review): header=False treats the first line of the file as data -
# confirm the file really has no header row.
file_path = "/tmp/NASA_airfoil_noise_raw.csv"
df = spark.read.csv(path=file_path, header=False, schema=schema)
df.show(5)

In [None]:
# Print the column names/types, then the basic summary statistics that
# DataFrame.describe() computes for each column.
df.printSchema()
summary_stats = df.describe()
summary_stats.show()

## Feature Engineering and Vector Assembling

In [None]:
from pyspark.ml.feature import VectorAssembler

# Input columns that are packed into the model's feature vector.
feature_cols = [
    "Frequency", "Angle_of_attack", "Chord_length",
    "Free_stream_velocity", "Suction_thickness"
]
# Column the model is trained to predict.
label_col = "Sound_pressure_level"

# Every schema column is nullable, and Spark's CSV reader in its default
# permissive mode yields nulls for fields it cannot parse. VectorAssembler
# raises on null inputs by default (handleInvalid="error"), and a null label
# cannot be used for training, so drop incomplete rows explicitly first.
clean_df = df.na.drop(subset=feature_cols + [label_col])

# Combine the feature columns into a single vector column named "features"
# and keep only that vector plus the label.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(clean_df).select("features", label_col)
data.show(5)

## Train-Test Split and Model Training

In [None]:
from pyspark.ml.regression import LinearRegression

# Random 80/20 split into training and test sets; the fixed seed keeps the
# split repeatable across runs.
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=42)

# Fit a linear regression of the sound pressure level on the feature vector.
lr = LinearRegression(
    labelCol="Sound_pressure_level",
    featuresCol="features",
)
model = lr.fit(train_data)

## Model Evaluation

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the held-out test rows; transform() adds a "prediction" column.
predictions = model.transform(test_data)

# Compare predictions with the true label using root mean squared error.
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="Sound_pressure_level",
)
rmse = evaluator.evaluate(predictions)

print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

In [None]:
# Report the fitted parameters of the linear model: the coefficient vector
# and the intercept term.
coefficients, intercept = model.coefficients, model.intercept
print("Model Coefficients:", coefficients)
print("Model Intercept:", intercept)

## Preview Final Predictions

In [None]:
# Display the first 10 predicted values next to the actual sound pressure level.
prediction_preview = predictions.select("prediction", "Sound_pressure_level")
prediction_preview.show(10)