In [38]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/airfoil-selfnoise-dataset/AirfoilSelfNoise.csv


In [39]:
!pip install pyspark



In [40]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [41]:
spark = SparkSession.builder.getOrCreate()

In [42]:
spark

In [43]:
df = spark.read.csv("/kaggle/input/airfoil-selfnoise-dataset",header=True,inferSchema='True')

In [44]:
df.printSchema()

root
 |-- f: integer (nullable = true)
 |-- alpha: double (nullable = true)
 |-- c: double (nullable = true)
 |-- U_infinity: double (nullable = true)
 |-- delta: double (nullable = true)
 |-- SSPL: double (nullable = true)



In [45]:
df.show(2)

+----+-----+------+----------+----------+-------+
|   f|alpha|     c|U_infinity|     delta|   SSPL|
+----+-----+------+----------+----------+-------+
| 800|  0.0|0.3048|      71.3|0.00266337|126.201|
|1000|  0.0|0.3048|      71.3|0.00266337|125.201|
+----+-----+------+----------+----------+-------+
only showing top 2 rows



In [46]:
#train and test split
train,test = df.randomSplit([0.7,0.3])

In [47]:
print("train",train.count())
print("test",test.count())

train 1060
test 443


In [48]:
train.show(2)
test.show(2)

+---+-----+------+----------+---------+-------+
|  f|alpha|     c|U_infinity|    delta|   SSPL|
+---+-----+------+----------+---------+-------+
|200|  7.3|0.2286|      31.7|0.0132672|128.679|
|200|  7.3|0.2286|      71.3|0.0104404|138.758|
+---+-----+------+----------+---------+-------+
only showing top 2 rows

+---+-----+------+----------+----------+-------+
|  f|alpha|     c|U_infinity|     delta|   SSPL|
+---+-----+------+----------+----------+-------+
|200|  0.0|0.3048|      31.7|0.00331266|117.195|
|200|  0.0|0.3048|      39.6|0.00310138|118.129|
+---+-----+------+----------+----------+-------+
only showing top 2 rows



In [49]:
#Identifying the categorical and numerical columns
cat_cols = [x for x, dataType in train.dtypes if dataType=="String"]
num_cols = [x for x, dataType in train.dtypes if dataType!="String"]

In [50]:
print(cat_cols)

[]


In [51]:
print(num_cols)

['f', 'alpha', 'c', 'U_infinity', 'delta', 'SSPL']


In [52]:
from pyspark.ml.feature import VectorAssembler

In [53]:
assembler_input = [x for x in num_cols]

In [54]:
assembler_input

['f', 'alpha', 'c', 'U_infinity', 'delta', 'SSPL']

In [55]:
vector_assembler = VectorAssembler(inputCols=assembler_input, outputCol="VectorAssemblerFeature")

In [56]:
stages=[]
stages += [vector_assembler]

In [57]:
stages

[VectorAssembler_90c8b48847b9]

In [58]:
%%time
from pyspark.ml import Pipeline

pipeline = Pipeline().setStages(stages)
model = pipeline.fit(train)
transformed = model.transform(test)


CPU times: user 2.11 ms, sys: 2.75 ms, total: 4.86 ms
Wall time: 56.1 ms


In [59]:
pp_df = model.transform(test)

In [60]:
pp_df.select('f', 'alpha', 'c', 'U_infinity', 'delta', 'SSPL',"VectorAssemblerFeature")

DataFrame[f: int, alpha: double, c: double, U_infinity: double, delta: double, SSPL: double, VectorAssemblerFeature: vector]

In [61]:
pp_df.show(truncate=False)

+---+-----+------+----------+----------+-------+------------------------------------------+
|f  |alpha|c     |U_infinity|delta     |SSPL   |VectorAssemblerFeature                    |
+---+-----+------+----------+----------+-------+------------------------------------------+
|200|0.0  |0.3048|31.7      |0.00331266|117.195|[200.0,0.0,0.3048,31.7,0.00331266,117.195]|
|200|0.0  |0.3048|39.6      |0.00310138|118.129|[200.0,0.0,0.3048,39.6,0.00310138,118.129]|
|200|7.3  |0.2286|39.6      |0.0123481 |130.989|[200.0,7.3,0.2286,39.6,0.0123481,130.989] |
|200|7.3  |0.2286|55.5      |0.0111706 |135.234|[200.0,7.3,0.2286,55.5,0.0111706,135.234] |
|200|8.9  |0.1016|71.3      |0.0103088 |133.503|[200.0,8.9,0.1016,71.3,0.0103088,133.503] |
|200|9.5  |0.0254|31.7      |0.00461377|119.146|[200.0,9.5,0.0254,31.7,0.00461377,119.146]|
|200|12.3 |0.1016|31.7      |0.0418756 |124.987|[200.0,12.3,0.1016,31.7,0.0418756,124.987]|
|200|12.6 |0.1524|39.6      |0.0584113 |114.75 |[200.0,12.6,0.1524,39.6,0.058411

In [62]:
from pyspark.ml.regression import LinearRegression

In [63]:
data = pp_df.select(F.col("VectorAssemblerFeature").alias("features"),
                    F.col("alpha").alias("label"),
                    )

In [64]:
data.show(5,truncate=False)

+------------------------------------------+-----+
|features                                  |label|
+------------------------------------------+-----+
|[200.0,0.0,0.3048,31.7,0.00331266,117.195]|0.0  |
|[200.0,0.0,0.3048,39.6,0.00310138,118.129]|0.0  |
|[200.0,7.3,0.2286,39.6,0.0123481,130.989] |7.3  |
|[200.0,7.3,0.2286,55.5,0.0111706,135.234] |7.3  |
|[200.0,8.9,0.1016,71.3,0.0103088,133.503] |8.9  |
+------------------------------------------+-----+
only showing top 5 rows



In [65]:
%%time
model = LinearRegression().fit(data)

23/11/20 13:41:45 WARN Instrumentation: [2eea4873] regParam is zero, which might cause numerical instability and overfitting.


CPU times: user 20 ms, sys: 6.23 ms, total: 26.3 ms
Wall time: 577 ms


In [66]:
print("MAE",model.summary.meanAbsoluteError)

MAE 3.096936518048418e-15


In [67]:
print("MSE",model.summary.meanSquaredError)

MSE 1.9556993635228837e-29


In [68]:
print("r2 score",model.summary.r2)

r2 score 1.0


In [69]:
print("RMSE",model.summary.rootMeanSquaredError)

RMSE 4.422328983152298e-15


In [72]:
#save the model
model.save("lin_model")

In [73]:
#load the model
loded_model = model.load("lin_model")