### PySpark Intro

In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=43b2bdb5cad17058bbcb1d260d00c61a222769e4a1ecbba21bc8e77d10fc6701
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [3]:
import pyspark

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Build the session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Tutorial')
    .getOrCreate()
)

In [6]:
# Bare expression: the notebook renders the session object as the cell output.
spark

In [7]:
# First row of Iris.csv is the header. No schema inference is requested, so
# every column is loaded as a string (see the printSchema() output further down).
df_spark = spark.read.csv('Iris.csv', header=True)

In [8]:
# Print a tabular preview of the DataFrame.
df_spark.show()

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
| 11|          5.4|         3.7|          1.5|         0.2|Iris-

In [9]:
# All six columns are reported as string here because the CSV was read
# without schema inference.
df_spark.printSchema()

root
 |-- Id: string (nullable = true)
 |-- SepalLengthCm: string (nullable = true)
 |-- SepalWidthCm: string (nullable = true)
 |-- PetalLengthCm: string (nullable = true)
 |-- PetalWidthCm: string (nullable = true)
 |-- Species: string (nullable = true)



### PySpark Dataframes

In [10]:
# Re-read the file, this time letting Spark infer a type for each column.
df_spark = spark.read.csv('Iris.csv', header=True, inferSchema=True)

In [11]:
# With inferSchema the types are now: Id integer, the four measurements
# double, Species string.
df_spark.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: double (nullable = true)
 |-- SepalWidthCm: double (nullable = true)
 |-- PetalLengthCm: double (nullable = true)
 |-- PetalWidthCm: double (nullable = true)
 |-- Species: string (nullable = true)



In [12]:
# Column names as a plain Python list.
df_spark.columns

['Id',
 'SepalLengthCm',
 'SepalWidthCm',
 'PetalLengthCm',
 'PetalWidthCm',
 'Species']

In [13]:
# head(3) brings the first three rows back as a list of Row objects
# rather than as a DataFrame.
df_spark.head(3)

[Row(Id=1, SepalLengthCm=5.1, SepalWidthCm=3.5, PetalLengthCm=1.4, PetalWidthCm=0.2, Species='Iris-setosa'),
 Row(Id=2, SepalLengthCm=4.9, SepalWidthCm=3.0, PetalLengthCm=1.4, PetalWidthCm=0.2, Species='Iris-setosa'),
 Row(Id=3, SepalLengthCm=4.7, SepalWidthCm=3.2, PetalLengthCm=1.3, PetalWidthCm=0.2, Species='Iris-setosa')]

In [16]:
# Project a single column; the result is a new one-column DataFrame.
df_spark.select('Species').show()

+-----------+
|    Species|
+-----------+
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
+-----------+
only showing top 20 rows



In [17]:
# NOTE(review): the schema names this column 'Id'; 'ID' still resolves here
# (the output header echoes the spelling used in select), which presumably
# relies on Spark's default case-insensitive column resolution.
df_spark.select('ID', 'Species').show()

+---+-----------+
| ID|    Species|
+---+-----------+
|  1|Iris-setosa|
|  2|Iris-setosa|
|  3|Iris-setosa|
|  4|Iris-setosa|
|  5|Iris-setosa|
|  6|Iris-setosa|
|  7|Iris-setosa|
|  8|Iris-setosa|
|  9|Iris-setosa|
| 10|Iris-setosa|
| 11|Iris-setosa|
| 12|Iris-setosa|
| 13|Iris-setosa|
| 14|Iris-setosa|
| 15|Iris-setosa|
| 16|Iris-setosa|
| 17|Iris-setosa|
| 18|Iris-setosa|
| 19|Iris-setosa|
| 20|Iris-setosa|
+---+-----------+
only showing top 20 rows



In [19]:
# Summary statistics (count, mean, stddev, min, max) for every column;
# mean and stddev come back NULL for the string column Species.
df_spark.describe().show()

+-------+------------------+------------------+-------------------+------------------+------------------+--------------+
|summary|                Id|     SepalLengthCm|       SepalWidthCm|     PetalLengthCm|      PetalWidthCm|       Species|
+-------+------------------+------------------+-------------------+------------------+------------------+--------------+
|  count|               150|               150|                150|               150|               150|           150|
|   mean|              75.5| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|          NULL|
| stddev|43.445367992456916|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|          NULL|
|    min|                 1|               4.3|                2.0|               1.0|               0.1|   Iris-setosa|
|    max|               150|               7.9|                4.4|               6.9|               2.5|Iris-virginica|
+-------+------------------+----

In [23]:
from pyspark.sql.functions import concat, lit

# `+` between Columns is arithmetic addition, not string concatenation: Spark
# casts both operands to numbers, so Species + 'new' evaluates to NULL on
# every row. concat() with a literal performs the intended string append.
df = df_spark.withColumn('new species', concat(df_spark['Species'], lit('new')))

In [25]:
# Remove the demo column again; df is rebound to the resulting DataFrame.
df = df.drop('new species')

In [27]:
# df is back to the six original columns after the drop.
df.show()

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
| 11|          5.4|         3.7|          1.5|         0.2|Iris-

In [26]:
# Use the column's exact name ('Id'). withColumnRenamed is a silent no-op when
# the given name is not found, so depending on case-insensitive matching could
# hide a rename that never happened. The result is only displayed, not stored.
df.withColumnRenamed('Id','new ID').show()

+------+-------------+------------+-------------+------------+-----------+
|new ID|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+------+-------------+------------+-------------+------------+-----------+
|     1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|     2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|     3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|     4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|     5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|     6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|     7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|     8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|     9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
|    10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
|    11|          5.4|   

### Handling Missing Values

In [30]:
# Use the column's exact name ('Id'). drop() silently ignores names that are
# not in the schema, so a mismatched spelling could leave the column in place
# without any error. df itself is not modified; only the result is shown.
df.drop('Id').show()

+-------------+------------+-------------+------------+-----------+
|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+-------------+------------+-------------+------------+-----------+
|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
|          5.4|         3.7|          1.5|         0.2|Iris-setosa|
|          4.8|         3.4|          1.6|      

In [37]:
# With no arguments, rows containing any null are dropped. describe() earlier
# reported count=150 for every column, so nothing is removed from this dataset.
df.na.drop().show()

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
| 11|          5.4|         3.7|          1.5|         0.2|Iris-

In [33]:
# how='all' drops a row only when every one of its values is null.
df.na.drop(how='all').show()

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
| 11|          5.4|         3.7|          1.5|         0.2|Iris-

In [38]:
# thresh=2 keeps rows that have at least two non-null values. `how` is
# overridden whenever thresh is given, so it is left at its default.
df.na.drop(thresh=2).show()

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
| 11|          5.4|         3.7|          1.5|         0.2|Iris-

In [40]:
# Only PetalLengthCm is inspected for nulls; other columns may be null.
df.dropna(subset=['PetalLengthCm'], how='any').show()

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
| 11|          5.4|         3.7|          1.5|         0.2|Iris-

In [41]:
# A string replacement value is applied to string columns only; nulls in the
# numeric columns are not filled by this call.
df.na.fill('Missing Value').show()

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
| 11|          5.4|         3.7|          1.5|         0.2|Iris-

In [42]:
# Restrict the fill to the Species column.
df.na.fill('Missing Value', subset='Species').show()

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
| 11|          5.4|         3.7|          1.5|         0.2|Iris-

In [46]:
from pyspark.ml.feature import Imputer

# Numeric columns to impute; each one gets a sibling "<name>_imputed" column.
impute_cols = ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

# Missing values are replaced by the column mean.
imputer = Imputer(
    inputCols=impute_cols,
    outputCols=['{}_imputed'.format(name) for name in impute_cols],
    strategy='mean',
)

In [47]:
# fit() learns the replacement values from df; transform() appends the
# *_imputed columns next to the original ones.
imputer.fit(df).transform(df).show()

+---+-------------+------------+-------------+------------+-----------+----------+---------------------+--------------------+---------------------+--------------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|Id_imputed|SepalLengthCm_imputed|SepalWidthCm_imputed|PetalLengthCm_imputed|PetalWidthCm_imputed|
+---+-------------+------------+-------------+------------+-----------+----------+---------------------+--------------------+---------------------+--------------------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|         1|                  5.1|                 3.5|                  1.4|                 0.2|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|         2|                  4.9|                 3.0|                  1.4|                 0.2|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|         3|                  4.7|                 3.2|                  1.3|        

### Filter Ops

In [48]:
# SQL-style predicate passed as a string; where() is an alias of filter().
df.where('Id>100').show()

+---+-------------+------------+-------------+------------+--------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|       Species|
+---+-------------+------------+-------------+------------+--------------+
|101|          6.3|         3.3|          6.0|         2.5|Iris-virginica|
|102|          5.8|         2.7|          5.1|         1.9|Iris-virginica|
|103|          7.1|         3.0|          5.9|         2.1|Iris-virginica|
|104|          6.3|         2.9|          5.6|         1.8|Iris-virginica|
|105|          6.5|         3.0|          5.8|         2.2|Iris-virginica|
|106|          7.6|         3.0|          6.6|         2.1|Iris-virginica|
|107|          4.9|         2.5|          4.5|         1.7|Iris-virginica|
|108|          7.3|         2.9|          6.3|         1.8|Iris-virginica|
|109|          6.7|         2.5|          5.8|         1.8|Iris-virginica|
|110|          7.2|         3.6|          6.1|         2.5|Iris-virginica|
|111|          6.5|      

In [49]:
# Keep rows with Id above 100, then project just the sepal length.
(
    df
    .filter('Id>100')
    .select('SepalLengthCm')
    .show()
)

+-------------+
|SepalLengthCm|
+-------------+
|          6.3|
|          5.8|
|          7.1|
|          6.3|
|          6.5|
|          7.6|
|          4.9|
|          7.3|
|          6.7|
|          7.2|
|          6.5|
|          6.4|
|          6.8|
|          5.7|
|          5.8|
|          6.4|
|          6.5|
|          7.7|
|          7.7|
|          6.0|
+-------------+
only showing top 20 rows



In [59]:
# Column-expression form of a compound filter. Each comparison needs its own
# parentheses because & binds tighter than >.
(
    df
    .filter((df.Id > 100) & (df.SepalLengthCm > 6.0))
    .show()
)

+---+-------------+------------+-------------+------------+--------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|       Species|
+---+-------------+------------+-------------+------------+--------------+
|101|          6.3|         3.3|          6.0|         2.5|Iris-virginica|
|103|          7.1|         3.0|          5.9|         2.1|Iris-virginica|
|104|          6.3|         2.9|          5.6|         1.8|Iris-virginica|
|105|          6.5|         3.0|          5.8|         2.2|Iris-virginica|
|106|          7.6|         3.0|          6.6|         2.1|Iris-virginica|
|108|          7.3|         2.9|          6.3|         1.8|Iris-virginica|
|109|          6.7|         2.5|          5.8|         1.8|Iris-virginica|
|110|          7.2|         3.6|          6.1|         2.5|Iris-virginica|
|111|          6.5|         3.2|          5.1|         2.0|Iris-virginica|
|112|          6.4|         2.7|          5.3|         1.9|Iris-virginica|
|113|          6.8|      

### Group By and Aggregate

In [62]:
# sum() with no arguments totals every numeric column per species. That
# includes Id, whose sum is not a meaningful quantity.
df.groupBy('Species').sum().show()

+---------------+-------+------------------+------------------+------------------+------------------+
|        Species|sum(Id)|sum(SepalLengthCm)| sum(SepalWidthCm)|sum(PetalLengthCm)| sum(PetalWidthCm)|
+---------------+-------+------------------+------------------+------------------+------------------+
| Iris-virginica|   6275| 329.3999999999999|             148.7|277.59999999999997|101.29999999999998|
|    Iris-setosa|   1275|250.29999999999998|170.90000000000003|              73.2|12.199999999999996|
|Iris-versicolor|   3775|             296.8|138.50000000000003|212.99999999999997|              66.3|
+---------------+-------+------------------+------------------+------------------+------------------+



In [64]:
# Number of rows per species.
df.groupBy('Species').count().show()

+---------------+-----+
|        Species|count|
+---------------+-----+
| Iris-virginica|   50|
|    Iris-setosa|   50|
|Iris-versicolor|   50|
+---------------+-----+



### MLlib

In [71]:
# Define independent features
from pyspark.ml.feature import VectorAssembler

# Pack the four measurement columns into one vector column, 'indp_feats'.
vassem = VectorAssembler(
    inputCols=['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'],
    outputCol='indp_feats',
)

In [72]:
# Apply the assembler: output is df plus the assembled 'indp_feats' column.
output = vassem.transform(df)

In [73]:
# Each row's indp_feats holds its four measurements as a single vector.
output.show()

+---+-------------+------------+-------------+------------+-----------+-----------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|       indp_feats|
+---+-------------+------------+-------------+------------+-----------+-----------------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|[4.6,3.4,1.4,0.3]|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|[5.0,3.4,1.5,0.2]|
|  9|     

In [76]:
from pyspark.ml.feature import StringIndexer

# Encode the Species strings as numeric labels in a new 'species_lbl' column.
indexer = StringIndexer(
    inputCol='Species',
    outputCol='species_lbl',
)
output_fit = (
    indexer
    .fit(output)
    .transform(output)
)

In [77]:
# species_lbl is the numeric code for Species (Iris-setosa maps to 0.0 in the
# rows displayed).
output_fit.show()

+---+-------------+------------+-------------+------------+-----------+-----------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|       indp_feats|species_lbl|
+---+-------------+------------+-------------+------------+-----------+-----------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|        0.0|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|        0.0|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|        0.0|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|        0.0|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|        0.0|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|        0.0|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|[4.6,3.4,1.

In [79]:
# Keep only what the model needs: the feature vector and the numeric label.
fin_data = output_fit.select(['indp_feats', 'species_lbl'])

In [80]:
# Two-column modelling table: feature vector and label.
fin_data.show()

+-----------------+-----------+
|       indp_feats|species_lbl|
+-----------------+-----------+
|[5.1,3.5,1.4,0.2]|        0.0|
|[4.9,3.0,1.4,0.2]|        0.0|
|[4.7,3.2,1.3,0.2]|        0.0|
|[4.6,3.1,1.5,0.2]|        0.0|
|[5.0,3.6,1.4,0.2]|        0.0|
|[5.4,3.9,1.7,0.4]|        0.0|
|[4.6,3.4,1.4,0.3]|        0.0|
|[5.0,3.4,1.5,0.2]|        0.0|
|[4.4,2.9,1.4,0.2]|        0.0|
|[4.9,3.1,1.5,0.1]|        0.0|
|[5.4,3.7,1.5,0.2]|        0.0|
|[4.8,3.4,1.6,0.2]|        0.0|
|[4.8,3.0,1.4,0.1]|        0.0|
|[4.3,3.0,1.1,0.1]|        0.0|
|[5.8,4.0,1.2,0.2]|        0.0|
|[5.7,4.4,1.5,0.4]|        0.0|
|[5.4,3.9,1.3,0.4]|        0.0|
|[5.1,3.5,1.4,0.3]|        0.0|
|[5.7,3.8,1.7,0.3]|        0.0|
|[5.1,3.8,1.5,0.3]|        0.0|
+-----------------+-----------+
only showing top 20 rows



In [82]:
from pyspark.ml.regression import LinearRegression

# Seed the 80/20 split so the train/test membership, and with it the fitted
# coefficients and test error, can be reproduced on a re-run. Outputs recorded
# from an unseeded run will differ after re-executing.
train, test = fin_data.randomSplit([0.8, 0.2], seed=42)

# NOTE(review): species_lbl is a class index produced by StringIndexer; linear
# regression treats it as a continuous target. Fine as an API demo, but a
# classifier would be the usual choice for this label.
lr_estimator = LinearRegression(featuresCol='indp_feats', labelCol='species_lbl')

# `lr` is the *fitted model*; the cells that follow read lr.coefficients,
# lr.intercept and call lr.evaluate().
lr = lr_estimator.fit(train)

In [83]:
# One weight per entry of the indp_feats vector (four measurements).
lr.coefficients

DenseVector([-0.0117, -0.0944, 0.1945, 0.5878])

In [84]:
# Constant (bias) term of the fitted linear model.
lr.intercept

-0.06922477476197456

In [85]:
# Score the held-out split. Despite its name, `pred` is an evaluation summary:
# later cells read both .predictions and .meanSquaredError from it.
pred = lr.evaluate(test)

In [86]:
# Features, true label and the model's continuous prediction for each test row.
pred.predictions.show()

+-----------------+-----------+--------------------+
|       indp_feats|species_lbl|          prediction|
+-----------------+-----------+--------------------+
|[4.3,3.0,1.1,0.1]|        0.0|-0.13012589141233516|
|[4.5,2.3,1.3,0.3]|        0.0| 0.09008313676437646|
|[4.9,2.5,4.5,1.7]|        2.0|  1.5118380193843406|
|[4.9,3.0,1.4,0.2]|        0.0|-0.02002480713213...|
|[5.0,2.0,3.5,1.0]|        1.0|  0.9519177172071485|
|[5.0,3.5,1.6,0.6]|        0.0| 0.20561026398130447|
|[5.1,2.5,3.0,1.1]|        1.0|  0.8650615571892473|
|[5.1,3.5,1.4,0.3]|        0.0|-0.01079895656579...|
|[5.4,3.4,1.5,0.4]|        0.0| 0.07335749296928437|
|[5.4,3.9,1.3,0.4]|        0.0|-0.01275355303641...|
|[5.5,2.3,4.0,1.3]|        1.0|   1.191323560528746|
|[5.6,2.8,4.9,2.0]|        2.0|  1.7294498627735335|
|[5.8,2.7,5.1,1.9]|        2.0|  1.7166712590073694|
|[5.8,4.0,1.2,0.2]|        0.0|-0.16389015909791865|
|[6.0,2.7,5.1,1.6]|        1.0|   1.537991861471539|
|[6.2,2.2,4.5,1.5]|        1.0|  1.40737509427

In [87]:
# Mean squared error of the predictions on the test split.
pred.meanSquaredError

0.07574525743536356

### Databricks

Created a Databricks cluster and performed multiple linear regression.