In [1]:
spark

In [2]:
type(spark)

pyspark.sql.session.SparkSession

In [7]:
movies = spark.read.format("csv").option("header", True).load("/data/movielens/movies.csv")
movies.show(10)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
+-------+--------------------+--------------------+
only showing top 10 rows



In [9]:
type(movies)

pyspark.sql.dataframe.DataFrame

In [8]:
movies.dtypes

[('movieId', 'string'), ('title', 'string'), ('genres', 'string')]

In [12]:
movies = (spark
.read
.format("csv")
.option("header", True)
.option("inferSchema", True)
.load("/data/movielens/movies.csv"))

movies.dtypes

[('movieId', 'int'), ('title', 'string'), ('genres', 'string')]

In [14]:
movies.show(5, False)

+-------+----------------------------------+-------------------------------------------+
|movieId|title                             |genres                                     |
+-------+----------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                  |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)                    |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)           |Comedy|Romance                             |
|4      |Waiting to Exhale (1995)          |Comedy|Drama|Romance                       |
|5      |Father of the Bride Part II (1995)|Comedy                                     |
+-------+----------------------------------+-------------------------------------------+
only showing top 5 rows



In [15]:
ratings = (spark
.read
.format("csv")
.option("header", True)
.option("inferSchema", True)
.load("/data/movielens/ratings.csv"))

ratings.dtypes

[('userId', 'int'),
 ('movieId', 'int'),
 ('rating', 'double'),
 ('timestamp', 'int')]

In [16]:
ratings.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|     16|   4.0|1217897793|
|     1|     24|   1.5|1217895807|
|     1|     32|   4.0|1217896246|
|     1|     47|   4.0|1217896556|
|     1|     50|   4.0|1217896523|
|     1|    110|   4.0|1217896150|
|     1|    150|   3.0|1217895940|
|     1|    161|   4.0|1217897864|
|     1|    165|   3.0|1217897135|
|     1|    204|   0.5|1217895786|
|     1|    223|   4.0|1217897795|
|     1|    256|   0.5|1217895764|
|     1|    260|   4.5|1217895864|
|     1|    261|   1.5|1217895750|
|     1|    277|   0.5|1217895772|
|     1|    296|   4.0|1217896125|
|     1|    318|   4.0|1217895860|
|     1|    349|   4.5|1217897058|
|     1|    356|   3.0|1217896231|
|     1|    377|   2.5|1217896373|
+------+-------+------+----------+
only showing top 20 rows



In [19]:
from pyspark.sql.functions import *

In [50]:
(ratings
 .groupBy("movieId")
 .agg(avg("rating").alias("avg_rating"), count("rating").alias("rating_count"))
 .filter("rating_count>=100")
 .alias("t1")
 .join(movies.alias("t2"), col("t1.movieId") == col("t2.movieId"))
 .orderBy(desc("avg_rating"))
 .select("t1.movieId", "title", "avg_rating")
 .limit(10)
).show(10, False)

+-------+--------------------------------------+------------------+
|movieId|title                                 |avg_rating        |
+-------+--------------------------------------+------------------+
|318    |Shawshank Redemption, The (1994)      |4.454545454545454 |
|858    |Godfather, The (1972)                 |4.392857142857143 |
|50     |Usual Suspects, The (1995)            |4.328947368421052 |
|1136   |Monty Python and the Holy Grail (1975)|4.3019480519480515|
|527    |Schindler's List (1993)               |4.296370967741935 |
|1193   |One Flew Over the Cuckoo's Nest (1975)|4.2727272727272725|
|608    |Fargo (1996)                          |4.2711442786069655|
|2571   |Matrix, The (1999)                    |4.264367816091954 |
|1221   |Godfather: Part II, The (1974)        |4.260714285714286 |
|1213   |Goodfellas (1990)                     |4.2592592592592595|
+-------+--------------------------------------+------------------+



In [28]:
sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [29]:
movies.createOrReplaceTempView("movies")
ratings.createOrReplaceTempView("ratings")


In [30]:
sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |   movies|       true|
|        |  ratings|       true|
+--------+---------+-----------+



In [51]:
agg = sql("""
select 
    ratings.movieId, avg(rating) avg_rating, title
from ratings 
join movies on ratings.movieId = movies.movieId
group by ratings.movieId, title
having count(*) >=100
order by avg_rating desc
limit 10
""")

agg.show(10, False)

+-------+------------------+--------------------------------------+
|movieId|avg_rating        |title                                 |
+-------+------------------+--------------------------------------+
|318    |4.454545454545454 |Shawshank Redemption, The (1994)      |
|858    |4.392857142857143 |Godfather, The (1972)                 |
|50     |4.328947368421052 |Usual Suspects, The (1995)            |
|1136   |4.3019480519480515|Monty Python and the Holy Grail (1975)|
|527    |4.296370967741935 |Schindler's List (1993)               |
|1193   |4.2727272727272725|One Flew Over the Cuckoo's Nest (1975)|
|608    |4.2711442786069655|Fargo (1996)                          |
|2571   |4.264367816091954 |Matrix, The (1999)                    |
|1221   |4.260714285714286 |Godfather: Part II, The (1974)        |
|1213   |4.2592592592592595|Goodfellas (1990)                     |
+-------+------------------+--------------------------------------+



In [52]:
agg.write.format("csv").save("/tmp/movies_agg")

In [54]:
agg.write.format("json").mode("overwrite").save("/tmp/movies_agg")

In [57]:
kdd = spark.read.format("csv").load("/data/kddcup.data")

In [59]:
columns = [f.split(":")[0] for f in """
duration: continuous.
protocol_type: symbolic.
service: symbolic.
flag: symbolic.
src_bytes: continuous.
dst_bytes: continuous.
land: symbolic.
wrong_fragment: continuous.
urgent: continuous.
hot: continuous.
num_failed_logins: continuous.
logged_in: symbolic.
num_compromised: continuous.
root_shell: continuous.
su_attempted: continuous.
num_root: continuous.
num_file_creations: continuous.
num_shells: continuous.
num_access_files: continuous.
num_outbound_cmds: continuous.
is_host_login: symbolic.
is_guest_login: symbolic.
count: continuous.
srv_count: continuous.
serror_rate: continuous.
srv_serror_rate: continuous.
rerror_rate: continuous.
srv_rerror_rate: continuous.
same_srv_rate: continuous.
diff_srv_rate: continuous.
srv_diff_host_rate: continuous.
dst_host_count: continuous.
dst_host_srv_count: continuous.
dst_host_same_srv_rate: continuous.
dst_host_diff_srv_rate: continuous.
dst_host_same_src_port_rate: continuous.
dst_host_srv_diff_host_rate: continuous.
dst_host_serror_rate: continuous.
dst_host_srv_serror_rate: continuous.
dst_host_rerror_rate: continuous.
dst_host_srv_rerror_rate: continuous.
""".split("\n") if len(f)>0]

columns.append("Category")
print(columns)

['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'Category']


In [62]:
kdd = kdd.toDF(*columns)

In [65]:
kdd.count()

4898431

In [64]:
kdd.cache()

DataFrame[duration: string, protocol_type: string, service: string, flag: string, src_bytes: string, dst_bytes: string, land: string, wrong_fragment: string, urgent: string, hot: string, num_failed_logins: string, logged_in: string, num_compromised: string, root_shell: string, su_attempted: string, num_root: string, num_file_creations: string, num_shells: string, num_access_files: string, num_outbound_cmds: string, is_host_login: string, is_guest_login: string, count: string, srv_count: string, serror_rate: string, srv_serror_rate: string, rerror_rate: string, srv_rerror_rate: string, same_srv_rate: string, diff_srv_rate: string, srv_diff_host_rate: string, dst_host_count: string, dst_host_srv_count: string, dst_host_same_srv_rate: string, dst_host_diff_srv_rate: string, dst_host_same_src_port_rate: string, dst_host_srv_diff_host_rate: string, dst_host_serror_rate: string, dst_host_srv_serror_rate: string, dst_host_rerror_rate: string, dst_host_srv_rerror_rate: string, Category: string

In [69]:
import pandas as pd

In [70]:
pd.options.display.max_columns = 1000

In [71]:
kdd.limit(10).toPandas()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,Category
0,0,tcp,http,SF,215,45076,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.
5,0,tcp,http,SF,238,1282,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,4,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5,5,1.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,normal.
6,0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6,6,1.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,normal.
7,0,tcp,http,SF,234,1364,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,7,7,1.0,0.0,0.14,0.0,0.0,0.0,0.0,0.0,normal.
8,0,tcp,http,SF,239,1295,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,7,7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,8,8,1.0,0.0,0.12,0.0,0.0,0.0,0.0,0.0,normal.
9,0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.


In [74]:
kdd.groupBy("Category").count().orderBy(desc("count")).show()

+----------------+-------+
|        Category|  count|
+----------------+-------+
|          smurf.|2807886|
|        neptune.|1072017|
|         normal.| 972781|
|          satan.|  15892|
|        ipsweep.|  12481|
|      portsweep.|  10413|
|           nmap.|   2316|
|           back.|   2203|
|    warezclient.|   1020|
|       teardrop.|    979|
|            pod.|    264|
|   guess_passwd.|     53|
|buffer_overflow.|     30|
|           land.|     21|
|    warezmaster.|     20|
|           imap.|     12|
|        rootkit.|     10|
|     loadmodule.|      9|
|      ftp_write.|      8|
|       multihop.|      7|
+----------------+-------+
only showing top 20 rows



In [76]:
kdd.groupBy("Category").count().toPandas().sort_values("count", ascending = False)

Unnamed: 0,Category,count
1,smurf.,2807886
19,neptune.,1072017
22,normal.,972781
8,satan.,15892
6,ipsweep.,12481
7,portsweep.,10413
4,nmap.,2316
20,back.,2203
14,warezclient.,1020
15,teardrop.,979


In [77]:
kdd.dtypes

[('duration', 'string'),
 ('protocol_type', 'string'),
 ('service', 'string'),
 ('flag', 'string'),
 ('src_bytes', 'string'),
 ('dst_bytes', 'string'),
 ('land', 'string'),
 ('wrong_fragment', 'string'),
 ('urgent', 'string'),
 ('hot', 'string'),
 ('num_failed_logins', 'string'),
 ('logged_in', 'string'),
 ('num_compromised', 'string'),
 ('root_shell', 'string'),
 ('su_attempted', 'string'),
 ('num_root', 'string'),
 ('num_file_creations', 'string'),
 ('num_shells', 'string'),
 ('num_access_files', 'string'),
 ('num_outbound_cmds', 'string'),
 ('is_host_login', 'string'),
 ('is_guest_login', 'string'),
 ('count', 'string'),
 ('srv_count', 'string'),
 ('serror_rate', 'string'),
 ('srv_serror_rate', 'string'),
 ('rerror_rate', 'string'),
 ('srv_rerror_rate', 'string'),
 ('same_srv_rate', 'string'),
 ('diff_srv_rate', 'string'),
 ('srv_diff_host_rate', 'string'),
 ('dst_host_count', 'string'),
 ('dst_host_srv_count', 'string'),
 ('dst_host_same_srv_rate', 'string'),
 ('dst_host_diff_srv_r

In [79]:
kdd.unpersist()

DataFrame[duration: string, protocol_type: string, service: string, flag: string, src_bytes: string, dst_bytes: string, land: string, wrong_fragment: string, urgent: string, hot: string, num_failed_logins: string, logged_in: string, num_compromised: string, root_shell: string, su_attempted: string, num_root: string, num_file_creations: string, num_shells: string, num_access_files: string, num_outbound_cmds: string, is_host_login: string, is_guest_login: string, count: string, srv_count: string, serror_rate: string, srv_serror_rate: string, rerror_rate: string, srv_rerror_rate: string, same_srv_rate: string, diff_srv_rate: string, srv_diff_host_rate: string, dst_host_count: string, dst_host_srv_count: string, dst_host_same_srv_rate: string, dst_host_diff_srv_rate: string, dst_host_same_src_port_rate: string, dst_host_srv_diff_host_rate: string, dst_host_serror_rate: string, dst_host_srv_serror_rate: string, dst_host_rerror_rate: string, dst_host_srv_rerror_rate: string, Category: string

In [81]:
kdd = spark.read.format("csv").option("inferSchema", True).load("/data/kddcup.data").toDF(*columns).cache()

In [82]:
kdd.count()

4898431

In [83]:
kdd.dtypes

[('duration', 'int'),
 ('protocol_type', 'string'),
 ('service', 'string'),
 ('flag', 'string'),
 ('src_bytes', 'int'),
 ('dst_bytes', 'int'),
 ('land', 'int'),
 ('wrong_fragment', 'int'),
 ('urgent', 'int'),
 ('hot', 'int'),
 ('num_failed_logins', 'int'),
 ('logged_in', 'int'),
 ('num_compromised', 'int'),
 ('root_shell', 'int'),
 ('su_attempted', 'int'),
 ('num_root', 'int'),
 ('num_file_creations', 'int'),
 ('num_shells', 'int'),
 ('num_access_files', 'int'),
 ('num_outbound_cmds', 'int'),
 ('is_host_login', 'int'),
 ('is_guest_login', 'int'),
 ('count', 'int'),
 ('srv_count', 'int'),
 ('serror_rate', 'double'),
 ('srv_serror_rate', 'double'),
 ('rerror_rate', 'double'),
 ('srv_rerror_rate', 'double'),
 ('same_srv_rate', 'double'),
 ('diff_srv_rate', 'double'),
 ('srv_diff_host_rate', 'double'),
 ('dst_host_count', 'int'),
 ('dst_host_srv_count', 'int'),
 ('dst_host_same_srv_rate', 'double'),
 ('dst_host_diff_srv_rate', 'double'),
 ('dst_host_same_src_port_rate', 'double'),
 ('dst_h

In [84]:
numeric_columns = [t[0] for t in kdd.dtypes if t[1] in {"double", "int"}]
numeric_columns

['duration',
 'src_bytes',
 'dst_bytes',
 'land',
 'wrong_fragment',
 'urgent',
 'hot',
 'num_failed_logins',
 'logged_in',
 'num_compromised',
 'root_shell',
 'su_attempted',
 'num_root',
 'num_file_creations',
 'num_shells',
 'num_access_files',
 'num_outbound_cmds',
 'is_host_login',
 'is_guest_login',
 'count',
 'srv_count',
 'serror_rate',
 'srv_serror_rate',
 'rerror_rate',
 'srv_rerror_rate',
 'same_srv_rate',
 'diff_srv_rate',
 'srv_diff_host_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_rerror_rate']

In [87]:
kdd.select(*numeric_columns).limit(10).toPandas()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,215,45076,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,162,4528,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0,236,1228,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
3,0,233,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0
4,0,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
5,0,238,1282,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,4,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5,5,1.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0
6,0,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6,6,1.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0
7,0,234,1364,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,7,7,1.0,0.0,0.14,0.0,0.0,0.0,0.0,0.0
8,0,239,1295,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,7,7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,8,8,1.0,0.0,0.12,0.0,0.0,0.0,0.0,0.0
9,0,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0


In [105]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.pipeline import Pipeline, PipelineModel
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [100]:
pipeline = Pipeline()
pipeline.setStages([
    VectorAssembler(inputCols=numeric_columns, outputCol="features"),
    StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True),
    KMeans().setK(30).setSeed(1)
])

Pipeline_4fc1936dbc0d7a18e20a

In [101]:
pipelineModel = pipeline.fit(kdd)

In [106]:
predictions = pipelineModel.transform(kdd)
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.9995193923401818


In [107]:
evaluator.explainParams()

'featuresCol: features column name. (default: features)\nmetricName: metric name in evaluation (silhouette) (default: silhouette)\npredictionCol: prediction column name. (default: prediction)'